diff --git a/.gitignore b/.gitignore
index 4f9205f8..e42680c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,13 +2,20 @@
 *.jl.cov
 *.jl.mem
 .DS_Store
-/Manifest.toml
-/LocalPreferences.toml
+
+Manifest.toml
+LocalPreferences.toml
+
 /dev/
 /docs/build/
 /docs/site/
-/docs/Manifest.toml
+
 /tmp/
+
 *.vtu
 *.pvtu
 *.pvd
+
+CLAUDE.md
+AGENTS.md
+
diff --git a/Project.toml b/Project.toml
index 8bb21ec8..431e2cb6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -24,7 +24,7 @@ ForwardDiff = "0.10, 1"
 Gridap = "0.20.6"
 LinearAlgebra = "1"
 MPI = "0.16, 0.17, 0.18, 0.19, 0.20"
-PartitionedArrays = "0.3.3"
+PartitionedArrays = "0.5"
 SparseArrays = "1.3"
 SparseMatricesCSR = "0.6.6"
 WriteVTK = "1.12.0"
diff --git a/docs/Project.toml b/docs/Project.toml
index db33639a..a0838543 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,3 +1,4 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Gridap = "56d4f2e9-7ea1-5844-9cf6-b9c51ca7ce8e"
 GridapDistributed = "f9701e48-63b3-45aa-9a63-9bc6c271f355"
diff --git a/docs/make.jl b/docs/make.jl
index 73d36b0a..1bd2438d 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,17 +1,15 @@
-using Documenter, GridapDistributed
+using Documenter, GridapDistributed, Gridap
 
 pages = [
-  "Home" => "index.md",
-  "Getting Started" => "getting-started.md",
-  "GridapDistributed" => "GridapDistributed.md",
-  "Algebra" => "Algebra.md",
-  "CellData" => "CellData.md",
-  "FESpaces" => "FESpaces.md",
-  "Geometry" => "Geometry.md",
-  "MultiField" => "MultiField.md",
-  "Visualization" => "Visualization.md",
-  "Adaptivity" => "Adaptivity.md",
- ]
+  "Home"          => "index.md",
+  "Backends"      => "backends.md",
+  "Algebra"       => "algebra.md",
+  "Geometry"      => "geometry.md",
+  "FESpaces"      => "fespaces.md",
+  "Assembly"      => "assembly.md",
+  "Adaptivity"    => "adaptivity.md",
+  "Visualization" => "visualization.md",
+]
 
 makedocs(;
     modules=[GridapDistributed],
@@ -20,7 +18,7 @@ makedocs(;
     repo="https://github.com/gridap/GridapDistributed.jl/blob/{commit}{path}#L{line}",
     sitename="GridapDistributed.jl",
     authors="S. Badia <santiago.badia@monash.edu>, A. F. Martin <alberto.f.martin@anu.edu.au>, F. Verdugo <fverdugo@cimne.upc.edu>",
-    # warnonly=true, # for debugging
+    warnonly=true,
 )
 
 deploydocs(;
diff --git a/docs/src/Adaptivity.md b/docs/src/Adaptivity.md
deleted file mode 100644
index 3bf14320..00000000
--- a/docs/src/Adaptivity.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Adaptivity
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["Adaptivity.jl","Redistribution.jl"]
-```
diff --git a/docs/src/Algebra.md b/docs/src/Algebra.md
deleted file mode 100644
index 462ac8d4..00000000
--- a/docs/src/Algebra.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Algebra
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["Algebra.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/CellData.md b/docs/src/CellData.md
deleted file mode 100644
index 7e6a2c1e..00000000
--- a/docs/src/CellData.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# CellData
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["CellData.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/FESpaces.md b/docs/src/FESpaces.md
deleted file mode 100644
index 37f10693..00000000
--- a/docs/src/FESpaces.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# FESpaces
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["FESpaces.jl","DivConformingFESpaces.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/Geometry.md b/docs/src/Geometry.md
deleted file mode 100644
index 11d2c37c..00000000
--- a/docs/src/Geometry.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Geometry
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["Geometry.jl"]
-```
-
-## MacroDiscreteModels
-
-These are functionalities to select and globally number the interfaces between processors:
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["MacroDiscreteModels.jl"]
-```
diff --git a/docs/src/GridapDistributed.md b/docs/src/GridapDistributed.md
deleted file mode 100644
index 022222d1..00000000
--- a/docs/src/GridapDistributed.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# GridapDistributed
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["GridapDistributed.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/MultiField.md b/docs/src/MultiField.md
deleted file mode 100644
index f1472582..00000000
--- a/docs/src/MultiField.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# MultiField
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["MultiField.jl","BlockPartitionedArrays.jl"]
-```
diff --git a/docs/src/Visualization.md b/docs/src/Visualization.md
deleted file mode 100644
index 8eda502c..00000000
--- a/docs/src/Visualization.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Visualization
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["Visualization.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/adaptivity.md b/docs/src/adaptivity.md
new file mode 100644
index 00000000..ed86167d
--- /dev/null
+++ b/docs/src/adaptivity.md
@@ -0,0 +1,91 @@
+# [Adaptivity](@id adaptivity)
+
+`GridapDistributed.jl` provides an interface for the adaptive mesh refinement (AMR)
+abstractions defined in `Gridap.jl`. The framework works out of the box for Gridap's
+built-in refinement routines. For scalable AMR on unstructured meshes with full MPI
+performance, see [`GridapP4est.jl`](https://github.com/gridap/GridapP4est.jl), which
+integrates the p4est library with `GridapDistributed.jl`.
+
+## Adapted discrete models
+
+`DistributedAdaptedDiscreteModel` represents a distributed mesh obtained by refining a
+parent mesh. It wraps both the fine (child) mesh, the coarse (parent) mesh, and a
+distributed array of `AdaptivityGlue` objects that record the parent-child cell
+relationship.
+
+### Uniform Cartesian refinement
+
+```julia
+using Gridap, GridapDistributed, PartitionedArrays
+
+function main(distribute)
+  ranks  = distribute(LinearIndices((4,)))
+  coarse = CartesianDiscreteModel(ranks, (2,2), (0,1,0,1), (4,4))
+
+  # Refine uniformly by a factor of 2 in every direction
+  fine = refine(coarse, 2)   # returns a DistributedAdaptedDiscreteModel
+
+  parent     = get_parent(fine)
+  glue_array = get_adaptivity_glue(fine)
+
+  # The fine mesh is used like any other DistributedDiscreteModel
+  reffe = ReferenceFE(lagrangian, Float64, 1)
+  V     = TestFESpace(fine, reffe)
+  uh    = interpolate(x -> sum(x), V)
+end
+
+with_debug(main)
+```
+
+`refine(model, r)` accepts an integer or a tuple refinement factor; `refine(model, 2)`
+halves the cell size in every direction.
+
+## Sub-communicators
+
+AMR workflows sometimes involve different sets of ranks at different stages (e.g. a coarse
+mesh on 2 ranks that is refined onto 8). Use `generate_subparts` to carve out a
+sub-communicator of `n` ranks from the full set:
+
+```julia
+fine_ranks   = distribute(LinearIndices((8,)))
+coarse_ranks = generate_subparts(fine_ranks, 2)  # ranks 1–2 only
+
+if i_am_in(coarse_ranks)
+  # only executed on ranks 1 and 2
+end
+```
+
+Ranks outside the sub-communicator receive an inert placeholder and skip the guarded block.
+
+## API
+
+### Adapted models
+
+```@docs
+GridapDistributed.DistributedAdaptedDiscreteModel
+```
+
+### Sub-communicators
+
+```@docs
+GridapDistributed.generate_subparts
+GridapDistributed.i_am_in
+```
+
+### Refinement internals
+
+```@docs
+GridapDistributed.refine_local_models
+GridapDistributed.refine_cell_gids
+```
+
+### Redistribution
+
+```@docs
+GridapDistributed.RedistributeGlue
+GridapDistributed.redistribute
+GridapDistributed.redistribute_cartesian
+GridapDistributed.redistribute_array_by_cells
+GridapDistributed.redistribution_local_indices
+GridapDistributed.redistribute_indices
+```
diff --git a/docs/src/algebra.md b/docs/src/algebra.md
new file mode 100644
index 00000000..47ba97ea
--- /dev/null
+++ b/docs/src/algebra.md
@@ -0,0 +1,75 @@
+# [Algebra](@id algebra)
+
+`GridapDistributed.jl` relies on [`PartitionedArrays.jl`](https://github.com/fverdugo/PartitionedArrays.jl) for all distributed linear algebra. Users interacting with the assembled system — for example, to inspect residuals or feed vectors into a custom solver — will work directly with PartitionedArrays types. This page gives a brief conceptual overview; the [PartitionedArrays documentation](https://fverdugo.github.io/PartitionedArrays.jl/stable/) is the authoritative reference.
+
+## Distributed index ranges: `PRange`
+
+A `PRange` is a distributed range of integers. It is the backbone of every distributed array: it records, for each local index on each rank, its global ID and which rank owns it.
+
+Each rank's slice of a `PRange` is an `AbstractLocalIndices` object that exposes the local index layout:
+
+| Function | Returns |
+|---|---|
+| `local_to_global(idx)` | Global ID for each local index (owned + ghost) |
+| `own_to_global(idx)` | Global IDs of owned (non-ghost) indices |
+| `ghost_to_global(idx)` | Global IDs of ghost indices |
+| `own_to_local(idx)` | Local positions of owned indices |
+| `ghost_to_local(idx)` | Local positions of ghost indices |
+| `local_to_owner(idx)` | Owner rank for each local index |
+
+Access the local indices on each rank via `partition`:
+
+```julia
+map(partition(prange)) do indices
+  lid_to_gid = local_to_global(indices)
+end
+```
+
+`PRanges` will be used as axes for distributed vectors and matrices, and to define the communication pattern for synchronizing ghost values.
+
+## Distributed vectors: `PVector`
+
+A `PVector` is a distributed dense vector. Each rank stores a local segment that includes both **owned** entries (for which this rank is authoritative) and **ghost** entries (copies of entries owned by neighboring ranks).
+
+Two operations synchronize the distributed state:
+
+- `consistent!(x)` — broadcasts owned values to ghost copies on neighboring ranks. Call
+  this after a solve to ensure ghost entries reflect the current solution.
+- `assemble!(x)` — accumulates ghost contributions into the owning rank's entry. Used
+  during assembly to collect off-rank contributions.
+
+Both return a task; wait for completion with `|> wait` or `fetch`.
+
+A normal workflow would be:
+
+```julia
+  map(partition(x)) do x
+    # Do something locally with the values of x
+    # NO communication is allowed here!
+  end
+  consistent!(x) |> wait # Synchronize ghost values after local updates
+```
+
+## Distributed sparse matrices: `PSparseMatrix`
+
+A `PSparseMatrix` is a distributed sparse matrix stored in compressed form. Each rank owns a block of rows. Non-zero entries in off-rank columns are stored locally and communicated as needed during matrix-vector products (`mul!`).
+
+## GridapDistributed additions
+
+`GridapDistributed.jl` adds a small set of utilities on top of PartitionedArrays to support the FE assembly pipeline.
+
+### API
+
+```@docs
+GridapDistributed.permuted_variable_partition
+GridapDistributed.unpermute
+GridapDistributed.locally_repartition
+GridapDistributed.filter_and_replace_ghost
+```
+
+### Block arrays
+
+```@docs
+GridapDistributed.BlockPRange
+GridapDistributed.BlockPArray
+```
diff --git a/docs/src/assembly.md b/docs/src/assembly.md
new file mode 100644
index 00000000..a2e789e6
--- /dev/null
+++ b/docs/src/assembly.md
@@ -0,0 +1,88 @@
+# [Assembly](@id assembly)
+
+`GridapDistributed.jl` provides three assembly strategies that control how local
+contributions from different ranks are combined into the global system. These are implemented as Gridap `AssemblyStrategy` instances, and can be specified via the `strategy` argument of `SparseMatrixAssembler`.
+
+```julia
+assem = SparseMatrixAssembler(U, V, strategy)
+op = AffineFEOperator(a, l, U, V, assem)
+```
+
+## `Assembled` (default)
+
+Standard parallel assembly:
+
+- Each rank integrates over its **owned** cells. Contributions to owned rows coming from other ranks are communicated and assembled.
+- The result is a fully assembled system, where each processor holds the correct portions of the global matrix and vector for its owned rows.
+- Extra rows corresponding to ghost DOFs are allocated as cache for in-place re-assembly (e.g. for nonlinear or transient problems), but may contain garbage at all times.
+
+```julia
+assem = SparseMatrixAssembler(U, V, Assembled())
+```
+
+!!! note
+    Unless you know what you are doing, this is the assembly strategy you should use.
+
+## `SubAssembled`
+
+- Each rank integrates over its **owned** cells, and assembles the local contributions for both owned and ghost rows. No communication of ghost contributions is performed.
+- The result is a sub-assembled system, where local matrices are incomplete (sub-assembled). As a consequence, matrix-vector products require assembly of the resulting vector (which makes it a bit more expensive than the `Assembled` case).
+- The advantage: The row and column layout of the resulting system is closer to the DOF layout of the FESpace. This is ideal for Domain Decomposition methods (or other geometrical solvers).
+
+```julia
+assem = SparseMatrixAssembler(U, V, SubAssembled())
+```
+
+## `LocallyAssembled`
+
+- Each rank integrates over its **owned and ghost** cells. Contributions to ghost rows are discarded, and no communication is performed. **This assumes that all local contributions to owned rows can be computed without communication**. This is in general **NOT** true.
+- If the invariant holds, the result is a fully assembled system like in the `Assembled` case, but without the communication cost.
+- No extra rows corresponding to ghost DOFs are required for in-place re-assembly.
+
+```julia
+assem = SparseMatrixAssembler(U, V, LocallyAssembled())
+```
+
+!!! warning
+    `LocallyAssembled` produces incorrect results if the assumption above is violated.
+    Use only when you are sure that owned-row contributions are purely local.
+
+## Changing the sparse matrix format
+
+The default format is `SparseMatrixCSC`. To use a different format (e.g. CSR for
+better SpMV performance), pass a custom assembler:
+
+```julia
+using SparseMatricesCSR
+assem = SparseMatrixAssembler(
+  SparseMatrixCSR{0,Float64,Int},
+  Vector{Float64},
+  U, V, strategy
+)
+op = AffineFEOperator(a, l, U, V, assem)
+```
+
+## API
+
+### Strategies
+
+```@docs
+Assembled
+SubAssembled
+LocallyAssembled
+```
+
+### Assembler type
+
+```@docs
+GridapDistributed.DistributedSparseMatrixAssembler
+```
+
+### Builder types
+
+```@docs
+GridapDistributed.DistributedArrayBuilder
+GridapDistributed.DistributedCounter
+GridapDistributed.DistributedAllocation
+GridapDistributed.TrackedArrayAllocation
+```
diff --git a/docs/src/backends.md b/docs/src/backends.md
new file mode 100644
index 00000000..cae27f48
--- /dev/null
+++ b/docs/src/backends.md
@@ -0,0 +1,102 @@
+# [Backends and Philosophy](@id backends)
+
+`GridapDistributed.jl` is a parallel extension of [`Gridap.jl`](https://github.com/gridap/Gridap.jl) for distributed-memory environments. The central design goal is **minimal API change**: a simulation written for a single process can be made parallel with only a handful of modifications.
+
+## A sequential Poisson problem
+
+Consider the Poisson equation
+
+```math
+-\Delta u = f \quad \text{in } \Omega = (0,1)^2, \qquad u = 0 \text{ on } \partial\Omega,
+```
+
+solved with Gridap on a single process:
+
+```julia
+using Gridap
+
+model = CartesianDiscreteModel((0,1,0,1), (8,8))
+
+reffe = ReferenceFE(lagrangian, Float64, 1)
+V  = TestFESpace(model, reffe, dirichlet_tags="boundary")
+U  = TrialFESpace(V, x -> 0.0)
+
+Ω  = Triangulation(model)
+dΩ = Measure(Ω, 2)
+
+a(u,v) = ∫( ∇(v)⋅∇(u) )dΩ
+l(v)   = ∫( v*1 )dΩ
+
+op = AffineFEOperator(a, l, U, V)
+uh = solve(op)
+```
+
+## Converting to distributed
+
+To run the same problem across multiple MPI ranks, two changes are needed:
+
+1. Wrap the driver into a function that accepts a `distribute` function argument and the number of processors `parts`.
+2. Distribute the model. In this case, we can use `CartesianDiscreteModel(ranks, parts, domain, cells)`.
+
+Everything else will dispatch on the distributed model, and thus the rest of the driver can be left unchanged:
+
+```julia
+using Gridap, GridapDistributed, PartitionedArrays
+
+function poisson(distribute, parts)
+  ranks = distribute(LinearIndices((prod(parts),)))
+
+  model = CartesianDiscreteModel(ranks, parts, (0,1,0,1), (8,8))
+
+  reffe = ReferenceFE(lagrangian, Float64, 1)
+  V  = TestFESpace(model, reffe, dirichlet_tags="boundary")
+  U  = TrialFESpace(V, x -> 0.0)
+
+  Ω  = Triangulation(model)
+  dΩ = Measure(Ω, 2)
+
+  a(u,v) = ∫( ∇(v)⋅∇(u) )dΩ
+  l(v)   = ∫( v*1 )dΩ
+
+  op = AffineFEOperator(a, l, U, V)
+  solve(op)
+end
+```
+
+`distribute` is a function that maps a `LinearIndices` range onto a
+set of MPI ranks and returns an array of rank IDs.
+
+## The two backends
+
+### `with_debug` — single-process simulation
+
+Unfortunately, good MPI debugging tools are scarce. Moreover, they never integrate well within Julia's development workflow (that is using `Revise.jl` to edit code and re-run it interactively).
+
+To facilitate development and debugging, `PartitionedArrays.jl` provides a `DebugArray` backend that **simulates multiple MPI ranks inside a single Julia process**. This allows you to print, inspect, and debug distributed objects within the REPL using your ordinary Julia workflow. To use it, wrap your driver function with `with_debug`:
+
+```julia
+with_debug() do distribute
+  poisson(distribute, (2,2))
+end
+```
+
+Use this during development and for running tests in CI without MPI.
+
+!!!note
+    The only thing `DebugArray` does is wrap serial arrays with `DebugArray`, which is a serial array that does not allow scalar indexing (e.g. `a[i]`). If you know what you are doing, you can directly use arrays and call `poisson(collect, (2,2))`.
+
+### `with_mpi` — true MPI execution
+
+Once the code is working with `with_debug`, you can switch to the real MPI backend by replacing `with_debug` with `with_mpi`. This will run the code across multiple processes using MPI.
+
+Place the following at the bottom of `poisson.jl`:
+
+```julia
+with_mpi() do distribute
+  poisson(distribute, (2,2))
+end
+```
+
+```bash
+mpiexecjl --project=. -n 4 julia poisson.jl
+```
diff --git a/docs/src/changelog.md b/docs/src/changelog.md
new file mode 100644
index 00000000..a7b10c54
--- /dev/null
+++ b/docs/src/changelog.md
@@ -0,0 +1,3 @@
+# Changelog
+
+
diff --git a/docs/src/fespaces.md b/docs/src/fespaces.md
new file mode 100644
index 00000000..b7dc4c15
--- /dev/null
+++ b/docs/src/fespaces.md
@@ -0,0 +1,105 @@
+# [FE Spaces and Cell Data](@id fespaces)
+
+## Distributed FESpaces
+
+`DistributedSingleFieldFESpace` is the parallel counterpart of Gridap's
+`SingleFieldFESpace`. It is constructed with the same syntax:
+
+```julia
+reffe = ReferenceFE(lagrangian, Float64, 1)
+V = TestFESpace(model, reffe, dirichlet_tags="boundary")
+U = TrialFESpace(V, u_dirichlet)
+```
+
+where `model` is a `DistributedDiscreteModel`.
+
+Internally, each rank holds a local `FESpace` on its local portion of the mesh
+(owned + ghost cells). These are accessible via `local_views`:
+
+```julia
+map(local_views(V)) do local_V
+  println("Local ndofs = ", num_free_dofs(local_V))
+end
+```
+
+This API also extends to multi-field spaces.
+
+## Global DOF numbering and ownership
+
+Global DOF IDs are stored as a `PRange` (see [Algebra](@ref algebra)), which can be accessed via `get_free_dof_gids(V)`. Some general considerations:
+
+- DOFs are **owned by the rank that owns the cell(s) they are attached to**. DOFs on the interface between two partitions are owned by the rank **with the highest rank ID**. This convention ensures that each DOF is owned by exactly one rank and appears as a ghost DOF on all neighboring ranks.
+- The global DOF numbering is such that DOFs owned by rank 1 come first, followed by those owned by rank 2, and so on. This means that the global DOF IDs are contiguous within each rank's owned portion, which is convenient for assembly and solver interfaces.
+- We preserve the local DOF numbering of the serial `FESpace` on each rank. That means that locally, owned DOFs are not necessarily (locally) numbered before ghost DOFs.
+
+## Ghost DOF consistency
+
+Within the library, we internally maintain the consistency of ghost DOF values as needed. When implementing your own low-level operations, it is important to do the same. In particular, before using an `FEFunction` for assembly it is important to ensure that ghost DOF values are consistent with their owned counterparts. See for instance the following example:
+
+```julia
+uh = zero(U)
+r(v) = ∫( v*uh )dΩ
+
+x = get_free_dof_values(uh)
+map(partition(x)) do x
+  # Do something locally with the values of x
+  # After this, ghost values may be inconsistent!
+end
+
+b = assemble_vector(r, V) # May be WRONG!
+
+consistent!(x) |> wait
+b = assemble_vector(r, V) # OK
+```
+
+## API
+
+### Abstract interface
+
+```@docs
+GridapDistributed.DistributedFESpace
+```
+
+### Single-field spaces
+
+```@docs
+GridapDistributed.DistributedSingleFieldFESpace
+```
+
+### Multi-field spaces
+
+```@docs
+GridapDistributed.DistributedMultiFieldFESpace
+```
+
+### Cell data
+
+```@docs
+GridapDistributed.DistributedCellField
+GridapDistributed.DistributedMeasure
+GridapDistributed.DistributedDomainContribution
+GridapDistributed.DistributedCellPoint
+```
+
+### FE functions
+
+```@docs
+GridapDistributed.DistributedFEFunctionData
+GridapDistributed.DistributedSingleFieldFEFunction
+```
+
+### Constant FE space
+
+```@docs
+Gridap.FESpaces.ConstantFESpace(::GridapDistributed.DistributedDiscreteModel; constraint_type, kwargs...)
+```
+
+### DOF numbering utilities
+
+```@docs
+GridapDistributed.generate_gids
+GridapDistributed.generate_posneg_gids
+GridapDistributed.generate_gids_by_color
+GridapDistributed.split_gids_by_color
+GridapDistributed.vcat_gids
+```
diff --git a/docs/src/geometry.md b/docs/src/geometry.md
new file mode 100644
index 00000000..35c16ecb
--- /dev/null
+++ b/docs/src/geometry.md
@@ -0,0 +1,135 @@
+# [Geometry](@id geometry)
+
+## The concept of a distributed mesh
+
+Each MPI rank holds a local portion of the global mesh divided into two parts:
+
+- **Owned cells** — cells that this rank is responsible for. Each cell is owned by exactly one rank.
+- **Ghost cells** — cells owned by neighboring ranks, but needed locally to ensure consistency during local operations.
+
+This overlapping layout means every rank has (in most cases) enough context to perform local operations to its owned cells without communication. Ghost cells are synchronized as needed to ensure consistency across ranks.
+
+## `DistributedDiscreteModel`
+
+The parallel counterpart of Gridap's `DiscreteModel`. Internally it wraps a distributed array of local `DiscreteModel` objects, one per rank.
+
+### Cartesian meshes
+
+The simplest way to build a distributed mesh is `CartesianDiscreteModel`, which partitions a structured Cartesian grid according to a `parts` tuple:
+
+```julia
+parts = (2, 2) # 2×2 rank layout
+ranks = distribute(LinearIndices((prod(parts),)))
+
+# 2D Cartesian mesh on [0,1]² with 8×8 cells
+model = CartesianDiscreteModel(ranks, parts, (0,1,0,1), (8,8))
+```
+
+In this case, each rank holds a `3x3` block of cells, of which only `2x2` are owned and the remaining are ghost cells from neighboring ranks.
+
+### Distributing a serial mesh
+
+An existing serial `DiscreteModel` can be distributed by providing a cell-to-rank assignment vector:
+
+```julia
+serial_model  = DiscreteModelFromFile("mesh.msh")
+cell_to_part  = ...  # integer vector of length num_cells(serial_model)
+
+model = DiscreteModel(ranks, serial_model, cell_to_part)
+```
+
+The `cell_to_part` vector assigns each cell in the serial model to a rank. This array can be generated by a graph partitioner like METIS or Scotch, or by a custom geometric criterion.
+
+This is the mechanism used internally by [`GridapGmsh.jl`](https://github.com/gridap/GridapGmsh.jl): it reads the Gmsh file on rank 0, partitions it with METIS, and distributes it via this function.
+
+## Accessing local data: `local_views` and `map`
+
+The central idiom in `GridapDistributed.jl` is `map` over `local_views`:
+
+```julia
+map(local_views(model)) do local_model
+  # This block runs independently on every rank
+  n = num_cells(local_model)   # includes ghost cells
+end
+```
+
+`local_views(x)` returns the distributed array of per-rank local objects. `map` applies a function to each local piece simultaneously (sequentially in `DebugArray` mode; one MPI process per rank in `MPIArray` mode). Most distributed objects support `local_views`: models, triangulations, FE spaces, cell fields, vectors, matrices.
+
+`get_parts(x)` returns the rank array associated with `x`, useful when you need to build new distributed objects with the same communicator.
+
+## Global cell and face IDs
+
+```julia
+cell_gids = get_cell_gids(model)       # PRange of global cell IDs
+face_gids = get_face_gids(model, d)    # PRange of global d-face IDs
+```
+
+These `PRange` objects (see [Algebra](@ref algebra)) record, for each local cell/face, its global ID and which rank owns it. They are the foundation for assembling global sparse matrices and vectors.
+
+## Ghost vs. no-ghost triangulations
+
+Triangulations can include or exclude ghost cells:
+
+```julia
+Ωg  = Triangulation(model, with_ghost) # owned + ghost cells (default)
+Ωng = Triangulation(model, no_ghost)   # owned cells only
+```
+
+Generally speaking, triangulations used for assembly should exclude ghost cells to avoid double-counting contributions, while triangulations used to create other objects (e.g. FE spaces) should include ghost cells to ensure overlapping consistency.
+
+The default is `no_ghost`, but no special care is needed when building FE spaces.
+
+## API
+
+### Discrete models
+
+```@docs
+GridapDistributed.DistributedDiscreteModel
+GridapDistributed.GenericDistributedDiscreteModel
+```
+
+### Global index ranges
+
+```@docs
+get_cell_gids
+get_face_gids
+```
+
+### Local data access
+
+```@docs
+local_views
+get_parts
+```
+
+### Ghost control
+
+```@docs
+with_ghost
+no_ghost
+```
+
+### Triangulations
+
+```@docs
+GridapDistributed.DistributedTriangulation
+```
+
+### Supporting types
+
+```@docs
+GridapDistributed.DistributedGrid
+GridapDistributed.DistributedGridTopology
+GridapDistributed.DistributedFaceLabeling
+```
+
+### Macro-element models
+
+```@docs
+GridapDistributed.MacroDiscreteModel
+GridapDistributed.classify_interfaces
+GridapDistributed.generate_nbors_and_keys
+GridapDistributed.generate_interface_gids
+GridapDistributed.get_local_face_labeling
+GridapDistributed.get_global_face_labeling
+```
diff --git a/docs/src/getting-started.md b/docs/src/getting-started.md
deleted file mode 100644
index c580ebe2..00000000
--- a/docs/src/getting-started.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Getting Started
-
-## Installation requirements
-
-GridapDistributed is tested on Linux, but it should be also possible to use it on Mac OS and Windows since it is written exclusively in Julia and it only depends on registered Julia packages.
-
-## Installation
-
-GridapDistributed is a registered package. Thus, the installation should be straight forward using the Julia's package manager [Pkg](https://julialang.github.io/Pkg.jl/v1/). To this end, open the Julia REPL (i.e., execute the `julia` binary), type `]` to enter package mode, and install GridapDistributed as follows
-
-```julia
-pkg> add GridapDistributed
-```
-
-You will also need the `PartitionedArrays` package. 
-
-```julia
-pkg> add PartitionedArrays
-```
-
-If you want to leverage the satellite packages of `GridapDistributed.jl`, i.e., `GridapPETSc.jl`, `GridapP4est.jl`, and/or `GridapGmsh.jl`, you have to install them separately as 
-
-```julia
-pkg> add GridapPETSc
-```
-
-```julia
-pkg> add GridapGmsh
-```
-
-```julia
-pkg> add GridapP4est
-```
-
-Please note that these three packages depend on binary builds of the corresponding libraries they wrap (i.e., PETSc, Gmsh, and P4est). By default, they will leverage binary builds available at the Julia package registry. However, one may also use custom installations of these libraries. See the documentation of the corresponding package for more details
-
-
-For further information about how to install and manage Julia packages, see the
-[Pkg documentation](https://julialang.github.io/Pkg.jl/v1/).
-
-## Further steps
-
-If you are new to the `Gridap` ecosystem of packages, we recommend that you first follow the [Gridap Tutorials](https://gridap.github.io/Tutorials/dev/) step by step in order to get familiar with the `Gridap.jl` library. `GridapDistributed.jl` and `Gridap.jl` share almost the same high-level API. Therefore, some familiarity with `Gridap.jl` is highly recommended (if not essential) before starting with `GridapDistributed.jl`. 
-
-If you are already familiarized with `Gridap.jl`, we recommend you to start straight away with the following [tutorial](https://gridap.github.io/Tutorials/dev/pages/t016_poisson_distributed).
\ No newline at end of file
diff --git a/docs/src/index.md b/docs/src/index.md
index 9fcca624..3cc2de9e 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,46 +1,66 @@
 # GridapDistributed.jl
 
-Documentation of the `GridapDistributed.jl` library.
+**Parallel finite element computations on distributed-memory machines.**
 
-!!! note
+`GridapDistributed.jl` extends [`Gridap.jl`](https://github.com/gridap/Gridap.jl) to distributed-memory parallel environments using MPI. It mirrors the Gridap sequential API almost exactly: a simulation script written for a single process can be made parallel with minimal changes.
 
-     These documentation pages are under construction.
+## Installation
 
-## Introduction
+`GridapDistributed.jl` is a registered Julia package. We recommend a minimum environment containing `Gridap` (our Finite Elements backend) and `PartitionedArrays` (our distributed arrays backend). To install it, simply run:
 
-The ever-increasing demand for resolution and accuracy in mathematical models of physical processes governed by systems of Partial Differential Equations (PDEs) 
-can only be addressed using fully-parallel advanced numerical discretization methods and scalable solution methods, thus able to exploit the vast amount of computational resources in state-of-the-art supercomputers. To this end, `GridapDistributed.jl` is a registered software package which provides 
-fully-parallel distributed memory data structures and associated methods
-for the Finite Element (FE) numerical solution of PDEs on parallel computers. Thus, it can be run on multi-core CPU desktop computers at small scales, as well as on HPC clusters and supercomputers at medium/large scales. The data structures in `GridapDistributed.jl` are designed to mirror as far as possible their counterparts in the `Gridap.jl` Julia software package, while implementing/leveraging most of their abstract interfaces. As a result, sequential Julia scripts written in the high-level Application Programming Interface (API) of `Gridap.jl` can be used verbatim up to minor adjustments in a parallel distributed memory context using `GridapDistributed.jl`.
-This equips end-users with a tool for the development of simulation codes able to solve real-world application problems on massively parallel supercomputers while using a highly expressive, compact syntax, that resembles mathematical notation. This is indeed one of the main advantages of `GridapDistributed.jl` and a major design goal that we pursue.
+```julia
+pkg> add Gridap, GridapDistributed, PartitionedArrays
+```
 
-In order to scale FE simulations to large core counts, the mesh used to discretize the computational domain on which the PDE is posed must be partitioned (distributed) among the parallel tasks such that each of these only holds a local portion of the global mesh. The same requirement applies to the rest of data structures in the FE simulation pipeline, i.e., FE space, linear system, solvers, data output, etc. The local portion of each task is composed by a set of cells that it owns, i.e., the **local cells** of the task, and a set of off-processor cells (owned by remote processors) which are in touch with its local cells, i.e., the **ghost cells** of the task.
-This overlapped mesh partition is used by `GridapDistributed.jl`, among others, to exchange data among nearest neighbors, and to glue together global Degrees of Freedom (DoFs) which are sitting on the interface among subdomains. Following this design principle, `GridapDistributed.jl` provides scalable parallel data structures and associated methods for simple grid handling (in particular, Cartesian-like meshes of arbitrary-dimensional, topologically n-cube domains), FE spaces setup, and distributed linear system assembly. It is in our future plans to provide highly scalable linear and nonlinear solvers tailored for the FE discretization of PDEs (e.g., linear and nonlinear matrix-free geometric multigrid and domain decomposition preconditioners). In the meantime, however, `GridapDistributed.jl` can be combined with other Julia packages in order to realize the full potential required in real-world applications. These packages and their relation with `GridapDistributed.jl` are overviewed in the next section. 
+Running `GridapDistributed.jl` requires an MPI implementation (e.g., MPICH, OpenMPI) and the [`MPI.jl` Julia wrapper](https://juliaparallel.org/MPI.jl/stable).
 
-## Building blocks and composability
+By default, `MPI.jl` will download and link against an MPI artifact (Microsoft MPI on Windows and MPICH on all other platforms). Then drivers should be run using [Julia's wrapper for `mpiexec`](https://juliaparallel.org/MPI.jl/stable/usage/#Julia-wrapper-for-mpiexec).
 
-The figure below depicts the relation among `GridapDistributed.jl` and other packages in the Julia package ecosystem. The interaction of `GridapDistributed.jl` and its dependencies is mainly designed with separation of concerns in mind towards high composability and modularity. On the one hand, `Gridap.jl` provides a rich set of abstract types/interfaces suitable for the FE solution of PDEs. It also provides realizations (implementations) of these abstractions tailored to serial/multi-threaded computing environments. `GridapDistributed.jl` **implements** these abstractions for parallel distributed-memory computing environments. To this end, `GridapDistributed.jl` also leverages (**uses**) the serial realizations in `Gridap.jl` and associated methods to handle the local portion on each parallel task. (See arrow labels in the figure below.)  On the other hand, `GridapDistributed.jl` relies on `PartitionedArrays.jl` in order to handle the parallel execution model (e.g., message-passing via the Message Passing Interface (MPI)), global data distribution layout, and communication among tasks. `PartitionedArrays.jl` also provides a parallel implementation of partitioned global linear systems (i.e., linear algebra vectors and sparse matrices) as needed in grid-based numerical simulations. While `PartitionedArrays.jl` is an stand-alone package, segregated from `GridapDistributed.jl`, it was designed with parallel FE packages such as `GridapDistributed.jl` in mind. In any case, `GridapDistributed.jl` is designed so that a different distributed linear algebra library from `PartitionedArrays.jl` might be used as well, as far as it is able to provide the same functionality. 
+Within clusters, however, you might want to use the (often custom) MPI implementation provided by your system administrators. To do so, you should use [`MPIPreferences.jl`](https://juliaparallel.org/MPI.jl/stable/configuration/#using_system_mpi) and follow the instructions provided in [the `MPI.jl` documentation](https://juliaparallel.org/MPI.jl/stable/configuration/).
 
+## Package ecosystem
 
-| ![fig:packages](./packages_sketchy.png) |
-|:--:|
-|`GridapDistributed.jl` and its relation to other packages in the Julia package ecosystem. In this diagram, each rectangle represents  a Julia package, while the (directed) arrows represent relations (dependencies) among packages. Both the direction of the arrow and the label attached to the arrows are used to denote the nature of the relation. Thus, e.g., `GridapDistributed.jl` depends on `Gridap.jl` and `PartitionedArrays.jl` , and GridapPETSc depends on `Gridap.jl`  and `PartitionedArrays.jl` . Note that, in the diagram, the arrow direction is relevant, e.g., GridapP4est depends on `GridapDistributed.jl` but not conversely.|
+`GridapDistributed.jl` is integrated into the Gridap ecosystem. In particular, it can be used together with most packages within the ecosystem, such as:
 
-## How to use this documentation
+- [`GridapGmsh.jl`](https://github.com/gridap/GridapGmsh.jl) for mesh generation and import.
+- [`GridapSolvers.jl`](https://github.com/gridap/GridapSolvers.jl) and [`GridapPETSc.jl`](https://github.com/gridap/GridapPETSc.jl) for distributed solvers and preconditioners.
+- [`GridapP4est.jl`](https://github.com/gridap/GridapP4est.jl) for parallel adaptive mesh refinement.
+- [`GridapEmbedded.jl`](https://github.com/gridap/GridapEmbedded.jl) for embedded boundary methods.
+- [`GridapTopOpt.jl`](https://github.com/zjwegert/GridapTopOpt.jl) for parallel topology optimization.
 
-* The first step for new users is to visit the [Getting Started](@ref) page.
+## Documentation overview
 
-* A set of tutorials written as Jupyter notebooks and html pages are available [here](https://github.com/gridap/Tutorials).
+| Section | What you will find |
+|---|---|
+| [Backends](@ref backends) | Design goals, serial-to-distributed conversion, debug vs MPI backends |
+| [Algebra](@ref algebra) | PartitionedArrays overview: `PRange`, `PVector`, `PSparseMatrix` |
+| [Geometry](@ref geometry) | Distributed meshes, ghost cells, Cartesian and unstructured models |
+| [FESpaces](@ref fespaces) | DOF numbering, ghost DOFs, multi-field spaces, cell fields |
+| [Assembly](@ref assembly) | `Assembled`, `SubAssembled`, `LocallyAssembled` strategies |
+| [Adaptivity](@ref adaptivity) | AMR interface, uniform refinement, GridapP4est |
+| [Visualization](@ref visualization) | Parallel VTK output for ParaView |
 
-* The detailed documentation is in the rest of sections of the left hand side menu. (WIP)
+## Tutorials
 
-* Guidelines for developers of the Gridap project is found in the [Gridap wiki](https://github.com/gridap/Gridap.jl/wiki) page.
+If you are new to the `Gridap` ecosystem of packages, we recommend that you first follow the [Gridap Tutorials](https://gridap.github.io/Tutorials/dev/) step by step in order to get familiar with the `Gridap.jl` library. `GridapDistributed.jl` and `Gridap.jl` share almost the same high-level API. Therefore, some familiarity with `Gridap.jl` is highly recommended (if not essential) before starting with `GridapDistributed.jl`
 
-## Julia educational resources
+We also provide some distributed-specific tutorials in [Gridap Tutorials](https://gridap.github.io/Tutorials/dev/). Examples with distributed solvers can also be found within the documentation of [`GridapSolvers.jl`](https://github.com/gridap/GridapSolvers.jl).
 
-A basic knowledge of the Julia programming language is needed to use the GridapDistributed package.
-Here, one can find a list of resources to get started with this programming language.
+## Citation
 
-* First steps to learn Julia form the [Gridap wiki](https://github.com/gridap/Gridap.jl/wiki/Start-learning-Julia) page.
-* Official webpage [docs.julialang.org](https://docs.julialang.org/)
-* Official list of learning resources [julialang.org/learning](https://julialang.org/learning/)
\ No newline at end of file
+In order to give credit to the `Gridap` and `GridapDistributed` contributors, we simply ask you to cite the `Gridap` main project as indicated [here](https://github.com/gridap/Gridap.jl#how-to-cite-gridap) and the sub-packages you use as indicated in the corresponding repositories. Please, use the reference below in any publication in which you have made use of `GridapDistributed`:
+
+```latex
+@article{Badia2022,
+  doi = {10.21105/joss.04157},
+  url = {https://doi.org/10.21105/joss.04157},
+  year = {2022},
+  publisher = {The Open Journal},
+  volume = {7},
+  number = {74},
+  pages = {4157},
+  author = {Santiago Badia and Alberto F. Martín and Francesc Verdugo},
+  title = {GridapDistributed: a massively parallel finite element toolbox in Julia},
+  journal = {Journal of Open Source Software}
+}
+```
diff --git a/docs/src/visualization.md b/docs/src/visualization.md
new file mode 100644
index 00000000..d294e7c9
--- /dev/null
+++ b/docs/src/visualization.md
@@ -0,0 +1,40 @@
+# [Visualization](@id visualization)
+
+`GridapDistributed.jl` supports parallel VTK output via `writevtk`, using the same
+interface as sequential Gridap. Each MPI rank writes its local portion as a `.vtu` file;
+the VTK parallel format (`.pvtu`) is written by rank 0 and references all pieces.
+Open the `.pvtu` file in [ParaView](https://www.paraview.org/) to visualise the global
+solution.
+
+## Static output
+
+```julia
+writevtk(Ω, "solution", cellfields=["uh" => uh, "eh" => eh])
+```
+
+This generates `solution.pvtu` (the parallel descriptor) and one `solution_n.vtu` per
+rank. Ghost cells are omitted by default; each cell appears in exactly one `.vtu` file.
+
+## Time-dependent output
+
+Use `createpvd` / `savepvd` to collect snapshots from a time loop into a single `.pvd`
+collection that ParaView can animate:
+
+```julia
+pvd = createpvd(get_parts(model), "simulation")
+
+for (t, uh) in time_steps
+  pvd[t] = createvtk(Ω, "solution_$(t)", cellfields=["uh" => uh])
+end
+
+savepvd(pvd)
+```
+
+`get_parts(model)` provides the rank array needed to coordinate parallel file I/O.
+
+## API
+
+```@docs
+GridapDistributed.DistributedPvd
+GridapDistributed.DistributedVisualizationData
+```
diff --git a/src/Adaptivity.jl b/src/Adaptivity.jl
index a9e798c4..572c898e 100644
--- a/src/Adaptivity.jl
+++ b/src/Adaptivity.jl
@@ -1,6 +1,16 @@
 
 # DistributedAdaptedDiscreteModels
 
+"""
+    DistributedAdaptedDiscreteModel{Dc,Dp}
+
+Type alias for a [`GenericDistributedDiscreteModel`](@ref) whose local models are
+`AdaptedDiscreteModel` objects from Gridap. Represents a distributed mesh obtained by
+refining a coarser mesh, together with the parent-child cell relationship (adaptivity glue).
+
+Constructed by `DistributedAdaptedDiscreteModel(fine_model, coarse_model, glue_array)`.
+Use `get_model`, `get_parent`, and `get_adaptivity_glue` to retrieve components.
+"""
 const DistributedAdaptedDiscreteModel{Dc,Dp} = GenericDistributedDiscreteModel{Dc,Dp,<:AbstractArray{<:AdaptedDiscreteModel{Dc,Dp}}}
 
 struct DistributedAdaptedDiscreteModelCache{A,B,C}
@@ -297,8 +307,8 @@ function get_cartesian_redistribute_glue(
       # If I'm in the old subprocessor,
       #   - I send all owned old cells that I don't own in the new model.
       #   - I receive all owned new cells that I don't own in the old model.
-      old2new = replace(find_local_to_local_map(old_ids,new_ids), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_ids,old_ids), -1 => 0)
+      old2new = replace(local_to_local_map(old_ids,new_ids), -1 => 0)
+      new2old = replace(local_to_local_map(new_ids,old_ids), -1 => 0)
 
       new_l2o = local_to_own(new_ids)
       old_l2o = local_to_own(old_ids)
@@ -604,12 +614,12 @@ function refine_cell_gids(
   ranks = linear_indices(cgids)
 
   # Create own numbering (without ghosts)
-  num_f_owned_cells = map(length,f_own_to_local)
-  num_f_gids = reduce(+,num_f_owned_cells)
-  first_f_gid = scan(+,num_f_owned_cells,type=:exclusive,init=1)
+  num_f_oids  = map(length,f_own_to_local)
+  num_f_gids  = reduction(+,num_f_oids,destination=:all,init=0)
+  first_f_gid = scan(+,num_f_oids,type=:exclusive,init=1)
   
-  own_fgids = map(ranks,first_f_gid,num_f_owned_cells) do rank,first_f_gid,num_f_owned_cells
-    f_o2g = collect(first_f_gid:first_f_gid+num_f_owned_cells-1)
+  own_fgids = map(ranks,num_f_gids,first_f_gid,num_f_oids) do rank,num_f_gids,first_f_gid,num_f_oids
+    f_o2g = collect(first_f_gid:first_f_gid+num_f_oids-1)
     own   = OwnIndices(num_f_gids,rank,f_o2g)
     ghost = GhostIndices(num_f_gids) # No ghosts
     return OwnAndGhostIndices(own,ghost)
@@ -628,7 +638,9 @@ function refine_cell_gids(
   # collect two keys: 
   #   1. The global id of the coarse parent
   #   2. The child id of the fine cell
-  parent_gids_snd, child_ids_snd = map(cgids,cmodels,fmodels,lids_snd) do cgids,cmodel,fmodel,lids_snd
+  parent_gids_snd, child_ids_snd = map(
+    cgids,cmodels,fmodels,lids_snd
+  ) do cgids,cmodel,fmodel,lids_snd
     glue = get_adaptivity_glue(fmodel)
     f2c_map = glue.n2o_faces_map[Dc+1]
     child_map = glue.n2o_cell_to_child_id
@@ -656,8 +668,8 @@ function refine_cell_gids(
   # We process the received keys, and collect the global ids of the fine cells
   # that have been requested by our neighbors.
   child_gids_rcv = map(
-    cgids,own_fgids,f_own_to_local,cmodels,fmodels,parent_gids_rcv,child_ids_rcv
-  ) do cgids,own_fgids,f_own_to_local,cmodel,fmodel,parent_gids_rcv,child_ids_rcv
+    cgids,own_fgids,f_own_to_local,fmodels,parent_gids_rcv,child_ids_rcv
+  ) do cgids,own_fgids,f_own_to_local,fmodel,parent_gids_rcv,child_ids_rcv
     glue = get_adaptivity_glue(fmodel)
     c2f_map = glue.o2n_faces_map
     child_map = glue.n2o_cell_to_child_id
@@ -683,15 +695,15 @@ function refine_cell_gids(
   t = exchange(child_gids_rcv,graph)
   child_gids_snd = fetch(t)
   
-  # We finally can create the global numeration of the fine cells by piecing together: 
+  # We finally can create the global numeration of the fine cells by piecing together:
   #   1. The (local ids,global ids) of the owned fine cells
   #   2. The (owners,local ids,global ids) of the ghost fine cells
-  fgids = map(
+  local2global, local2owner = map(
     ranks,f_own_to_local,own_fgids,parts_snd,lids_snd,child_gids_snd
   ) do rank,own_lids,own_gids,nbors,ghost_lids,ghost_gids
 
     own2global = own_to_global(own_gids)
-  
+
     n_own   = length(own_lids)
     n_ghost = length(ghost_lids.data)
     local2global = fill(0,n_own+n_ghost)
@@ -702,7 +714,7 @@ function refine_cell_gids(
       local2global[lid] = own2global[oid]
       local2owner[lid]  = rank
     end
-    
+
     # Ghost cells
     for (n,nbor) in enumerate(nbors)
       for i in ghost_lids.ptrs[n]:ghost_lids.ptrs[n+1]-1
@@ -712,9 +724,13 @@ function refine_cell_gids(
         local2owner[lid]  = nbor
       end
     end
-    return LocalIndices(num_f_gids,rank,local2global,local2owner)
-  end
+    return local2global, local2owner
+  end |> tuple_of_arrays
 
+  fgids = permuted_variable_partition(
+    num_f_oids, local2global, local2owner;
+    n_global=num_f_gids, start=first_f_gid
+  )
   return PRange(fgids)
 end
 
@@ -760,7 +776,7 @@ function Adaptivity.coarsen(fmodel::DistributedDiscreteModel, ptopo::Distributed
 
   # First, some preliminary checks
   fgids = partition(get_cell_gids(fmodel))
-  map(local_views(ptopo), fgids) do ptopo, fids
+  foreach(local_views(ptopo), fgids) do ptopo, fids
     patch_cells = Geometry.get_patch_cells(ptopo)
     lcell_to_ocell = local_to_own(fids)
     ocell_to_lcell = own_to_local(fids)
diff --git a/src/Algebra.jl b/src/Algebra.jl
index 8964fc88..0ee01ed7 100644
--- a/src/Algebra.jl
+++ b/src/Algebra.jl
@@ -29,212 +29,32 @@ function Algebra.allocate_in_domain(matrix::BlockPMatrix)
   allocate_in_domain(BlockPVector{V},matrix)
 end
 
-# PSparseMatrix copy
-
-function Base.copy(a::PSparseMatrix)
-  mats = map(copy,partition(a))
-  cache = map(PartitionedArrays.copy_cache,a.cache)
-  return PSparseMatrix(mats,partition(axes(a,1)),partition(axes(a,2)),cache)
-end
-
-# PartitionedArrays extras
-
-function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
-  @check partition(axes(x,1)) === partition(axes(y,1))
-  map(partition(x),partition(y)) do x,y
-    LinearAlgebra.axpy!(α,x,y)
-  end
-  consistent!(y) |> wait
-  return y
-end
-
-function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
-  map(blocks(x),blocks(y)) do x,y
-    LinearAlgebra.axpy!(α,x,y)
-  end
-  return y
-end
-
-function Algebra.axpy_entries!(
-  α::Number, A::PSparseMatrix, B::PSparseMatrix;
-  check::Bool=true
-)
-# We should definitely check here that the index partitions are the same. 
-# However: Because the different matrices are assembled separately, the objects are not the 
-# same (i.e can't use ===). Checking the index partitions would then be costly...
-  @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
-  @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
-  map(partition(A),partition(B)) do A, B
-    Algebra.axpy_entries!(α,A,B;check)
-  end
-  return B
-end
-
-function Algebra.axpy_entries!(
-  α::Number, A::BlockPMatrix, B::BlockPMatrix;
-  check::Bool=true
-)
-  map(blocks(A),blocks(B)) do A, B
-    Algebra.axpy_entries!(α,A,B;check)
-  end
-  return B
-end
-
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.ArrayCounter,axes)
-  @notimplemented
-end
-
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A}
-  b = Algebra.CounterCOO{T}(axes)
-  b.nnz = a.nnz
-  b
-end
-
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A}
-  counter = change_axes(a.counter,axes)
-  Algebra.AllocationCOO(counter,a.I,a.J,a.V)
-end
-
-# Array of PArrays -> PArray of Arrays 
-function to_parray_of_arrays(a::AbstractArray{<:MPIArray})
-  indices = linear_indices(first(a))
-  map(indices) do i
-    map(a) do aj
-      PartitionedArrays.getany(aj)
-    end
-  end
-end
-
-function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
-  indices = linear_indices(first(a))
-  map(indices) do i
-    map(a) do aj
-      aj.items[i]
-    end
-  end
-end
-
-function to_parray_of_arrays(a::AbstractArray{<:Vector})
-  indices = linear_indices(first(a))
-  map(indices) do i
-    map(a) do aj
-      aj[i]
-    end
-  end
-end
-
-# This type is required because MPIArray from PArrays 
-# cannot be instantiated with a NULL communicator
-struct MPIVoidVector{T} <: AbstractVector{T}
-  comm::MPI.Comm
-  function MPIVoidVector(::Type{T}) where {T}
-    new{T}(MPI.COMM_NULL)
-  end
-end
-
-Base.size(a::MPIVoidVector) = (0,)
-Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear()
-function Base.getindex(a::MPIVoidVector,i::Int)
-  error("Indexing of MPIVoidVector not possible.")
-end
-function Base.setindex!(a::MPIVoidVector,v,i::Int)
-  error("Indexing of MPIVoidVector not possible.")
-end
-function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector)
-  println(io,"MPIVoidVector")
-end
-
-function num_parts(comm::MPI.Comm)
-  if comm != MPI.COMM_NULL
-    nparts = MPI.Comm_size(comm)
-  else
-    nparts = -1
-  end
-  nparts
-end
-@inline num_parts(comm::MPIArray) = num_parts(comm.comm)
-@inline num_parts(comm::DebugArray) = length(comm.items)
-@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm)
-
-function get_part_id(comm::MPI.Comm)
-  if comm != MPI.COMM_NULL
-    id = MPI.Comm_rank(comm)+1
-  else
-    id = -1
-  end
-  id
-end
-@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm)
-@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm)
+# local_views
 
 """
-    i_am_in(comm::MPIArray)
-    i_am_in(comm::DebugArray)
-  
-  Returns `true` if the processor is part of the subcommunicator `comm`.
-"""
-function i_am_in(comm::MPI.Comm)
-  get_part_id(comm) >=0
-end
-@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm)
-@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm)
-@inline i_am_in(comm::DebugArray) = true
-
-change_parts(x, new_parts; default=nothing) = change_parts(x, new_parts, map(_ -> default, new_parts))
-
-function change_parts(x::MPIArray, new_parts, defaults)
-  x_new = map((p,d) -> x.item, new_parts, defaults)
-  return x_new
-end
-change_parts(x::DebugArray, new_parts, args...) = change_parts(x.items, new_parts, args...)
-function change_parts(x::AbstractArray, new_parts, defaults)
-  x_new = map((p,d) -> (p <= length(x)) ? x[p] : d, new_parts, defaults)
-  return x_new
-end
-function change_parts(::Union{Nothing,MPIVoidVector}, new_parts, defaults)
-  return defaults
-end
+    local_views(a)
 
-change_parts(f::Function, x, args...; kwargs...) = change_parts(f(x), args...; kwargs...)
-change_parts(f::Function, x::Nothing, args...; kwargs...) = change_parts(nothing, args...; kwargs...)
+Return the distributed array of local (per-rank) objects wrapped by `a`.
 
-function generate_subparts(parts::MPIArray,new_comm_size)
-  root_comm = parts.comm
-  root_size = MPI.Comm_size(root_comm)
-  rank = MPI.Comm_rank(root_comm)
+This is the central accessor for all distributed objects in `GridapDistributed.jl`.
+Use it together with `map` to operate on each rank's local data independently:
 
-  @static if isdefined(MPI,:MPI_UNDEFINED)
-    mpi_undefined = MPI.MPI_UNDEFINED[]
-  else
-    mpi_undefined = MPI.API.MPI_UNDEFINED[]
-  end
-  
-  if root_size == new_comm_size
-    return parts
-  else
-    if rank < new_comm_size
-      comm = MPI.Comm_split(root_comm,0,0)
-      return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false)
-    else
-      comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined)
-      return MPIVoidVector(eltype(parts))
-    end
-  end
-end
-
-function generate_subparts(parts::DebugArray,new_comm_size)
-  DebugArray(LinearIndices((new_comm_size,)))
+```julia
+map(local_views(model)) do local_model
+  println(num_cells(local_model))
 end
-
-# local_views
-
+```
+"""
 function local_views(a)
   @abstractmethod
 end
 
+"""
+    get_parts(a)
+
+Return a distributed array of rank indices for the partition of `a`.
+Equivalent to `linear_indices(local_views(a))`.
+"""
 function get_parts(a)
   return linear_indices(local_views(a))
 end
@@ -299,44 +119,23 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_
   return BlockPVector(vals,ids)
 end
 
-# This function computes a mapping among the local identifiers of a and b
-# for which the corresponding global identifiers are both in a and b. 
-# Note that the haskey check is necessary because in the general case 
-# there might be gids in b which are not present in a
-function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices)
-  a_local_to_b_local = fill(Int32(-1),local_length(a))
-  b_local_to_global  = local_to_global(b)
-  a_global_to_local  = global_to_local(a)
-  for blid in 1:local_length(b)
-    gid = b_local_to_global[blid]
-    if a_global_to_local[gid] != zero(eltype(a_global_to_local))
-      alid = a_global_to_local[gid]
-      a_local_to_b_local[alid] = blid
-    end  
-  end
-  a_local_to_b_local
-end
-
-# This type is required in order to be able to access the local portion 
-# of distributed sparse matrices and vectors using local indices from the 
-# distributed test and trial spaces
-struct LocalView{T,N,A,B} <:AbstractArray{T,N}
+struct LocalView{T,N,A} <:AbstractArray{T,N}
   plids_to_value::A
-  d_to_lid_to_plid::B
+  d_to_lid_to_plid::NTuple{N,Vector{Int32}}
   local_size::NTuple{N,Int}
   function LocalView(
-    plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N}
+    plids_to_value::AbstractArray{T,N}, d_to_lid_to_plid::NTuple{N,Vector{Int32}}
+  ) where {T,N}
     A = typeof(plids_to_value)
-    B = typeof(d_to_lid_to_plid)
     local_size = map(length,d_to_lid_to_plid)
-    new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size)
+    new{T,N,A}(plids_to_value,d_to_lid_to_plid,local_size)
   end
 end
 
 Base.size(a::LocalView) = a.local_size
 Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian()
 function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N}
-  plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid)
+  plids = map(getindex,a.d_to_lid_to_plid,lids)
   if all(i->i>0,plids)
     a.plids_to_value[plids...]
   else
@@ -344,23 +143,18 @@ function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N}
   end
 end
 function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N}
-  plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid)
+  plids = map(getindex,a.d_to_lid_to_plid,lids)
   @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion"
   a.plids_to_value[plids...] = v
 end
 
-function _lid_to_plid(lid,lid_to_plid)
-  plid = lid_to_plid[lid]
-  plid
-end
-
 function local_views(a::PVector,new_rows::PRange)
   old_rows = axes(a,1)
   if partition(old_rows) === partition(new_rows)
     partition(a)
   else
-    map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows
-      LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),))
+    map(partition(a),partition(old_rows),partition(new_rows)) do a,old_rows,new_rows
+      LocalView(a,(local_to_local_map(new_rows,old_rows),))
     end
   end
 end
@@ -370,12 +164,12 @@ function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange)
   if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) )
     partition(a)
   else
-    map(partition(a),
-        partition(old_rows),partition(old_cols),
-        partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols
-      rl2lmap = find_local_to_local_map(new_rows,old_rows)
-      cl2lmap = find_local_to_local_map(new_cols,old_cols)
-      LocalView(matrix_partition,(rl2lmap,cl2lmap))
+    map(
+      partition(a),partition(old_rows),partition(old_cols),partition(new_rows),partition(new_cols)
+    ) do a,old_rows,old_cols,new_rows,new_cols
+      rl2lmap = local_to_local_map(new_rows,old_rows)
+      cl2lmap = local_to_local_map(new_cols,old_cols)
+      LocalView(a,(rl2lmap,cl2lmap))
     end
   end
 end
@@ -391,867 +185,3 @@ function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange
   end |> to_parray_of_arrays
   return map(mortar,vals)
 end
-
-# PSparseMatrix assembly
-
-struct FullyAssembledRows end
-struct SubAssembledRows end
-
-# For the moment we use COO format even though
-# it is quite memory consuming.
-# We need to implement communication in other formats in
-# PartitionedArrays.jl
-
-struct PSparseMatrixBuilderCOO{T,B}
-  local_matrix_type::Type{T}
-  par_strategy::B
-end
-
-function Algebra.nz_counter(
-  builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}) where A
-  test_dofs_gids_prange, trial_dofs_gids_prange = axs
-  counters = map(partition(test_dofs_gids_prange),partition(trial_dofs_gids_prange)) do r,c
-    axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c)))
-    Algebra.CounterCOO{A}(axs)
-  end
-  DistributedCounterCOO(builder.par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange)
-end
-
-function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv
-  T = eltype(Tv)
-  return PSparseMatrix{T}
-end
-
-
-"""
-"""
-struct DistributedCounterCOO{A,B,C,D} <: GridapType
-  par_strategy::A
-  counters::B
-  test_dofs_gids_prange::C
-  trial_dofs_gids_prange::D
-  function DistributedCounterCOO(
-    par_strategy,
-    counters::AbstractArray{<:Algebra.CounterCOO},
-    test_dofs_gids_prange::PRange,
-    trial_dofs_gids_prange::PRange)
-    A = typeof(par_strategy)
-    B = typeof(counters)
-    C = typeof(test_dofs_gids_prange)
-    D = typeof(trial_dofs_gids_prange)
-    new{A,B,C,D}(par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange)
-  end
-end
-
-function local_views(a::DistributedCounterCOO)
-  a.counters
-end
-
-function local_views(a::DistributedCounterCOO,test_dofs_gids_prange,trial_dofs_gids_prange)
-  @check test_dofs_gids_prange === a.test_dofs_gids_prange
-  @check trial_dofs_gids_prange === a.trial_dofs_gids_prange
-  a.counters
-end
-
-function Algebra.nz_allocation(a::DistributedCounterCOO)
-  allocs = map(nz_allocation,a.counters)
-  DistributedAllocationCOO(a.par_strategy,allocs,a.test_dofs_gids_prange,a.trial_dofs_gids_prange)
-end
-
-struct DistributedAllocationCOO{A,B,C,D} <: GridapType
-  par_strategy::A
-  allocs::B
-  test_dofs_gids_prange::C
-  trial_dofs_gids_prange::D
-  function DistributedAllocationCOO(
-    par_strategy,
-    allocs::AbstractArray{<:Algebra.AllocationCOO},
-    test_dofs_gids_prange::PRange,
-    trial_dofs_gids_prange::PRange)
-    A = typeof(par_strategy)
-    B = typeof(allocs)
-    C = typeof(test_dofs_gids_prange)
-    D = typeof(trial_dofs_gids_prange)
-    new{A,B,C,D}(par_strategy,allocs,test_dofs_gids_prange,trial_dofs_gids_prange)
-  end
-end
-
-function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange},
-                     axes::Tuple{<:PRange,<:PRange}) where {A,B}
-  local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols
-    (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols)))
-  end
-  allocs = map(change_axes,a.allocs,local_axes)
-  DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2])
-end
-
-function change_axes(a::MatrixBlock{<:DistributedAllocationCOO},
-                     axes::Tuple{<:Vector,<:Vector})
-  block_ids  = CartesianIndices(a.array)
-  rows, cols = axes
-  array = map(block_ids) do I
-    change_axes(a[I],(rows[I[1]],cols[I[2]]))
-  end
-  return ArrayBlock(array,a.touched)
-end
-
-function local_views(a::DistributedAllocationCOO)
-  a.allocs
-end
-
-function local_views(a::DistributedAllocationCOO,test_dofs_gids_prange,trial_dofs_gids_prange)
-  @check test_dofs_gids_prange === a.test_dofs_gids_prange
-  @check trial_dofs_gids_prange === a.trial_dofs_gids_prange
-  a.allocs
-end
-
-function local_views(a::MatrixBlock{<:DistributedAllocationCOO})
-  array = map(local_views,a.array) |> to_parray_of_arrays
-  return map(ai -> ArrayBlock(ai,a.touched),array)
-end
-
-function get_allocations(a::DistributedAllocationCOO)
-  I,J,V = map(local_views(a)) do alloc
-    alloc.I, alloc.J, alloc.V
-  end |> tuple_of_arrays
-  return I,J,V
-end
-
-function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO})
-  tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays
-  return tuple_of_array_of_parrays
-end
-
-get_test_gids(a::DistributedAllocationCOO)  = a.test_dofs_gids_prange
-get_trial_gids(a::DistributedAllocationCOO) = a.trial_dofs_gids_prange
-get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO})  = map(get_test_gids,diag(a.array))
-get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array))
-
-function Algebra.create_from_nz(a::PSparseMatrix)
-  # For FullyAssembledRows the underlying Exchanger should
-  # not have ghost layer making assemble! do nothing (TODO check)
-  assemble!(a) |> wait
-  a
-end
-
-function Algebra.create_from_nz(a::DistributedAllocationCOO{<:FullyAssembledRows})
-  f(x) = nothing
-  A, = _fa_create_from_nz_with_callback(f,a)
-  return A
-end
-
-function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:FullyAssembledRows}})
-  f(x) = nothing
-  A, = _fa_create_from_nz_with_callback(f,a)
-  return A
-end
-
-function _fa_create_from_nz_with_callback(callback,a)
-
-  # Recover some data
-  I,J,V = get_allocations(a)
-  test_dofs_gids_prange  = get_test_gids(a)
-  trial_dofs_gids_prange = get_trial_gids(a)
-
-  rows = _setup_prange(test_dofs_gids_prange,I;ghost=false,ax=:rows)
-  b = callback(rows)
-
-  # convert I and J to global dof ids
-  to_global_indices!(I,test_dofs_gids_prange;ax=:rows)
-  to_global_indices!(J,trial_dofs_gids_prange;ax=:cols)
-
-  # Create the range for cols
-  cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols)
-
-  # Convert again I,J to local numeration
-  to_local_indices!(I,rows;ax=:rows)
-  to_local_indices!(J,cols;ax=:cols)
-
-  # Adjust local matrix size to linear system's index sets
-  asys = change_axes(a,(rows,cols))
-
-  # Compress local portions
-  values = map(create_from_nz,local_views(asys))
-
-  # Finally build the matrix
-  A = _setup_matrix(values,rows,cols)
-  return A, b
-end
-
-function Algebra.create_from_nz(a::DistributedAllocationCOO{<:SubAssembledRows})
-  f(x) = nothing
-  A, = _sa_create_from_nz_with_callback(f,f,a,nothing)
-  return A
-end
-
-function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:SubAssembledRows}})
-  f(x) = nothing
-  A, = _sa_create_from_nz_with_callback(f,f,a,nothing)
-  return A
-end
-
-function _sa_create_from_nz_with_callback(callback,async_callback,a,b)
-  # Recover some data
-  I,J,V = get_allocations(a)
-  test_dofs_gids_prange = get_test_gids(a)
-  trial_dofs_gids_prange = get_trial_gids(a)
-
-  # convert I and J to global dof ids
-  to_global_indices!(I,test_dofs_gids_prange;ax=:rows)
-  to_global_indices!(J,trial_dofs_gids_prange;ax=:cols)
-
-  # Create the Prange for the rows
-  rows = _setup_prange(test_dofs_gids_prange,I;ax=:rows)
-  
-  # Move (I,J,V) triplets to the owner process of each row I.
-  # The triplets are accompanyed which Jo which is the process column owner
-  Jo = get_gid_owners(J,trial_dofs_gids_prange;ax=:cols)
-  t  = _assemble_coo!(I,J,V,rows;owners=Jo)
-
-  # Here we can overlap computations
-  # This is a good place to overlap since
-  # sending the matrix rows is a lot of data
-  if !isa(b,Nothing)
-    bprange=_setup_prange_from_pvector_allocation(b)
-    b = callback(bprange)
-  end
-
-  # Wait the transfer to finish
-  wait(t)
-
-  # Create the Prange for the cols
-  cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols,owners=Jo)
-
-  # Overlap rhs communications with CSC compression
-  t2 = async_callback(b)
-
-  # Convert again I,J to local numeration
-  to_local_indices!(I,rows;ax=:rows)
-  to_local_indices!(J,cols;ax=:cols)
-
-  # Adjust local matrix size to linear system's index sets
-  asys = change_axes(a,(rows,cols))
-
-  # Compress the local matrices
-  values = map(create_from_nz,local_views(asys))
-
-  # Wait the transfer to finish
-  if !isa(t2,Nothing)
-    wait(t2)
-  end
-
-  # Finally build the matrix
-  A = _setup_matrix(values,rows,cols)
-  return A, b
-end
-
-
-# PVector assembly 
-
-struct PVectorBuilder{T,B}
-  local_vector_type::Type{T}
-  par_strategy::B
-end
-
-function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange})
-  T = builder.local_vector_type
-  rows, = axs
-  counters = map(partition(rows)) do rows
-    axs = (Base.OneTo(local_length(rows)),)
-    nz_counter(ArrayBuilder(T),axs)
-  end
-  PVectorCounter(builder.par_strategy,counters,rows)
-end
-
-function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
-  T = eltype(Tv)
-  return PVector{T}
-end
-
-struct PVectorCounter{A,B,C}
-  par_strategy::A
-  counters::B
-  test_dofs_gids_prange::C
-end
-
-Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
-
-function local_views(a::PVectorCounter)
-  a.counters
-end
-
-function local_views(a::PVectorCounter,rows)
-  @check rows === a.test_dofs_gids_prange
-  a.counters
-end
-
-function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows})
-  dofs = a.test_dofs_gids_prange
-  values = map(nz_allocation,a.counters)
-  PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs)
-end
-
-struct PVectorAllocationTrackOnlyValues{A,B,C}
-  par_strategy::A
-  values::B
-  test_dofs_gids_prange::C
-end
-
-function local_views(a::PVectorAllocationTrackOnlyValues)
-  a.values
-end
-
-function local_views(a::PVectorAllocationTrackOnlyValues,rows)
-  @check rows === a.test_dofs_gids_prange
-  a.values
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows})
-  rows = _setup_prange_without_ghosts(a.test_dofs_gids_prange)
-  _rhs_callback(a,rows)
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:SubAssembledRows})
-  # This point MUST NEVER be reached. If reached there is an inconsistency
-  # in the parallel code in charge of vector assembly
-  @assert false
-end
-
-function _rhs_callback(row_partitioned_vector_partition,rows)
-  # The ghost values in row_partitioned_vector_partition are 
-  # aligned with the FESpace but not with the ghost values in the rows of A
-  b_fespace = PVector(row_partitioned_vector_partition.values,
-                      partition(row_partitioned_vector_partition.test_dofs_gids_prange))
-
-  # This one is aligned with the rows of A
-  b = similar(b_fespace,eltype(b_fespace),(rows,))
-
-  # First transfer owned values
-  b .= b_fespace
-
-  # Now transfer ghost
-  function transfer_ghost(b,b_fespace,ids,ids_fespace)
-    num_ghosts_vec = ghost_length(ids)
-    gho_to_loc_vec = ghost_to_local(ids)
-    loc_to_glo_vec = local_to_global(ids)
-    gid_to_lid_fe  = global_to_local(ids_fespace)
-    for ghost_lid_vec in 1:num_ghosts_vec
-      lid_vec     = gho_to_loc_vec[ghost_lid_vec]
-      gid         = loc_to_glo_vec[lid_vec]
-      lid_fespace = gid_to_lid_fe[gid]
-      b[lid_vec] = b_fespace[lid_fespace]
-    end
-  end
-  map(
-    transfer_ghost,
-    partition(b),
-    partition(b_fespace),
-    b.index_partition,
-    b_fespace.index_partition)
-
-  return b
-end
-
-function Algebra.create_from_nz(a::PVector)
-  assemble!(a) |> wait
-  return a
-end
-
-function Algebra.create_from_nz(
-  a::DistributedAllocationCOO{<:FullyAssembledRows},
-  c_fespace::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows})
-
-  function callback(rows)
-    _rhs_callback(c_fespace,rows)
-  end
-
-  A,b = _fa_create_from_nz_with_callback(callback,a)
-  return A,b
-end
-
-struct PVectorAllocationTrackTouchedAndValues{A,B,C}
-  allocations::A
-  values::B
-  test_dofs_gids_prange::C
-end
-
-function Algebra.create_from_nz(
-  a::DistributedAllocationCOO{<:SubAssembledRows},
-  b::PVectorAllocationTrackTouchedAndValues)
-
-  function callback(rows)
-    _rhs_callback(b,rows)
-  end
-
-  function async_callback(b)
-    # now we can assemble contributions
-    assemble!(b)
-  end
-
-  A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b)
-  return A,b
-end
-
-struct ArrayAllocationTrackTouchedAndValues{A}
-  touched::Vector{Bool}
-  values::A
-end
-
-Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop()
-
-
-function local_views(a::PVectorAllocationTrackTouchedAndValues,rows)
-  @check rows === a.test_dofs_gids_prange
-  a.allocations
-end
-
-@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j)
-  @notimplemented
-end
-@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i)
-  if i>0
-    if !(a.touched[i])
-      a.touched[i]=true
-    end
-    if !isa(v,Nothing)
-      a.values[i]=c(v,a.values[i])
-    end
-  end
-  nothing
-end
-@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j)
-  @notimplemented
-end
-@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i)
-  if !isa(v,Nothing)
-    for (ve,ie) in zip(v,i)
-      Arrays.add_entry!(c,a,ve,ie)
-    end
-  else
-    for ie in i
-      Arrays.add_entry!(c,a,nothing,ie)
-    end
-  end
-  nothing
-end
-
-
-function _setup_touched_and_allocations_arrays(values)
-  touched = map(values) do values
-    fill!(Vector{Bool}(undef,length(values)),false)
-  end
-  allocations = map(values,touched) do values,touched
-   ArrayAllocationTrackTouchedAndValues(touched,values)
-  end
-  touched, allocations
-end
-
-function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows},
-                              b::PVectorCounter{<:SubAssembledRows})
-  A      = nz_allocation(a)
-  dofs   = b.test_dofs_gids_prange
-  values = map(nz_allocation,b.counters)
-  touched,allocations=_setup_touched_and_allocations_arrays(values)
-  B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
-  return A,B
-end
-
-function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows})
-  dofs = a.test_dofs_gids_prange
-  values = map(nz_allocation,a.counters)
-  touched,allocations=_setup_touched_and_allocations_arrays(values)
-  return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
-end
-
-function local_views(a::PVectorAllocationTrackTouchedAndValues)
-  a.allocations
-end
-
-function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues)
-
-  # Find the ghost rows
-  allocations = local_views(a.allocations)
-  indices = partition(a.test_dofs_gids_prange)
-  I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices
-    dofs_lids_touched = findall(allocation.touched)
-    loc_to_gho = local_to_ghost(indices)
-    n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched)
-    I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids)
-    cur = 1
-    for lid in dofs_lids_touched
-      dof_lid = loc_to_gho[lid]
-      if dof_lid != 0
-        I_ghost_lids[cur] = dof_lid
-        cur = cur+1
-      end
-    end
-    I_ghost_lids
-  end
-
-  ghost_to_global, ghost_to_owner = map(
-    find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays
-
-  ngids = length(a.test_dofs_gids_prange)
-  _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner)
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues)
-  rows = _setup_prange_from_pvector_allocation(a)
-  b    = _rhs_callback(a,rows)
-  t2   = assemble!(b)
-  # Wait the transfer to finish
-  if t2 !== nothing
-    wait(t2)
-  end
-  return b
-end
-
-# Common Assembly Utilities
-function first_gdof_from_ids(ids)
-  if own_length(ids) == 0
-    return 1
-  end
-  lid_to_gid   = local_to_global(ids) 
-  owned_to_lid = own_to_local(ids)
-  return Int(lid_to_gid[first(owned_to_lid)])
-end
-
-function find_gid_and_owner(ighost_to_jghost,jindices)
-  jghost_to_local  = ghost_to_local(jindices)
-  jlocal_to_global = local_to_global(jindices)
-  jlocal_to_owner  = local_to_owner(jindices)
-  ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost))
-
-  ighost_to_global = jlocal_to_global[ighost_to_jlocal]
-  ighost_to_owner  = jlocal_to_owner[ighost_to_jlocal]
-  return ighost_to_global, ighost_to_owner
-end
-
-# The given ids are assumed to be a sub-set of the lids
-function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer})
-  glo_to_loc = global_to_local(a)
-  loc_to_gho = local_to_ghost(a)
-  
-  # First pass: Allocate
-  i = 0
-  ghost_lids_touched = fill(false,ghost_length(a))
-  for gid in gids
-    lid = glo_to_loc[gid]
-    ghost_lid = loc_to_gho[lid]
-    if ghost_lid > 0 && !ghost_lids_touched[ghost_lid]
-      ghost_lids_touched[ghost_lid] = true
-      i += 1
-    end
-  end
-  gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i)
-
-  # Second pass: fill 
-  i = 1
-  fill!(ghost_lids_touched,false)
-  for gid in gids
-    lid = glo_to_loc[gid]
-    ghost_lid = loc_to_gho[lid]
-    if ghost_lid > 0 && !ghost_lids_touched[ghost_lid]
-      ghost_lids_touched[ghost_lid] = true
-      gids_ghost_lid_to_ghost_lid[i] = ghost_lid
-      i += 1
-    end
-  end
-
-  return gids_ghost_lid_to_ghost_lid
-end
-
-# Find the neighbours of partition1 trying 
-# to use those in partition2 as a hint 
-function _find_neighbours!(partition1, partition2)
-  partition2_snd, partition2_rcv = assembly_neighbors(partition2)
-  partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv)
-  return assembly_neighbors(partition1; neighbors=partition2_graph)
-end
-
-# to_global! & to_local! analogs, needed for dispatching in block assembly
-
-function to_local_indices!(I,ids::PRange;kwargs...)
-  map(to_local!,I,partition(ids))
-end
-
-function to_global_indices!(I,ids::PRange;kwargs...)
-  map(to_global!,I,partition(ids))
-end
-
-function get_gid_owners(I,ids::PRange;kwargs...)
-  map(I,partition(ids)) do I, indices 
-    glo_to_loc = global_to_local(indices) 
-    loc_to_own = local_to_owner(indices)
-    map(x->loc_to_own[glo_to_loc[x]], I)
-  end 
-end
-
-for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners]
-  @eval begin
-    function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...)
-      map($f,I,ids)
-    end
-
-    function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows)
-      @check ax ∈ [:rows,:cols]
-      block_ids = CartesianIndices(I)
-      map(block_ids) do id
-        i = id[1]; j = id[2];
-        if ax == :rows
-          $f(I[i,j],ids[i])
-        else
-          $f(I[i,j],ids[j])
-        end
-      end
-    end
-  end
-end
-
-# _setup_matrix : local matrices + global PRanges -> Global matrix
-
-function _setup_matrix(values,rows::PRange,cols::PRange)
-  return PSparseMatrix(values,partition(rows),partition(cols))
-end
-
-function _setup_matrix(values,rows::Vector{<:PRange},cols::Vector{<:PRange})
-  block_ids  = CartesianIndices((length(rows),length(cols)))
-  block_mats = map(block_ids) do I
-    block_values = map(v -> blocks(v)[I],values)
-    return _setup_matrix(block_values,rows[I[1]],cols[I[2]])
-  end
-  return mortar(block_mats)
-end
-
-# _assemble_coo! : local coo triplets + global PRange -> Global coo values
-
-function _assemble_coo!(I,J,V,rows::PRange;owners=nothing)
-  if isa(owners,Nothing)
-    PArrays.assemble_coo!(I,J,V,partition(rows))
-  else
-    assemble_coo_with_column_owner!(I,J,V,partition(rows),owners)
-  end
-end
-
-function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing)
-  block_ids = CartesianIndices(I)
-  map(block_ids) do id
-    i = id[1]; j = id[2];
-    if isa(owners,Nothing)
-      _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i])
-    else
-      _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j])
-    end
-  end
-end
-
-function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown)
-  """
-    Returns three JaggedArrays with the coo triplets
-    to be sent to the corresponding owner parts in parts_snd
-  """
-  function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner)
-    global_to_local_row = global_to_local(row_lids)
-    local_row_to_owner = local_to_owner(row_lids)
-    owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) ))
-    ptrs = zeros(Int32,length(parts_snd)+1)
-    k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner
-    for k in 1:length(k_gi)
-      gi = k_gi[k]
-      li = global_to_local_row[gi]
-      owner = local_row_to_owner[li]
-      if owner != part
-        ptrs[owner_to_i[owner]+1] +=1
-      end
-    end
-    PArrays.length_to_ptrs!(ptrs)
-    gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1)
-    gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1)
-    jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1)
-    v_snd_data = zeros(eltype(k_v),ptrs[end]-1)
-    for k in 1:length(k_gi)
-      gi = k_gi[k]
-      li = global_to_local_row[gi]
-      owner = local_row_to_owner[li]
-      if owner != part
-        gj = k_gj[k]
-        v = k_v[k]
-        p = ptrs[owner_to_i[owner]]
-        gi_snd_data[p] = gi
-        gj_snd_data[p] = gj
-        jo_snd_data[p] = k_jo[k]
-        v_snd_data[p]  = v
-        k_v[k] = zero(v)
-        ptrs[owner_to_i[owner]] += 1
-      end
-    end
-    PArrays.rewind_ptrs!(ptrs)
-    gi_snd = JaggedArray(gi_snd_data,ptrs)
-    gj_snd = JaggedArray(gj_snd_data,ptrs)
-    jo_snd = JaggedArray(jo_snd_data,ptrs)
-    v_snd = JaggedArray(v_snd_data,ptrs)
-    gi_snd, gj_snd, jo_snd, v_snd
-  end
-  """
-    Pushes to coo_entries_with_column_owner the tuples 
-    gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes
-  """
-  function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv)
-    k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner
-    current_n = length(k_gi)
-    new_n = current_n + length(gi_rcv.data)
-    resize!(k_gi,new_n)
-    resize!(k_gj,new_n)
-    resize!(k_jo,new_n)
-    resize!(k_v,new_n)
-    for p in 1:length(gi_rcv.data)
-        k_gi[current_n+p] = gi_rcv.data[p]
-        k_gj[current_n+p] = gj_rcv.data[p]
-        k_jo[current_n+p] = jo_rcv.data[p]
-        k_v[current_n+p] = v_rcv.data[p]
-    end
-  end
-  part = linear_indices(row_partition)
-  parts_snd, parts_rcv = assembly_neighbors(row_partition)
-  coo_entries_with_column_owner = map(tuple,I,J,Jown,V)
-  gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays
-  graph = ExchangeGraph(parts_snd,parts_rcv)
-  t1 = exchange(gi_snd,graph)
-  t2 = exchange(gj_snd,graph)
-  t3 = exchange(jo_snd,graph)
-  t4 = exchange(v_snd,graph)
-  @async begin
-      gi_rcv = fetch(t1)
-      gj_rcv = fetch(t2)
-      jo_rcv = fetch(t3)
-      v_rcv = fetch(t4)
-      map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv)
-      I,J,Jown,V
-  end
-end
-
-# dofs_gids_prange can be either test_dofs_gids_prange or trial_dofs_gids_prange
-# In the former case, gids is a vector of global test dof identifiers, while in the 
-# latter, a vector of global trial dof identifiers
-function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...)
-  if !ghost
-    _setup_prange_without_ghosts(dofs_gids_prange)
-  elseif isa(owners,Nothing)
-    _setup_prange_with_ghosts(dofs_gids_prange,gids)
-  else
-    _setup_prange_with_ghosts(dofs_gids_prange,gids,owners)
-  end
-end
-
-function _setup_prange(
-  dofs_gids_prange::AbstractVector{<:PRange},
-  gids::AbstractMatrix;
-  ax=:rows,ghost=true,owners=nothing
-)
-  @check ax ∈ (:rows,:cols)
-  block_ids = LinearIndices(dofs_gids_prange)
-  pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x))
-
-  gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange
-    gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id]
-    gids_union = pvcat(gids_slice)
-
-    owners_union = nothing
-    if !isnothing(owners)
-      owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id]
-      owners_union = pvcat(owners_slice)
-    end
-
-    return gids_union, owners_union
-  end |> tuple_of_arrays
-  
-  return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union)
-end
-
-# Create PRange for the rows of the linear system
-# without local ghost dofs as per required in the 
-# FullyAssembledRows() parallel assembly strategy 
-function _setup_prange_without_ghosts(dofs_gids_prange::PRange)
-  ngdofs = length(dofs_gids_prange)
-  indices = map(partition(dofs_gids_prange)) do dofs_indices 
-    owner = part_id(dofs_indices)
-    own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
-    ghost_indices = GhostIndices(ngdofs,Int64[],Int32[])
-    OwnAndGhostIndices(own_indices,ghost_indices)
-  end
-  return PRange(indices)
-end
-
-# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition
-# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system.
-# The precondition required for the consistency of any parallel assembly process in GridapDistributed 
-# is that each processor can determine locally with a single layer of ghost cells the global indices and associated 
-# processor owners of the rows that it touches after assembly of integration terms posed on locally-owned entities 
-# (i.e., either cells or faces). 
-function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids)
-  ngdofs = length(dofs_gids_prange)
-  dofs_gids_partition = partition(dofs_gids_prange)
-
-  # Selected ghost ids -> dof PRange ghost ids
-  gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids)
-
-  # Selected ghost ids -> [global dof ids, owner processor ids]
-  gids_ghost_to_global, gids_ghost_to_owner = map(
-    find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition) |> tuple_of_arrays
-
-  return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner)
-end
-
-# Here we cannot assume that the sparse communication graph underlying 
-# trial_dofs_gids_partition is a superset of the one underlying indices.
-# Here we chould check whether it is included and call _find_neighbours!()
-# if this is the case. At present, we are not taking advantage of this, 
-# but let the parallel scalable algorithm to compute the reciprocal to do the job. 
-function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners)
-  ngdofs = length(dofs_gids_prange)
-  dofs_gids_partition = partition(dofs_gids_prange)
-
-  # Selected ghost ids -> [global dof ids, owner processor ids]
-  gids_ghost_to_global, gids_ghost_to_owner = map(
-    gids,owners,dofs_gids_partition) do gids, owners, indices
-    ghost_touched   = Dict{Int,Bool}()
-    ghost_to_global = Int64[] 
-    ghost_to_owner  = Int64[]
-    me = part_id(indices)
-    for (j,jo) in zip(gids,owners)
-      if jo != me
-        if !haskey(ghost_touched,j)
-          push!(ghost_to_global,j)
-          push!(ghost_to_owner,jo)
-          ghost_touched[j] = true
-        end
-      end
-    end
-    ghost_to_global, ghost_to_owner
-  end |> tuple_of_arrays
-
-  return _setup_prange_impl_(ngdofs,
-                             dofs_gids_partition,
-                             gids_ghost_to_global,
-                             gids_ghost_to_owner;
-                             discover_neighbours=false)
-end 
-
-function _setup_prange_impl_(ngdofs,
-                             dofs_gids_partition,
-                             gids_ghost_to_global,
-                             gids_ghost_to_owner;
-                             discover_neighbours=true)
-  indices = map(dofs_gids_partition, 
-                gids_ghost_to_global, 
-                gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner 
-    owner = part_id(dofs_indices)
-    own_indices   = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
-    ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner)
-    OwnAndGhostIndices(own_indices,ghost_indices)
-  end
-  if discover_neighbours
-    _find_neighbours!(indices,dofs_gids_partition)
-  end
-  return PRange(indices)
-end
diff --git a/src/Assembly.jl b/src/Assembly.jl
new file mode 100644
index 00000000..27b25d8c
--- /dev/null
+++ b/src/Assembly.jl
@@ -0,0 +1,490 @@
+
+# Parallel assembly strategies
+
+"""
+    Assembled <: AssemblyStrategy
+
+Standard parallel assembly strategy. Local contributions from ghost cells are
+communicated to their owning ranks and summed. The result is a fully consistent
+global `PSparseMatrix` / `PVector` with each row owned by exactly one rank.
+
+This is the DEFAULT strategy for all standard FE problems.
+"""
+struct Assembled <: FESpaces.AssemblyStrategy end
+
+"""
+    SubAssembled <: AssemblyStrategy
+
+Sub-assembled strategy. Each rank assembles only over its owned cells; no
+communication of ghost-cell contributions is performed. Ghost-DOF rows are left
+incomplete. Intended for subdomain-based methods (e.g. domain decomposition) that
+handle the interface coupling themselves.
+"""
+struct SubAssembled <: FESpaces.AssemblyStrategy end
+
+"""
+    LocallyAssembled <: AssemblyStrategy
+
+Local-only assembly strategy. Each rank assembles over both owned and ghost cells,
+but only owned-DOF rows are kept; ghost-DOF contributions are discarded. This ASSUMES
+that all local contributions for owned rows can be computed locally, without communication.
+
+WARNING: This is NOT true in general and might yield incorrect assembled systems. 
+Use with caution and only if you know what you are doing.
+"""
+struct LocallyAssembled <: FESpaces.AssemblyStrategy end
+
+function local_assembly_strategy(::Union{Assembled,SubAssembled},rows,cols)
+  DefaultAssemblyStrategy()
+end
+
+# When LocallyAssembling, make sure that you also loop over ghost cells.
+function local_assembly_strategy(::LocallyAssembled,rows,cols)
+  rows_local_to_ghost = local_to_ghost(rows)
+  GenericAssemblyStrategy(
+    identity,
+    identity,
+    row->iszero(rows_local_to_ghost[row]),
+    col->true
+  )
+end
+
+# PSparseMatrix and PVector builders
+
+"""
+    DistributedArrayBuilder{T,N,B}
+
+Builder object used by Gridap's assembly machinery to allocate distributed arrays
+(`PVector` or `PSparseMatrix`). `T` is the local array type, `N` its dimensionality
+(1 for vectors, 2 for matrices), and `B` the assembly strategy.
+
+Constructed automatically by `SparseMatrixAssembler` — users rarely need to build
+this directly.
+"""
+struct DistributedArrayBuilder{T,N,B}
+  local_array_type::Type{T}
+  strategy::B
+  function DistributedArrayBuilder(
+    local_array_type::Type{T},
+    strategy::AssemblyStrategy
+  ) where T
+    N = ndims(T)
+    B = typeof(strategy)
+    new{T,N,B}(local_array_type,strategy)
+  end
+end
+
+const PVectorBuilder{T,B} = DistributedArrayBuilder{T,1,B}
+const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T,2,B}
+
+function Algebra.get_array_type(::PSparseMatrixBuilder{Tm}) where Tm
+  T = eltype(Tm)
+  return PSparseMatrix{T}
+end
+
+function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
+  T = eltype(Tv)
+  return PVector{T}
+end
+
+# Distributed counters and allocators
+
+"""
+    DistributedCounter{S,T,N} <: GridapType
+
+Distributed N-dimensional counter, with local counters of type T.
+Follows assembly strategy S.
+"""
+struct DistributedCounter{S,T,N,A,B} <: GridapType
+  counters :: A
+  axes     :: B
+  strategy :: S
+  function DistributedCounter(
+    counters :: AbstractArray{T},
+    axes     :: NTuple{N,<:PRange},
+    strategy :: AssemblyStrategy
+  ) where {T,N}
+    A, B, S = typeof(counters), typeof(axes), typeof(strategy)
+    new{S,T,N,A,B}(counters,axes,strategy)
+  end
+end
+
+Base.axes(a::DistributedCounter) = a.axes
+Base.axes(a::DistributedCounter,d::Integer) = a.axes[d]
+local_views(a::DistributedCounter) = a.counters
+
+function local_views(a::DistributedCounter,axes::PRange...)
+  @check all(map(PArrays.matching_local_indices,axes,a.axes))
+  return a.counters
+end
+
+const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B}
+Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
+
+const PSparseMatrixCounter{S,T,A,B} = DistributedCounter{S,T,2,A,B}
+Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop()
+
+"""
+    DistributedAllocation{S,T,N} <: GridapType
+
+Distributed N-dimensional allocator, with local allocators of type T. 
+Follows assembly strategy S.
+"""
+struct DistributedAllocation{S,T,N,A,B} <: GridapType
+  allocs   :: A
+  axes     :: B
+  strategy :: S
+  function DistributedAllocation(
+    allocs   :: AbstractArray{T},
+    axes     :: NTuple{N,<:PRange},
+    strategy :: AssemblyStrategy
+  ) where {T,N}
+    A, B, S = typeof(allocs), typeof(axes), typeof(strategy)
+    new{S,T,N,A,B}(allocs,axes,strategy)
+  end
+end
+
+Base.axes(a::DistributedAllocation) = a.axes
+Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d]
+local_views(a::DistributedAllocation) = a.allocs
+
+function local_views(a::DistributedAllocation,axes::PRange...)
+  @check all(map(PArrays.matching_local_indices,axes,a.axes))
+  return a.allocs
+end
+
+function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N}
+  indices = map(partition,axes)
+  local_axes = map(indices...) do indices...
+    map(ids -> Base.OneTo(local_length(ids)), indices)
+  end
+  allocs = map(change_axes,a.allocs,local_axes)
+  DistributedAllocation(allocs,axes,a.strategy)
+end
+
+const PVectorAllocation{S,T} = DistributedAllocation{S,T,1}
+const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2}
+
+function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO}) where S
+  I,J,V = map(local_views(a)) do alloc
+    alloc.I, alloc.J, alloc.V
+  end |> tuple_of_arrays
+  return I,J,V
+end
+
+function collect_touched_ids(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S
+  touched_ids = map(local_views(a),partition(axes(a,1))) do a, ids
+    n_global = global_length(ids)
+    rows = remove_ghost(unpermute(ids))
+
+    ghost_lids = ghost_to_local(ids)
+    touched_ghost_lids = collect(Int,filter(lid -> a.touched[lid], ghost_lids))
+    touched_ghost_owners = local_to_owner(ids)[touched_ghost_lids]
+    touched_ghost_gids = to_global!(touched_ghost_lids, ids)
+    ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners)
+    replace_ghost(rows,ghost)
+  end
+  return touched_ids
+end
+
+# PSparseMatrix assembly chain
+#
+#   1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter
+#   2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation
+#   3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix
+
+function Algebra.nz_counter(builder::PSparseMatrixBuilder{MT},axs::NTuple{2,<:PRange}) where MT
+  rows, cols = axs # test ids, trial ids
+  counters = map(partition(rows),partition(cols)) do rows, cols
+    local_axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols)))
+    Algebra.CounterCOO{MT}(local_axs)
+  end
+  DistributedCounter(counters,axs,builder.strategy)
+end
+
+function Algebra.nz_allocation(a::PSparseMatrixCounter)
+  allocs = map(nz_allocation,local_views(a))
+  DistributedAllocation(allocs,a.axes,a.strategy)
+end
+
+function Algebra.create_from_nz(a::PSparseMatrix)
+  assemble!(a) |> wait
+  return a
+end
+
+function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled})
+  A, = create_from_nz_locally_assembled(a)
+  return A
+end
+
+function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled})
+  A, = create_from_nz_assembled(a)
+  return A
+end
+
+function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:SubAssembled})
+  A, = create_from_nz_subassembled(a)
+  return A
+end
+
+# PVector assembly chain:
+#
+#   1 - nz_counter(PVectorBuilder) -> PVectorCounter
+#   2 - nz_allocation(PVectorCounter) -> PVectorAllocation
+#   3 - create_from_nz(PVectorAllocation) -> PVector
+
+function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::NTuple{1,<:PRange}) where VT
+  rows, = axs
+  counters = map(partition(rows)) do rows
+    axs = (Base.OneTo(local_length(rows)),)
+    nz_counter(ArrayBuilder(VT),axs)
+  end
+  DistributedCounter(counters,(rows,),builder.strategy)
+end
+
+function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}})
+  allocs = map(nz_allocation,local_views(a))
+  DistributedAllocation(allocs,a.axes,a.strategy)
+end
+
+function Arrays.nz_allocation(a::PVectorCounter{<:Assembled})
+  values = map(nz_allocation,local_views(a))
+  allocs = map(TrackedArrayAllocation,values)
+  return DistributedAllocation(allocs,a.axes,a.strategy)
+end
+
+function rhs_callback(b::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}}, rows)
+  b_fespace = PVector(local_views(b), partition(axes(b,1)))
+  locally_repartition(b_fespace, rows)
+end
+
+function rhs_callback(b::PVectorAllocation{<:Assembled}, rows)
+  new_indices = collect_touched_ids(b)
+  values = map(ai -> ai.values, local_views(b))
+  b_fespace = PVector(values, partition(axes(b,1)))
+  locally_repartition(b_fespace, new_indices)
+end
+
+function Algebra.create_from_nz(a::PVector)
+  assemble!(a) |> wait
+  return a
+end
+
+function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVector})
+  # This point MUST NEVER be reached. If reached there is an inconsistency
+  # in the parallel code in charge of vector assembly
+  @unreachable
+end
+
+function Algebra.create_from_nz(a::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}})
+  rows = map(remove_ghost, map(unpermute, partition(axes(a,1))))
+  return rhs_callback(a, rows)
+end
+
+function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled})
+  rows = collect_touched_ids(a)
+  b = rhs_callback(a, rows)
+  t2 = assemble!(b)
+  !isnothing(t2) && wait(t2)
+  return b
+end
+
+# PSystem assembly chain:
+#
+# When assembling a full system (matrix + vector), it is more efficient to 
+# overlap communications the assembly of the matrix and the vector.
+# Not only it is faster, but also necessary to ensure identical ghost indices 
+# in both the matrix and vector rows.
+# This is done by using the following specializations:
+
+function Arrays.nz_allocation(
+  a::PSparseMatrixCounter, b::PVectorCounter
+)
+  return nz_allocation(a), nz_allocation(b)
+end
+
+function Algebra.create_from_nz(
+  a::PSparseMatrixAllocation{<:LocallyAssembled},
+  b::PVectorAllocation{<:LocallyAssembled}
+)
+  A, B = create_from_nz_locally_assembled(a, Base.Fix1(rhs_callback,b))
+  return A, B
+end
+
+function Algebra.create_from_nz(
+  a::PSparseMatrixAllocation{<:Assembled},
+  b::PVectorAllocation{<:Assembled}
+)
+  A, B = create_from_nz_assembled(a, Base.Fix1(rhs_callback,b), assemble!)
+  return A, B
+end
+
+function Algebra.create_from_nz(
+  a::PSparseMatrixAllocation{<:SubAssembled},
+  b::PVectorAllocation{<:SubAssembled}
+)
+  A, B = create_from_nz_subassembled(a, Base.Fix1(rhs_callback,b))
+  return A, B
+end
+
+# Low-level assembly methods
+
+function create_from_nz_locally_assembled(
+  a,
+  callback::Function = rows -> nothing
+)
+  I,J,V = get_allocations(a)
+  test_ids = partition(axes(a,1))
+  trial_ids = partition(axes(a,2))
+
+  rows = map(remove_ghost,map(unpermute,test_ids))
+  b = callback(rows)
+
+  foreach(map_local_to_global!,I,test_ids)
+  foreach(map_local_to_global!,J,trial_ids)
+
+  cols = map(unpermute,trial_ids)
+
+  foreach(map_global_to_local!,I,rows)
+  foreach(map_global_to_local!,J,cols)
+
+  assembled = true
+  a_sys = change_axes(a,(PRange(rows),PRange(cols)))
+  values = map(create_from_nz,local_views(a_sys))
+  A = PSparseMatrix(values,rows,cols,assembled)
+
+  return A, b
+end
+
+function create_from_nz_assembled(
+  a,
+  callback::Function = rows -> nothing,
+  async_callback::Function = b -> empty_async_task
+)
+  I,J,V = get_allocations(a)
+  test_ids = partition(axes(a,1))
+  trial_ids = partition(axes(a,2))
+
+  # convert I and J to global dof ids
+  foreach(map_local_to_global!,I,test_ids)
+  foreach(map_local_to_global!,J,trial_ids)
+
+  # Overlapped COO communication and vector assembly
+  rows = filter_and_replace_ghost(map(unpermute,test_ids),I)
+  t = PArrays.assemble_coo!(I,J,V,rows)
+  b = callback(rows)
+  wait(t)
+
+  # Overlap rhs communications with CSC compression
+  t2 = async_callback(b)
+  cols = filter_and_replace_ghost(map(unpermute,trial_ids),J)
+
+  foreach(map_global_to_local!,I,rows)
+  foreach(map_global_to_local!,J,cols)
+
+  assembled = true
+  a_sys = change_axes(a,(PRange(rows),PRange(cols)))
+  values = map(create_from_nz,local_views(a_sys))
+  A = PSparseMatrix(values,rows,cols,assembled)
+
+  wait(t2)
+  return A, b
+end
+
+function create_from_nz_subassembled(
+  a,
+  callback::Function = rows -> nothing,
+)
+  rows = partition(axes(a,1))
+  cols = partition(axes(a,2))
+
+  b = callback(rows)
+
+  assembled = false
+  values = map(create_from_nz,local_views(a))
+  A = PSparseMatrix(values,rows,cols,assembled)
+
+  return A, b
+end
+
+# Block matrix assembly (Assembled strategy)
+#
+# Gridap's generic create_from_nz(a::ArrayBlock) calls create_from_nz per block
+# independently, producing different row/col PRanges per block → mortar check fails.
+# These dispatches intercept the block case and compute SHARED PRanges by unioning
+# the nonzero indices across all blocks in each block-row and block-col.
+
+function rhs_callback(b::VectorBlock{<:PVectorAllocation{<:Assembled}}, rows_da)
+  b_vecs = [rhs_callback(b.array[i], rows_da[i]) for i in 1:length(rows_da)]
+  mortar(b_vecs)
+end
+
+function Algebra.create_from_nz(a::MatrixBlock{<:PSparseMatrixAllocation{<:Assembled}})
+  A, = create_from_nz_block_assembled(a)
+  return A
+end
+
+function Algebra.create_from_nz(
+  a::MatrixBlock{<:PSparseMatrixAllocation{<:Assembled}},
+  b::VectorBlock{<:PVectorAllocation{<:Assembled}}
+)
+  callback = Base.Fix1(rhs_callback,b)
+  async_callback = assemble!
+  A, B = create_from_nz_block_assembled(a, callback, async_callback)
+  return A, B
+end
+
+function create_from_nz_block_assembled(
+  a              :: MatrixBlock{<:PSparseMatrixAllocation{<:Assembled}},
+  callback       :: Function = rows_da -> nothing,
+  async_callback :: Function = b_vecs  -> [empty_async_task]
+)
+  NBr, NBc  = size(a.array)
+  block_ids = CartesianIndices(a.array)
+
+  I, J, V = map(get_allocations,a.array) |> tuple_of_arrays
+
+  test_ids  = [partition(axes(a.array[i,i],1)) for i in 1:NBr]
+  trial_ids = [partition(axes(a.array[j,j],2)) for j in 1:NBc]
+
+  for i in 1:NBr, j in 1:NBc
+    foreach(map_local_to_global!,I[i,j],test_ids[i])
+    foreach(map_local_to_global!,J[i,j],trial_ids[j])
+  end
+
+  # Shared row partition per block-row: union I[i,:] across all columns
+  rows_da = [filter_and_replace_ghost(I[i,:],test_ids[i]) for i in 1:NBr]
+
+  # COO communication overlapped with callback (e.g. RHS local repartition)
+  ts = [
+    PArrays.assemble_coo!(I[i,j],J[i,j],V[i,j],rows_da[i]) for i in 1:NBr, j in 1:NBc
+  ]
+  b = callback(rows_da)
+  foreach(wait,ts)
+
+  # Kick off async RHS communication, overlapped with column partition computation
+  t2s = async_callback(b)
+
+  # Shared col partition per block-col: union J[:,j] across all rows
+  cols_da = [filter_and_replace_ghost(J[:,j],trial_ids[j]) for j in 1:NBc]
+
+  foreach(wait,t2s)
+
+  for i in 1:NBr, j in 1:NBc
+    foreach(map_global_to_local!,I[i,j],rows_da[i])
+    foreach(map_global_to_local!,J[i,j],cols_da[j])
+  end
+
+  # Build each block's PSparseMatrix using the shared row/col partitions.
+  # All blocks in block-row i store the same rows_da[i] object, so
+  # partition(axes(A,1)) === rows_da[i] for all j → mortar === check passes.
+  assembled = true
+  block_mats = map(block_ids) do idx
+    i,j = idx[1],idx[2]
+    a_sys  = change_axes(a.array[idx],(PRange(rows_da[i]),PRange(cols_da[j])))
+    values = map(create_from_nz,local_views(a_sys))
+    PSparseMatrix(values,rows_da[i],cols_da[j],assembled)
+  end
+
+  return mortar(block_mats), b
+end
diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl
index eb918d63..8825c81f 100644
--- a/src/BlockPartitionedArrays.jl
+++ b/src/BlockPartitionedArrays.jl
@@ -92,7 +92,9 @@ function BlockPMatrix{V}(::UndefInitializer,rows::BlockPRange,cols::BlockPRange)
   vals = map(block_ids) do I
     r = block_rows[I[1]]
     c = block_cols[I[2]]
-    PSparseMatrix{V}(undef,partition(r),partition(c))
+    psparse(partition(r),partition(c);assembled=true) do row_indices,col_indices
+      PartitionedArrays.allocate_local_values(V,row_indices,col_indices)
+    end
   end
   return BlockPMatrix(vals,rows,cols)
 end
@@ -124,7 +126,12 @@ function Base.similar(a::BlockPMatrix,::Type{T},inds::Tuple{<:BlockPRange,<:Bloc
   vals = map(CartesianIndices(blocksize(a))) do I
     rows = inds[1].ranges[I[1]]
     cols = inds[2].ranges[I[2]]
-    similar(a.blocks[I],T,(rows,cols))
+    new_partition = map(
+      PartitionedArrays.partition(a.blocks[I]),partition(rows),partition(cols)
+    ) do local_mat,row_inds,col_inds
+      PartitionedArrays.allocate_local_values(local_mat,T,row_inds,col_inds)
+    end
+    PSparseMatrix(new_partition,partition(rows),partition(cols),false)
   end
   return BlockPArray(vals,inds)
 end
@@ -134,7 +141,9 @@ function Base.similar(::Type{<:BlockPMatrix{V,T,A}},inds::Tuple{<:BlockPRange,<:
   cols = blocks(inds[2])
   values = map(CartesianIndices((length(rows),length(cols)))) do I
     i,j = I[1],I[2]
-    return similar(A,(rows[i],cols[j]))
+    psparse(partition(rows[i]),partition(cols[j]);assembled=true) do row_inds,col_inds
+      PartitionedArrays.allocate_local_values(V,row_inds,col_inds)
+    end
   end
   return BlockPArray(values,inds)
 end
@@ -179,7 +188,7 @@ function Base.copyto!(y::BlockPMatrix,x::BlockPMatrix)
 end
 
 function Base.fill!(a::BlockPVector,v)
-  map(blocks(a)) do a
+  foreach(blocks(a)) do a
     fill!(a,v)
   end
   return a
@@ -215,8 +224,15 @@ function Base.all(f::Function,x::BlockPVector)
   all(map(xi->all(f,xi),blocks(x)))
 end
 
+function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
+  foreach(blocks(x),blocks(y)) do x,y
+    LinearAlgebra.axpy!(α,x,y)
+  end
+  return y
+end
+
 function LinearAlgebra.rmul!(a::BlockPVector,v::Number)
-  map(ai->rmul!(ai,v),blocks(a))
+  foreach(ai->rmul!(ai,v),blocks(a))
   return a
 end
 
@@ -271,8 +287,8 @@ function PartitionedArrays.partition(a::BlockPArray)
   return map(mortar,vals)
 end
 
-function PartitionedArrays.to_trivial_partition(a::BlockPArray)
-  vals = map(PartitionedArrays.to_trivial_partition,blocks(a))
+function PartitionedArrays.centralize(a::BlockPArray)
+  vals = map(PartitionedArrays.centralize,blocks(a))
   return mortar(vals)
 end
 
@@ -365,12 +381,22 @@ function LinearAlgebra.norm(v::BlockPVector,p::Real=2)
 end
 
 function LinearAlgebra.fillstored!(a::BlockPMatrix,v)
-  map(blocks(a)) do a
+  foreach(blocks(a)) do a
     LinearAlgebra.fillstored!(a,v)
   end
   return a
 end
 
+function Algebra.axpy_entries!(
+  α::Number, A::BlockPMatrix, B::BlockPMatrix;
+  check::Bool=true
+)
+  foreach(blocks(A),blocks(B)) do A, B
+    Algebra.axpy_entries!(α,A,B;check)
+  end
+  return B
+end
+
 # Broadcasting
 
 struct BlockPBroadcasted{A,B}
@@ -423,6 +449,6 @@ function Base.materialize(b::BlockPBroadcasted)
 end
 
 function Base.materialize!(a::BlockPArray,b::BlockPBroadcasted)
-  map(Base.materialize!,blocks(a),blocks(b))
+  foreach(Base.materialize!,blocks(a),blocks(b))
   return a
 end
diff --git a/src/CellData.jl b/src/CellData.jl
index 0c82a000..07b0fe96 100644
--- a/src/CellData.jl
+++ b/src/CellData.jl
@@ -16,6 +16,10 @@ end
 
 # DistributedCellField
 """
+    DistributedCellField{A,B,C} <: CellField
+
+Distributed cell field. Wraps a distributed array of local `CellField` objects
+together with the underlying [`DistributedTriangulation`](@ref). 
 """
 struct DistributedCellField{A,B,C} <: CellField
   fields::A
@@ -189,6 +193,10 @@ end
 
 # Integration related
 """
+    DistributedMeasure{A,B} <: GridapType
+
+Distributed integration measure. Wraps a distributed array of local `Measure`
+objects and the underlying [`DistributedTriangulation`](@ref).
 """
 struct DistributedMeasure{A<:AbstractArray{<:Measure},B<:DistributedTriangulation} <: GridapType
   measures::A
@@ -216,6 +224,10 @@ function CellData.get_cell_points(a::DistributedMeasure)
 end
 
 """
+    DistributedDomainContribution{A} <: GridapType
+
+The result of a distributed integral `∫(f)dΩ`. Wraps a distributed array of local
+`DomainContribution` objects.
 """
 struct DistributedDomainContribution{A<:AbstractArray{<:DomainContribution}} <: GridapType
   contribs::A
@@ -502,9 +514,11 @@ function DiracDelta{0}(model::DistributedDiscreteModel;tags)
   degree = 0
   DiracDelta{0}(model,degree;tags=tags)
 end
+
 function (d::DistributedDiracDelta)(f)
- evaluate(d,f)
+  evaluate(d,f)
 end
+
 function Gridap.Arrays.evaluate!(cache,d::DistributedDiracDelta,f)
- ∫(f)*d.dΓ
+  ∫(f)*d.dΓ
 end
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 29465f2c..79557eb9 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -1,6 +1,16 @@
 #
 # Generic FE space related methods
 
+"""
+    DistributedFESpace <: FESpace
+
+Abstract supertype for all distributed finite element spaces. Subtypes wrap a
+distributed array of local `FESpace` objects (one per rank) and a `PRange` of global
+DOF indices.
+
+The standard concrete implementations are [`DistributedSingleFieldFESpace`](@ref) and
+[`DistributedMultiFieldFESpace`](@ref).
+"""
 abstract type DistributedFESpace <: FESpace end
 
 function FESpaces.get_vector_type(fs::DistributedFESpace)
@@ -67,200 +77,9 @@ function FESpaces.gather_free_and_dirichlet_values(f::DistributedFESpace,cell_va
   return free_values, dirichlet_values
 end
 
-function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_prange)
-  map(cell_wise_vector,
-          dof_wise_vector,
-          cell_to_ldofs,
-          partition(cell_prange)) do cwv,dwv,cell_to_ldofs,indices
-    cache  = array_cache(cell_to_ldofs)
-    ncells = length(cell_to_ldofs)
-    ptrs = cwv.ptrs
-    data = cwv.data
-    cell_own_to_local = own_to_local(indices)
-    for cell in cell_own_to_local
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      p = ptrs[cell]-1
-      for (i,ldof) in enumerate(ldofs)
-        if ldof > 0
-          data[i+p] = dwv[ldof]
-        end
-      end
-    end
-  end
-end
-
-function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_range)
-  map(dof_wise_vector,
-      cell_wise_vector,
-      cell_to_ldofs,
-      partition(cell_range)) do dwv,cwv,cell_to_ldofs,indices
-    cache = array_cache(cell_to_ldofs)
-    cell_ghost_to_local = ghost_to_local(indices)
-    for cell in cell_ghost_to_local
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      p = cwv.ptrs[cell]-1
-      for (i,ldof) in enumerate(ldofs)
-        if ldof > 0
-          dwv[ldof] = cwv.data[i+p]
-        end
-      end
-    end
-  end
-end
-
-function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_prange)
-  cwv = map(cell_to_ldofs) do cell_to_ldofs
-    cache = array_cache(cell_to_ldofs)
-    ncells = length(cell_to_ldofs)
-    ptrs = Vector{Int32}(undef,ncells+1)
-    for cell in 1:ncells
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      ptrs[cell+1] = length(ldofs)
-    end
-    PArrays.length_to_ptrs!(ptrs)
-    ndata = ptrs[end]-1
-    data = Vector{Int}(undef,ndata)
-    data .= -1
-    JaggedArray(data,ptrs)
-  end
-  dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_prange)
-  cwv
-end
-
-function fetch_vector_ghost_values_cache(vector_partition,partition)
-  cache = PArrays.p_vector_cache(vector_partition,partition)
-  map(reverse,cache)
-end
-
-function fetch_vector_ghost_values!(vector_partition,cache)
-  assemble!((a,b)->b, vector_partition, cache) 
-end
-
-function generate_gids(
-  cell_range::PRange,
-  cell_to_ldofs::AbstractArray{<:AbstractArray},
-  nldofs::AbstractArray{<:Integer})
-
-  # Find and count number owned dofs
-  ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs
-    ldof_to_owner = fill(Int32(0),nldofs)
-    cache = array_cache(cell_to_ldofs)
-    lcell_to_owner = local_to_owner(indices)
-    for cell in 1:length(cell_to_ldofs)
-      owner = lcell_to_owner[cell]
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      for ldof in ldofs
-        if ldof>0
-          # TODO this simple approach concentrates dofs
-          # in the last part and creates imbalances
-          ldof_to_owner[ldof] = max(owner,ldof_to_owner[ldof])
-        end
-      end
-    end
-    me = part_id(indices) 
-    nodofs = count(p->p==me,ldof_to_owner)
-    ldof_to_owner, nodofs
-  end |> tuple_of_arrays
-
-  cell_ldofs_to_part = dof_wise_to_cell_wise(ldof_to_owner,
-                                              cell_to_ldofs,
-                                              cell_range)
-
-  # Note1 : this call potentially updates cell_prange with the 
-  #         info required to exchange info among nearest neighbours
-  #         so that it can be re-used in the future for other exchanges
-
-  # Note2 : we need to call reverse() as the senders and receivers are 
-  #         swapped in the AssemblyCache of partition(cell_range)
-
-  # Exchange the dof owners
-  cache_fetch=fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range))
-  fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait
-  
-  cell_wise_to_dof_wise!(ldof_to_owner,
-                         cell_ldofs_to_part,
-                         cell_to_ldofs,
-                         cell_range)
-
-  # Find the global range of owned dofs
-  first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs)))
-  
-  # Distribute gdofs to owned ones
-  ldof_to_gdof = map(first_gdof,ldof_to_owner,partition(cell_range)) do first_gdof,ldof_to_owner,indices
-    me = part_id(indices)
-    offset = first_gdof-1
-    ldof_to_gdof = Vector{Int}(undef,length(ldof_to_owner))
-    odof = 0
-    for (ldof,owner) in enumerate(ldof_to_owner)
-      if owner == me
-        odof += 1
-        ldof_to_gdof[ldof] = odof
-      else
-        ldof_to_gdof[ldof] = 0
-      end
-    end
-    for (ldof,owner) in enumerate(ldof_to_owner)
-      if owner == me
-        ldof_to_gdof[ldof] += offset
-      end
-    end
-    ldof_to_gdof
-  end
-
-  # Create cell-wise global dofs
-  cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof,
-                                        cell_to_ldofs,
-                                        cell_range)
-
-  # Exchange the global dofs
-  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
-
-  # Distribute global dof ids also to ghost
-  map(cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices
-    gdof = 0
-    cache = array_cache(cell_to_ldofs)
-    cell_ghost_to_local = ghost_to_local(indices)
-    cell_local_to_owner = local_to_owner(indices)
-    for cell in cell_ghost_to_local
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      p = cell_to_gdofs.ptrs[cell]-1
-      for (i,ldof) in enumerate(ldofs)
-        if ldof > 0 && ldof_to_owner[ldof] == cell_local_to_owner[cell]
-          ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
-        end
-      end
-    end
-  end
-
-  dof_wise_to_cell_wise!(cell_to_gdofs,
-                         ldof_to_gdof,
-                         cell_to_ldofs,
-                         cell_range)
-
-  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
-
-  cell_wise_to_dof_wise!(ldof_to_gdof,
-                         cell_to_gdofs,
-                         cell_to_ldofs,
-                         cell_range)
-
-  # Setup DoFs LocalIndices
-  ngdofs = reduction(+,nodofs,destination=:all,init=zero(eltype(nodofs)))
-  local_indices = map(ngdofs,partition(cell_range),ldof_to_gdof,ldof_to_owner) do ngdofs,
-                                                                                  indices,
-                                                                                  ldof_to_gdof,
-                                                                                  ldof_to_owner
-     me = part_id(indices)
-     LocalIndices(ngdofs,me,ldof_to_gdof,ldof_to_owner)
-  end
-
-  # Setup dof range
-  dofs_range = PRange(local_indices)
-
-  return dofs_range
-end
 
 # FEFunction related
+
 """
 """
 struct DistributedFEFunctionData{T<:AbstractVector} <: GridapType
@@ -277,6 +96,10 @@ end
 
 # Single field related
 """
+    DistributedSingleFieldFESpace{A,B,C,D,E} <: DistributedFESpace
+
+Distributed single-field FE space. Stores a distributed array of local
+`SingleFieldFESpace` objects (`spaces`) and the global DOF `PRange` (`gids`).
 """
 struct DistributedSingleFieldFESpace{A,B,C,D,E} <: DistributedFESpace
   spaces::A
@@ -470,7 +293,7 @@ function FESpaces.interpolate!(
   free_values::AbstractVector,
   f::DistributedSingleFieldFESpace, 
   isconsistent=false)
-  map(f.spaces,local_views(free_values)) do V,vec
+  foreach(f.spaces,local_views(free_values)) do V,vec
     interpolate!(u,vec,V)
   end
   FEFunction(f,free_values,isconsistent)
@@ -479,9 +302,9 @@ end
 function FESpaces.interpolate!(
   u::DistributedCellField,
   free_values::AbstractVector,
-  f::DistributedSingleFieldFESpace, 
+  f::DistributedSingleFieldFESpace,
   isconsistent=false)
-  map(local_views(u),f.spaces,local_views(free_values)) do u, V,vec
+  foreach(local_views(u),f.spaces,local_views(free_values)) do u, V,vec
     interpolate!(u,vec,V)
   end
   FEFunction(f,free_values,isconsistent)
@@ -497,7 +320,7 @@ function FESpaces.interpolate_dirichlet!(
   u, free_values::AbstractVector,
   dirichlet_values::AbstractArray{<:AbstractVector},
   f::DistributedSingleFieldFESpace, isconsistent=false)
-  map(f.spaces,local_views(free_values),dirichlet_values) do V,fvec,dvec
+  foreach(f.spaces,local_views(free_values),dirichlet_values) do V,fvec,dvec
     interpolate_dirichlet!(u,fvec,dvec,V)
   end
   FEFunction(f,free_values,dirichlet_values,isconsistent)
@@ -507,7 +330,7 @@ function FESpaces.interpolate_dirichlet!(
   u::DistributedCellField, free_values::AbstractVector,
   dirichlet_values::AbstractArray{<:AbstractVector},
   f::DistributedSingleFieldFESpace, isconsistent=false)
-  map(local_views(u), f.spaces,local_views(free_values),dirichlet_values) do u,V,fvec,dvec
+  foreach(local_views(u), f.spaces,local_views(free_values),dirichlet_values) do u,V,fvec,dvec
     interpolate_dirichlet!(u,fvec,dvec,V)
   end
   FEFunction(f,free_values,dirichlet_values,isconsistent)
@@ -523,7 +346,7 @@ function FESpaces.interpolate_everywhere!(
   u, free_values::AbstractVector,
   dirichlet_values::AbstractArray{<:AbstractVector},
   f::DistributedSingleFieldFESpace, isconsistent=false)
-  map(f.spaces,local_views(free_values),dirichlet_values) do V,fvec,dvec
+  foreach(f.spaces,local_views(free_values),dirichlet_values) do V,fvec,dvec
     interpolate_everywhere!(u,fvec,dvec,V)
   end
   FEFunction(f,free_values,dirichlet_values,isconsistent)
@@ -533,7 +356,7 @@ function FESpaces.interpolate_everywhere!(
   u::DistributedCellField, free_values::AbstractVector,
   dirichlet_values::AbstractArray{<:AbstractVector},
   f::DistributedSingleFieldFESpace, isconsistent=false)
-  map(local_views(u),f.spaces,local_views(free_values),dirichlet_values) do ui,V,fvec,dvec
+  foreach(local_views(u),f.spaces,local_views(free_values),dirichlet_values) do ui,V,fvec,dvec
     interpolate_everywhere!(ui,fvec,dvec,V)
   end
   FEFunction(f,free_values,dirichlet_values,isconsistent)
@@ -541,32 +364,6 @@ end
 
 # Factories
 
-# function FESpaces.FESpace(
-#   model::DistributedDiscreteModel,reffe;split_own_and_ghost=false,constraint=nothing,kwargs...
-# )
-#   spaces = map(local_views(model)) do m
-#     FESpace(m,reffe;kwargs...)
-#   end
-#   gids =  generate_gids(model,spaces)
-#   trian = DistributedTriangulation(map(get_triangulation,spaces),model)
-#   vector_type = _find_vector_type(spaces,gids;split_own_and_ghost=split_own_and_ghost)
-#   space = DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
-#   return _add_distributed_constraint(space,reffe,constraint)
-# end
-# 
-# function FESpaces.FESpace(
-#   _trian::DistributedTriangulation,reffe;split_own_and_ghost=false,constraint=nothing,kwargs...
-# )
-#   trian = add_ghost_cells(_trian)
-#   spaces = map(local_views(trian)) do t
-#     FESpace(t,reffe;kwargs...)
-#   end
-#   gids = generate_gids(trian,spaces)
-#   vector_type = _find_vector_type(spaces,gids;split_own_and_ghost=split_own_and_ghost)
-#   space = DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
-#   return _add_distributed_constraint(space,reffe,constraint)
-# end
-
 function FESpaces.FESpace(
   model::DistributedDiscreteModel,args...;kwargs...
 )
@@ -674,193 +471,6 @@ function _add_distributed_constraint(F::DistributedFESpace,order::Integer,constr
   end
 end
 
-# Assembly
-
-function FESpaces.collect_cell_matrix(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  a::DistributedDomainContribution)
-  map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a))
-end
-
-function FESpaces.collect_cell_vector(
-  test::DistributedFESpace, a::DistributedDomainContribution)
-  map(collect_cell_vector,local_views(test),local_views(a))
-end
-
-function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  biform::DistributedDomainContribution,
-  liform::DistributedDomainContribution)
-  map(collect_cell_matrix_and_vector,
-    local_views(trial),
-    local_views(test),
-    local_views(biform),
-    local_views(liform))
-end
-
-function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  biform::DistributedDomainContribution,
-  liform::DistributedDomainContribution,
-  uhd)
-  map(collect_cell_matrix_and_vector,
-    local_views(trial),
-    local_views(test),
-    local_views(biform),
-    local_views(liform),
-    local_views(uhd))
-end
-
-function FESpaces.collect_cell_vector(
-  test::DistributedFESpace,l::Number)
-  map(local_views(test)) do s
-    collect_cell_vector(s,l)
-  end
-end
-
-function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  mat::DistributedDomainContribution,
-  l::Number)
-  map(local_views(trial),local_views(test),local_views(mat)) do u,v,m
-    collect_cell_matrix_and_vector(u,v,m,l)
-  end
-end
-
-function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  mat::DistributedDomainContribution,
-  l::Number,
-  uhd)
-  map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f
-    collect_cell_matrix_and_vector(u,v,m,l,f)
-  end
-end
-"""
-"""
-struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
-  strategy::A
-  assems::B
-  matrix_builder::C
-  vector_builder::D
-  test_dofs_gids_prange::E
-  trial_dofs_gids_prange::F
-end
-
-local_views(a::DistributedSparseMatrixAssembler) = a.assems
-
-FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_dofs_gids_prange
-FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_dofs_gids_prange
-FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
-FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
-FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
-
-function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
-end
-
-function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
-end
-
-function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
-  rows = get_rows(a)
-  map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata)
-end
-
-function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
-  rows = get_rows(a)
-  map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata)
-end
-
-function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  Aviews=local_views(A,rows,cols)
-  bviews=local_views(b,rows)
-  aviews=local_views(a)
-  map(symbolic_loop_matrix_and_vector!,Aviews,bviews,aviews,data)  
-end
-
-function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)
-end
-
-# Parallel Assembly strategies
-
-function local_assembly_strategy(::SubAssembledRows,rows,cols)
-  DefaultAssemblyStrategy()
-end
-
-# When using this one, make sure that you also loop over ghost cells.
-# This is at your own risk.
-function local_assembly_strategy(::FullyAssembledRows,rows,cols)
-  rows_local_to_ghost = local_to_ghost(rows)
-  GenericAssemblyStrategy(
-    identity,
-    identity,
-    row->rows_local_to_ghost[row]==0,
-    col->true
-  )
-end
-
-# Assembler high level constructors
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  rows::PRange,
-  cols::PRange,
-  par_strategy=SubAssembledRows()
-)
-  assems = map(partition(rows),partition(cols)) do rows,cols
-    local_strategy = local_assembly_strategy(par_strategy,rows,cols)
-    FESpaces.GenericSparseMatrixAssembler(
-      SparseMatrixBuilder(local_mat_type),
-      ArrayBuilder(local_vec_type),
-      Base.OneTo(length(rows)),
-      Base.OneTo(length(cols)),
-      local_strategy
-    )
-  end
-  mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy)
-  vec_builder = PVectorBuilder(local_vec_type,par_strategy)
-  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy=SubAssembledRows()
-)
-  rows = get_free_dof_ids(test)
-  cols = get_free_dof_ids(trial)
-  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy=SubAssembledRows()
-)
-  Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial)))
-  T  = eltype(Tv)
-  Tm = SparseMatrixCSC{T,Int}
-  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy)
-end
-
 # ZeroMean FESpace
 struct DistributedZeroMeanCache{A,B}
   dvol::A
@@ -901,7 +511,7 @@ function FESpaces.ZeroMeanFESpace(space::DistributedSingleFieldFESpace,dΩ::Dist
     vol  = sum(dvol)
     return vol, dvol
   end |> tuple_of_arrays
-  vol  = reduce(+,_vol,init=zero(eltype(vol)))
+  vol = reduce(+,_vol,init=zero(eltype(_vol)))
   metadata = DistributedZeroMeanCache(dvol,vol)
 
   return DistributedSingleFieldFESpace(
@@ -976,10 +586,26 @@ end
 
 # Constant FESpace
 
+# Creates a partition with 1 global DOF, owned by `owner`, ghost on all other ranks.
+function constant_partition(ranks, owner)
+  indices = map(ranks) do rank
+    if rank == owner
+      own   = OwnIndices(1, rank, Int[1])
+      ghost = GhostIndices(1, Int[], Int32[])
+    else
+      own   = OwnIndices(1, rank, Int[])
+      ghost = GhostIndices(1, Int[1], Int32[owner])
+    end
+    global_to_owner = [rank]
+    return OwnAndGhostIndices(own, ghost, global_to_owner)
+  end
+  return indices
+end
+
 """
     ConstantFESpace(
-      model::DistributedDiscreteModel; 
-      constraint_type=:global, 
+      model::DistributedDiscreteModel;
+      constraint_type=:global,
       kwargs...
     )
 
@@ -1008,16 +634,12 @@ function FESpaces.ConstantFESpace(
 
   # Single dof, owned by processor 1 (ghost for all other processors)
   nranks = length(spaces)
-  cell_gids = get_cell_gids(model)
-  indices = map(partition(cell_gids)) do cell_indices
-    me = part_id(cell_indices)
-    if constraint_type == :global
-      LocalIndices(1,me,Int[1],Int32[1])
-    else
-      LocalIndices(nranks,me,Int[me],Int32[me])
-    end
+  ranks = linear_indices(partition(get_cell_gids(model)))
+  gids = if constraint_type == :global
+    PRange(constant_partition(ranks, 1))
+  else
+    PRange(uniform_partition(ranks, nranks))
   end
-  gids = PRange(indices)
 
   trian = DistributedTriangulation(map(get_triangulation,spaces),model)
   vector_type = _find_vector_type(spaces,gids)
@@ -1044,3 +666,157 @@ function FESpaces.PolytopalFESpace(
   trian = Triangulation(with_ghost,model)
   FESpaces.PolytopalFESpace(trian,args...;kwargs...)
 end
+
+# Assembly
+
+"""
+"""
+struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+  strategy::A
+  assems::B
+  matrix_builder::C
+  vector_builder::D
+  test_gids::E
+  trial_gids::F
+end
+
+local_views(a::DistributedSparseMatrixAssembler) = a.assems
+
+FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
+FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
+FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
+FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
+FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  rows::PRange,
+  cols::PRange,
+  par_strategy=Assembled()
+)
+  assems = map(partition(rows),partition(cols)) do rows,cols
+    FESpaces.GenericSparseMatrixAssembler(
+      SparseMatrixBuilder(local_mat_type), 
+      ArrayBuilder(local_vec_type),
+      Base.OneTo(length(rows)),
+      Base.OneTo(length(cols)),
+      local_assembly_strategy(par_strategy,rows,cols)
+    )
+  end
+  mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy)
+  vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy)
+  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  rows = get_free_dof_ids(test)
+  cols = get_free_dof_ids(trial)
+  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  Tv = getany(map(get_vector_type,local_views(trial)))
+  Tm = SparseMatrixCSC{eltype(Tv),Int}
+  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...)
+end
+
+function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
+  rows, cols = get_rows(a), get_cols(a)
+  map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+end
+
+function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
+  rows, cols = get_rows(a), get_cols(a)
+  map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+end
+
+function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
+  rows = get_rows(a)
+  map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata)
+end
+
+function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
+  rows = get_rows(a)
+  map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata)
+end
+
+function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
+  rows, cols = get_rows(a), get_cols(a)
+  map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)  
+end
+
+function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
+  rows, cols = get_rows(a), get_cols(a)
+  map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)
+end
+
+function FESpaces.collect_cell_matrix(
+  trial::DistributedFESpace, test::DistributedFESpace, matdata::DistributedDomainContribution
+)
+  map(collect_cell_matrix,local_views(trial),local_views(test),local_views(matdata))
+end
+
+function FESpaces.collect_cell_vector(
+  test::DistributedFESpace, vecdata::DistributedDomainContribution
+)
+  map(collect_cell_vector,local_views(test),local_views(vecdata))
+end
+
+function FESpaces.collect_cell_matrix_and_vector(
+  trial::DistributedFESpace, test::DistributedFESpace,
+  matdata::DistributedDomainContribution, vecdata::DistributedDomainContribution
+)
+  map(collect_cell_matrix_and_vector,
+    local_views(trial), local_views(test),
+    local_views(matdata), local_views(vecdata)
+  )
+end
+
+function FESpaces.collect_cell_matrix_and_vector(
+  trial::DistributedFESpace, test::DistributedFESpace,
+  matdata::DistributedDomainContribution, vecdata::DistributedDomainContribution, uhd
+)
+  map(collect_cell_matrix_and_vector,
+    local_views(trial), local_views(test),
+    local_views(matdata), local_views(vecdata), local_views(uhd)
+  )
+end
+
+function FESpaces.collect_cell_vector(
+  test::DistributedFESpace, l::Number
+)
+  map(local_views(test)) do s
+    collect_cell_vector(s,l)
+  end
+end
+
+function FESpaces.collect_cell_matrix_and_vector(
+  trial::DistributedFESpace, test::DistributedFESpace,
+  matdata::DistributedDomainContribution, l::Number
+)
+  map(local_views(trial),local_views(test),local_views(matdata)) do u,v,m
+    collect_cell_matrix_and_vector(u,v,m,l)
+  end
+end
+
+function FESpaces.collect_cell_matrix_and_vector(
+  trial::DistributedFESpace, test::DistributedFESpace,
+  matdata::DistributedDomainContribution, l::Number, uhd
+)
+  map(local_views(trial),local_views(test),local_views(matdata),local_views(uhd)) do u,v,m,f
+    collect_cell_matrix_and_vector(u,v,m,l,f)
+  end
+end
diff --git a/src/Geometry.jl b/src/Geometry.jl
index 5ef8d06f..13b1185f 100644
--- a/src/Geometry.jl
+++ b/src/Geometry.jl
@@ -2,12 +2,33 @@
 struct WithGhost end
 struct NoGhost end
 
+"""
+    with_ghost
+
+Sentinel value passed to triangulation and FE-function accessors to request that
+ghost (off-processor) entries be included in the returned local array.
+
+See also [`no_ghost`](@ref).
+"""
 const with_ghost = WithGhost()
+
+"""
+    no_ghost
+
+Sentinel value passed to triangulation and FE-function accessors to request that
+only owned (on-processor) entries be included in the returned local array.
+
+See also [`with_ghost`](@ref).
+"""
 const no_ghost = NoGhost()
 
 # We do not inherit from Grid on purpose.
 # This object cannot implement the Grid interface in a strict sense
 """
+    DistributedGrid{Dc,Dp,A} <: GridapType
+
+Distributed counterpart of Gridap's `Grid`. Wraps a distributed array of local
+`Grid` objects.
 """
 struct DistributedGrid{Dc,Dp,A} <: GridapType
   grids::A
@@ -37,6 +58,11 @@ Geometry.num_point_dims(::Type{<:DistributedGrid{Dc,Dp}}) where {Dc,Dp} = Dp
 # We do not inherit from GridTopology on purpose.
 # This object cannot implement the GridTopology interface in a strict sense
 """
+    DistributedGridTopology{Dc,Dp,A,B} <: GridapType
+
+Distributed counterpart of Gridap's `GridTopology`. Stores a distributed array of
+local topologies together with distributed index ranges for each
+face dimension.
 """
 struct DistributedGridTopology{Dc,Dp,A,B} <: GridapType
   topos::A
@@ -122,7 +148,7 @@ function _setup_consistent_faces!(topo::DistributedGridTopology, dimfrom::Intege
     JaggedArray(lfrom_to_lto.data, lfrom_to_lto.ptrs)
   end
   wait(consistent!(PVector(lfrom_to_gto, gids_from)))
-  map(lfrom_to_gto, gids_to) do lfrom_to_gto, gids_to
+  foreach(lfrom_to_gto, gids_to) do lfrom_to_gto, gids_to
     to_local!(lfrom_to_gto.data, gids_to)
   end
   return nothing
@@ -169,6 +195,11 @@ function Geometry.get_isboundary_face(topo::DistributedGridTopology, d::Integer)
 end
 
 """
+    DistributedFaceLabeling{A} <: GridapType
+
+Distributed counterpart of Gridap's `FaceLabeling`. Wraps a distributed array of
+local `FaceLabeling` objects. Tags are applied locally on each rank and do NOT need to be 
+consistent; consistency across must be ensured explicitly by the user.
 """
 struct DistributedFaceLabeling{A<:AbstractArray{<:FaceLabeling}}
   labels::A
@@ -215,13 +246,27 @@ end
 # This object cannot implement the DiscreteModel interface in a strict sense
 
 """
+    DistributedDiscreteModel{Dc,Dp} <: GridapType
+
+Abstract type for distributed mesh models. Each MPI rank holds a local portion of the
+mesh (owned cells + ghost cells).
 """
 abstract type DistributedDiscreteModel{Dc,Dp} <: GridapType end
 
+"""
+    get_cell_gids(model::DistributedDiscreteModel) -> PRange
+
+Return the `PRange` of global cell indices for the distributed model.
+"""
 function get_cell_gids(model::DistributedDiscreteModel{Dc}) where Dc
   @abstractmethod
 end
 
+"""
+    get_face_gids(model::DistributedDiscreteModel, dim::Integer) -> PRange
+
+Return the `PRange` of global `dim`-face indices for the distributed model.
+"""
 function get_face_gids(model::DistributedDiscreteModel,dim::Integer)
   @abstractmethod
 end
@@ -268,6 +313,11 @@ function Geometry.get_face_labeling(model::DistributedDiscreteModel)
 end
 
 """
+    GenericDistributedDiscreteModel{Dc,Dp,A,B,C} <: DistributedDiscreteModel{Dc,Dp}
+
+Standard concrete implementation of [`DistributedDiscreteModel`](@ref). Stores a
+distributed array of local `DiscreteModel` objects (`models`) and a vector of
+`PRange` objects for each face dimension (`face_gids`).
 """
 struct GenericDistributedDiscreteModel{Dc,Dp,A,B,C} <: DistributedDiscreteModel{Dc,Dp}
   models::A
@@ -609,6 +659,13 @@ end
 # We do not inherit from Triangulation on purpose.
 # This object cannot implement the Triangulation interface in a strict sense
 """
+    DistributedTriangulation{Dc,Dp,A,B,C} <: GridapType
+
+Distributed counterpart of Gridap's `Triangulation`. Wraps a distributed array of
+local triangulations (one per rank).
+
+Constructed from a `DistributedDiscreteModel` using the standard Gridap constructors:
+`Triangulation(model)`, `Boundary(model, tags=...)`, etc.
 """
 struct DistributedTriangulation{Dc,Dp,A,B,C} <: GridapType
   trians  ::A
@@ -795,12 +852,12 @@ end
 # Filtering cells
 
 @inline function filter_cells_when_needed(
-  portion::Union{WithGhost,FullyAssembledRows},cell_gids,trian)
+  portion::Union{WithGhost,LocallyAssembled},cell_gids,trian)
   return trian
 end
 
 @inline function filter_cells_when_needed(
-  portion::Union{NoGhost,SubAssembledRows},cell_gids,trian)
+  portion::Union{NoGhost,Assembled},cell_gids,trian)
   return remove_ghost_cells(trian,cell_gids)
 end
 
@@ -957,12 +1014,12 @@ function generate_cell_gids(dtrian::DistributedTriangulation)
   generate_cell_gids(dmodel,dtrian)
 end
 
-function generate_cell_gids(dmodel::DistributedDiscreteModel{Dm},
-                            dtrian::DistributedTriangulation{Dt}) where {Dm,Dt}
-
+function generate_cell_gids(
+  dmodel::DistributedDiscreteModel{Dm},
+  dtrian::DistributedTriangulation{Dt}
+) where {Dm,Dt}
   mgids = get_face_gids(dmodel,Dt)
-  covers_all_faces = _covers_all_faces(dmodel,dtrian)
-  if (covers_all_faces)
+  if _covers_all_faces(dmodel,dtrian)
     tgids = mgids
   else
     tcell_to_mcell = map(local_views(dtrian)) do trian
@@ -974,44 +1031,3 @@ function generate_cell_gids(dmodel::DistributedDiscreteModel{Dm},
   end
   return tgids
 end
-
-function restrict_gids(gids::PRange, new_to_old_lid::AbstractArray)
-
-  n_own = map(partition(gids), new_to_old_lid) do ids, n2o_lid
-    rank = part_id(ids)
-    return count(isequal(rank), view(local_to_owner(ids), n2o_lid)) 
-  end
-
-  # Assign global ids to owned lids
-  first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
-  
-  old_lid_to_new_gid = map(first_gid,new_to_old_lid,partition(gids)) do first_gid, n2o_lid, ids
-    old_lid_to_new_gid = zeros(Int,local_length(ids))
-    old_lid_to_owner = local_to_owner(ids)
-    rank = part_id(ids)
-    gid = first_gid
-    for old in n2o_lid
-      if old_lid_to_owner[old] == rank
-        old_lid_to_new_gid[old] = gid
-        gid += 1
-      end
-    end
-    return old_lid_to_new_gid
-  end
-
-  consistent!(PVector(old_lid_to_new_gid,partition(gids))) |> wait
-
-  # Prepare new partition
-  n_gids = reduction(+,n_own,destination=:all,init=zero(eltype(n_own)))
-
-  new_indices = map(
-    n_gids, old_lid_to_new_gid, new_to_old_lid, partition(gids)
-  ) do n_gids, old_lid_to_new_gid, new_to_old_lid, ids
-    lid_to_gid = old_lid_to_new_gid[new_to_old_lid]
-    lid_to_owner  = local_to_owner(ids)[new_to_old_lid]
-    return LocalIndices(n_gids,part_id(ids),lid_to_gid,lid_to_owner)
-  end
-  _find_neighbours!(new_indices, partition(gids))
-
-  return PRange(new_indices)
-end
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index da799997..d3312203 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -33,8 +33,9 @@ import Base: inv, abs, abs2, *, +, -, /, adjoint, transpose, real, imag, conj, g
 import Gridap.Fields: grad2curl
 import Gridap.CellData: Interpolable
 
-export FullyAssembledRows
-export SubAssembledRows
+export LocallyAssembled
+export SubAssembled
+export Assembled
 
 export get_cell_gids
 export get_face_gids
@@ -48,8 +49,14 @@ include("PArraysExtras.jl")
 
 include("BlockPartitionedArrays.jl")
 
+include("GridapExtras.jl")
+
 include("Algebra.jl")
 
+include("Assembly.jl")
+
+include("Partitions.jl")
+
 include("Geometry.jl")
 
 include("CellData.jl")
diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl
new file mode 100644
index 00000000..ba0bf326
--- /dev/null
+++ b/src/GridapExtras.jl
@@ -0,0 +1,59 @@
+
+# New type of Vector allocation
+
+"""
+    TrackedArrayAllocation{T} <: GridapType
+
+Array that keeps track of which entries have been touched. For assembly purposes.
+"""
+struct TrackedArrayAllocation{T}
+  values  :: Vector{T}
+  touched :: Vector{Bool}
+end
+
+function TrackedArrayAllocation(values)
+  touched = fill(false,length(values))
+  TrackedArrayAllocation(values,touched)
+end
+
+Algebra.LoopStyle(::Type{<:TrackedArrayAllocation}) = Algebra.Loop()
+Algebra.create_from_nz(a::TrackedArrayAllocation) = a.values
+
+@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i)
+  if i > 0
+    a.touched[i] = true
+  end
+  nothing
+end
+@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v,i)
+  if i > 0
+    a.touched[i] = true
+    a.values[i] = combine(v,a.values[i])
+  end
+  nothing
+end
+@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i)
+  for ie in i
+    Arrays.add_entry!(combine,a,nothing,ie)
+  end
+  nothing
+end
+@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v,i)
+  for (ve,ie) in zip(v,i)
+    Arrays.add_entry!(combine,a,ve,ie)
+  end
+  nothing
+end
+
+# change_axes
+
+function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A}
+  b = Algebra.CounterCOO{T}(axes)
+  b.nnz = a.nnz
+  b
+end
+
+function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A}
+  counter = change_axes(a.counter,axes)
+  Algebra.AllocationCOO(counter,a.I,a.J,a.V)
+end
diff --git a/src/MacroDiscreteModels.jl b/src/MacroDiscreteModels.jl
index c08392ec..e1cdf48a 100644
--- a/src/MacroDiscreteModels.jl
+++ b/src/MacroDiscreteModels.jl
@@ -49,7 +49,8 @@ end
 # DistributedDiscreteModel API
 
 function get_cell_gids(macro_model::MacroDiscreteModel)
-  return linear_indices(local_views(macro_model.model))
+  ranks = linear_indices(local_views(macro_model.model))
+  return PRange(uniform_partition(ranks,length(ranks)))
 end
 
 # TODO: Reuse macro_model.interface_gids
@@ -74,15 +75,17 @@ function get_local_dimranges(macro_model::MacroDiscreteModel{Dc}) where Dc
 end
 
 """
-  Returns a JaggedArray of JaggedArrays such that we have
+    classify_interfaces(model::DistributedDiscreteModel{Dc};sort_faces=true)
+
+Returns a JaggedArray of JaggedArrays such that we have
 
     [interface dimension][interface lid][face dimension] -> interface dfaces lids
-  
-  I.e we bundle interfaces of the same dimension together (macro-faces, macro-edges, macro-nodes), 
-  and for each interface we return a JaggedArray with the d-faces in the interface.
 
-  If `sort_faces` is true, the faces in each interface are sorted by gid. If false, the faces
-  are by default sorted by lid.
+I.e we bundle interfaces of the same dimension together (macro-faces, macro-edges, macro-nodes), 
+and for each interface we return a JaggedArray with the d-faces in the interface.
+
+If `sort_faces` is true, the faces in each interface are sorted by gid. If false, the faces
+are by default sorted by lid.
 """
 function classify_interfaces(model::DistributedDiscreteModel{Dc};sort_faces=true) where Dc
   ranks = linear_indices(local_views(model))
@@ -166,22 +169,22 @@ function classify_interfaces(model::DistributedDiscreteModel{Dc};sort_faces=true
 end
 
 """
-    function generate_interface_gids(nbors,keys) -> gids
+    generate_interface_gids(nbors,keys) -> gids
 
-  Generates a global numbering for the interfaces, given two input arrays per processor:
+Generates a global numbering for the interfaces, given two input arrays per processor:
 
-    - `nbors`: For each interface, the minimum rank of the neighboring processors.
-    - `keys`: For each interface, the minimum gid of the faces in the interface.
-  
-  Then the gids are assigned in the following way: 
-  We iterate over the processors in ascending order. For each processor, we iterate over the local
-  interfaces. Then: 
-
-    - If the `nbor` processor of an interface has a higher (or equal) rank than the current processor, it means we
-    haven't assigned a gid to that interface yet. So we assign a new gid to the current interface.
-    - If the `nbor` processor of an interface has a lower rank than the current processor, it means we 
-    already assigned a gid to that interface (while iterating over `nbor`). So we look for a matching `key`
-    in the `keys` array of the `nbor` processor, and assign the same gid to the current interface.
+- `nbors`: For each interface, the minimum rank of the neighboring processors.
+- `keys`: For each interface, the minimum gid of the faces in the interface.
+
+Then the gids are assigned in the following way: 
+We iterate over the processors in ascending order. For each processor, we iterate over the local
+interfaces. Then: 
+
+- If the `nbor` processor of an interface has a higher (or equal) rank than the current processor, it means we
+haven't assigned a gid to that interface yet. So we assign a new gid to the current interface.
+- If the `nbor` processor of an interface has a lower rank than the current processor, it means we 
+already assigned a gid to that interface (while iterating over `nbor`). So we look for a matching `key`
+in the `keys` array of the `nbor` processor, and assign the same gid to the current interface.
 """
 function generate_interface_gids(nbors,keys)
   # Gather to main
@@ -222,21 +225,27 @@ function generate_interface_gids(nbors,keys)
   n_glob = emit(_n_glob)
 
   # Create PRange
-  ranks = linear_indices(igids)
-  indices = map(ranks,n_glob,igids,nbors) do rank,n_glob, igids, nbors
-    LocalIndices(n_glob,rank,igids,nbors)
+  n_own = map(linear_indices(nbors), nbors) do rank, nbors
+    count(==(rank), nbors)
   end
-  return PRange(indices)
+  return PRange(permuted_variable_partition(n_own, igids, nbors; n_global=n_glob))
 end
 
 """
-  Given a model and a set of local interfaces for each model, returns
-    - `nbors`: For each interface, the minimum rank of the neighboring processors.
-    - `keys`: For each interface, the minimum gid of the faces in the interface.
+    generate_nbors_and_keys(
+      model::DistributedDiscreteModel{Dc}, d_to_interfaces;
+      is_sorted = false, dimensions = 0:Dc-1
+    )
+
+Given a model and a set of local interfaces for each model, returns
+
+- `nbors`: For each interface, the minimum rank of the neighboring processors.
+- `keys`: For each interface, the minimum gid of the faces in the interface.
 
-  Options: 
-   - `is_sorted`: If true, the keys are assumed to be sorted by gid. This allows some optimization.
-   - `dimensions`: List/Set of interface dimensions to be considered. 
+Options: 
+
+- `is_sorted`: If true, the keys are assumed to be sorted by gid. This allows some optimization.
+- `dimensions`: List/Set of interface dimensions to be considered. 
 """
 function generate_nbors_and_keys(
   model::DistributedDiscreteModel{Dc},
@@ -303,13 +312,16 @@ function generate_nbors_and_keys(
 end
 
 """
-  Returns a global (consistent) face labeling for the macro model. Requires communication.
+    get_global_face_labeling(macro_model::MacroDiscreteModel{Dc})
+
+Returns a global (consistent) face labeling for the macro model. Requires communication.
 
-  The face labeling contains the following tags: 
-    - `Interior_i` : Faces which are interior to processor `i`.
-    - `Interface_j`: Faces which belong to interface `j` (global id).
-    - `Interiors`  : Union of all `Interior_i` tags.
-    - `Interfaces` : Union of all `Interface_j` tags.
+The face labeling contains the following tags: 
+
+- `Interior_i` : Faces which are interior to processor `i`.
+- `Interface_j`: Faces which belong to interface `j` (global id).
+- `Interiors`  : Union of all `Interior_i` tags.
+- `Interfaces` : Union of all `Interface_j` tags.
 """
 function get_global_face_labeling(macro_model::MacroDiscreteModel{Dc}) where Dc
   model = macro_model.model
@@ -374,15 +386,19 @@ function get_global_face_labeling(macro_model::MacroDiscreteModel{Dc}) where Dc
 end
 
 """
-  Returns a local face labeling for the macro model. Does not require communication.
-  WARNING: This is NOT consistent, it is sub-assembled. The same face might have 
-           different labels in different processors.
-
-  The face labeling contains the following tags: 
-    - `Interior`   : Faces which are interior to the processor.
-    - `Exterior`   : Faces which are exterior to the processor.
-    - `Interface_i`: Faces which belong to interface `i` (local id).
-    - `Interfaces` : Union of all `Interface_i` tags.
+    get_local_face_labeling(macro_model::MacroDiscreteModel{Dc})
+
+Returns a local face labeling for the macro model. Does not require communication.
+
+WARNING: This is NOT consistent, it is sub-assembled. The same face might have 
+         different labels in different processors.
+
+The face labeling contains the following tags: 
+
+- `Interior`   : Faces which are interior to the processor.
+- `Exterior`   : Faces which are exterior to the processor.
+- `Interface_i`: Faces which belong to interface `i` (local id).
+- `Interfaces` : Union of all `Interface_i` tags.
 """
 function get_local_face_labeling(macro_model::MacroDiscreteModel{Dc}) where Dc
   model = macro_model.model
diff --git a/src/MultiField.jl b/src/MultiField.jl
index 4b2b1626..053c6ff1 100644
--- a/src/MultiField.jl
+++ b/src/MultiField.jl
@@ -62,6 +62,14 @@ end
 # DistributedMultiFieldFESpace
 
 """
+    DistributedMultiFieldFESpace{MS,A,B,C,D} <: DistributedFESpace
+
+Distributed multi-field FE space coupling several [`DistributedSingleFieldFESpace`](@ref)
+objects into a block system. The global DOF indices are stored as either a `PRange`
+(monolithic) or a `BlockPRange` (block-structured, for block preconditioners).
+
+Constructed via `MultiFieldFESpace([V, Q, ...])` from a vector of distributed
+single-field spaces.
 """
 struct DistributedMultiFieldFESpace{MS,A,B,C,D} <: DistributedFESpace
   multi_field_style::MS
@@ -334,7 +342,7 @@ function generate_multi_field_gids(
     push!(f_p_flid_lid,p_flid_lid)
   end
   f_frange = map(get_free_dof_ids,f_dspace)
-  gids = generate_multi_field_gids(f_p_flid_lid,f_frange)
+  gids = vcat_gids(f_p_flid_lid,f_frange)
   return gids
 end
 
@@ -351,120 +359,6 @@ function generate_multi_field_gids(
   return BlockPRange(block_gids)
 end
 
-function generate_multi_field_gids(
-  f_p_flid_lid::AbstractVector{<:AbstractArray{<:AbstractVector}},
-  f_frange::AbstractVector{<:PRange})
-
-  f_p_fiset = map(local_views,f_frange)
-
-  v(x...) = collect(x)
-  p_f_fiset = map(v,f_p_fiset...)
-  p_f_flid_lid = map(v,f_p_flid_lid...)
-
-  # Find the first gid of the multifield space in each part
-  ngids = sum(map(length,f_frange))
-  p_noids = map(f_fiset->sum(map(own_length,f_fiset)),p_f_fiset)
-  p_firstgid = scan(+,p_noids,type=:exclusive,init=one(eltype(p_noids)))
-
-  # Distributed gids to owned dofs
-  p_lid_gid, p_lid_part = map(
-    p_f_flid_lid, p_f_fiset, p_firstgid) do f_flid_lid, f_fiset, firstgid
-    nlids = sum(map(length,f_flid_lid))
-    lid_gid = zeros(Int,nlids)
-    lid_part = zeros(Int32,nlids)
-    nf = length(f_fiset)
-    gid = firstgid
-    for f in 1:nf
-      fiset = f_fiset[f]
-      fiset_owner_to_local = own_to_local(fiset)
-      flid_lid = f_flid_lid[f]
-      part = part_id(fiset)
-      for foid in 1:own_length(fiset)
-        flid = fiset_owner_to_local[foid]
-        lid = flid_lid[flid]
-        lid_part[lid] = part
-        lid_gid[lid] = gid
-        gid += 1
-      end
-    end
-    lid_gid,lid_part
-  end |> tuple_of_arrays
-
-  # Now we need to propagate to ghost
-  # to this end we use the already available
-  # communicators in each of the single fields
-  # We cannot use the cell wise dof like in the old version
-  # since each field can be defined on an independent mesh.
-  f_aux_gids = map(frange->PVector{Vector{eltype(eltype(p_lid_gid))}}(undef,partition(frange)),f_frange)
-  f_aux_part = map(frange->PVector{Vector{eltype(eltype(p_lid_part))}}(undef,partition(frange)),f_frange)
-  propagate_to_ghost_multifield!(p_lid_gid,f_aux_gids,f_p_flid_lid,f_p_fiset)
-  propagate_to_ghost_multifield!(p_lid_part,f_aux_part,f_p_flid_lid,f_p_fiset)
-
-  p_iset = map(partition(f_frange[1]),p_lid_gid,p_lid_part) do indices,
-                                                               lid_to_gid,
-                                                               lid_to_owner
-     me = part_id(indices)
-     LocalIndices(ngids,me,lid_to_gid,lid_to_owner)
-  end
-
-  # Merge neighbors
-  function merge_neigs(f_neigs)
-    dict = Dict{Int32,Int32}()
-    for f in 1:length(f_neigs)
-      for neig in f_neigs[f]
-        dict[neig] = neig
-      end
-    end
-    collect(keys(dict))
-  end
-
-  f_p_parts_snd, f_p_parts_rcv = map(x->assembly_neighbors(partition(x)),f_frange) |> tuple_of_arrays
-  p_f_parts_snd = map(v,f_p_parts_snd...)
-  p_f_parts_rcv = map(v,f_p_parts_rcv...)
-  p_neigs_snd = map(merge_neigs,p_f_parts_snd)
-  p_neigs_rcv = map(merge_neigs,p_f_parts_rcv)
-
-  exchange_graph = ExchangeGraph(p_neigs_snd,p_neigs_rcv)
-  assembly_neighbors(p_iset;neighbors=exchange_graph)
-
-  PRange(p_iset)
-end
-
-function propagate_to_ghost_multifield!(
-  p_lid_gid,f_gids,f_p_flid_lid,f_p_fiset)
-  # Loop over fields
-  nf = length(f_gids)
-  for f in 1:nf
-    # Write data into owned in single-field buffer
-    gids = f_gids[f]
-    p_flid_gid = gids.vector_partition
-    p_flid_lid = f_p_flid_lid[f]
-    p_fiset = f_p_fiset[f]
-    map(
-      p_flid_gid,p_flid_lid,p_lid_gid,p_fiset) do flid_gid,flid_lid,lid_gid,fiset
-      fiset_own_to_local = own_to_local(fiset)
-      for foid in 1:own_length(fiset)
-        flid = fiset_own_to_local[foid]
-        lid = flid_lid[flid]
-        flid_gid[flid] = lid_gid[lid]
-      end
-    end
-    # move to ghost
-    cache=fetch_vector_ghost_values_cache(partition(gids),p_fiset)
-    fetch_vector_ghost_values!(partition(gids),cache) |> wait
-    # write again into multifield array on ghost ids
-    map(
-      p_flid_gid,p_flid_lid,p_lid_gid,p_fiset) do flid_gid,flid_lid,lid_gid,fiset
-      fiset_ghost_to_local=ghost_to_local(fiset)
-      for fhid in 1:ghost_length(fiset)
-        flid = fiset_ghost_to_local[fhid]
-        lid = flid_lid[flid]
-        lid_gid[lid] = flid_gid[flid]
-      end
-    end
-  end
-end
-
 # BlockSparseMatrixAssemblers
 
 const DistributedBlockSparseMatrixAssembler{R,C} =
@@ -475,7 +369,7 @@ function FESpaces.SparseMatrixAssembler(
   local_vec_type,
   trial::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle},
   test::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle},
-  par_strategy=SubAssembledRows()
+  par_strategy::FESpaces.AssemblyStrategy=Assembled()
 )
   NBr, SBr, Pr = MultiField.get_block_parameters(MultiFieldStyle(test))
   NBc, SBc, Pc = MultiField.get_block_parameters(MultiFieldStyle(trial))
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 3e5264ee..4453e22e 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -1,4 +1,398 @@
 
+"""
+    permuted_variable_partition(n_own,gids,owners; n_global, start)
+
+Create indices which are a permuted version of a variable_partition.
+The advantage of this w.r.t. the `LocalIndices` type, is that we can compute 
+dof owners with minimal communications, since we only need the size of the blocks.
+
+NOTE: This is the type for our FESpace dof_ids.
+"""
+function permuted_variable_partition(
+  n_own::AbstractArray{<:Integer},
+  gids::AbstractArray{<:AbstractArray{<:Integer}},
+  owners::AbstractArray{<:AbstractArray{<:Integer}};
+  n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))),
+  start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
+)
+  ranks = linear_indices(n_own)
+  np = length(ranks)
+  map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners
+    n_local = length(gids)
+    n_ghost = n_local - n_own
+    perm = fill(zero(Int32),n_local)
+    ghost_gids = fill(zero(Int),n_ghost)
+    ghost_owners = fill(zero(Int32),n_ghost)
+
+    n_ghost = 0
+    for (lid,(gid,owner)) in enumerate(zip(gids,owners))
+      if owner == rank
+        perm[lid] = gid-start+1
+      else
+        n_ghost += 1
+        ghost_gids[n_ghost] = gid
+        ghost_owners[n_ghost] = owner
+        perm[lid] = n_own + n_ghost
+      end
+    end
+    @assert n_ghost == n_local - n_own
+
+    ghost = GhostIndices(n_global,ghost_gids,ghost_owners)
+    dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize(
+      CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost
+    )
+    permute_indices(dof_ids,perm)
+  end
+end
+
+"""
+    unpermute(indices::AbstractLocalIndices)
+
+Given local indices, reorders them to (locally) have own indices first, 
+followed by ghost indices.
+"""
+function unpermute(indices::AbstractLocalIndices) 
+  @notimplemented
+end
+
+unpermute(indices::PermutedLocalIndices) = unpermute(indices.indices)
+unpermute(indices::PartitionedArrays.LocalIndicesInBlockPartition) = indices
+unpermute(indices::OwnAndGhostIndices) = indices
+
+function unpermute(indices::LocalIndices)
+  nglobal = global_length(indices)
+  rank = part_id(indices)
+  own = OwnIndices(nglobal,rank,own_to_global(indices))
+  ghost = GhostIndices(nglobal,ghost_to_global(indices),ghost_to_owner(indices))
+  OwnAndGhostIndices(own,ghost)
+end
+
+"""
+    locally_repartition(v::PVector,new_indices)
+
+Map the values of a PVector to a new partitioning of the indices.
+
+Similar to `PartitionedArrays.repartition`, but without any communications. Instead, 
+it is assumed that the local-to-local mapping can be done locally.
+"""
+function locally_repartition(v::PVector,new_indices)
+  w = similar(v,PRange(new_indices))
+  locally_repartition!(w,v)
+end
+
+function locally_repartition!(w::PVector,v::PVector)
+  # Fill own values
+  map(copy!,own_values(w),own_values(v))
+
+  # Fill ghost values
+  new_indices = partition(axes(w,1))
+  old_indices = partition(axes(v,1))
+  map(partition(w),partition(v),new_indices,old_indices) do w,v,new_ids,old_ids
+    old_gid_to_lid = global_to_local(old_ids)
+    for (lid,gid) in zip(ghost_to_local(new_ids),ghost_to_global(new_ids))
+      old_lid = old_gid_to_lid[gid]
+      w[lid] = v[old_lid]
+    end
+  end
+
+  return w
+end
+
+
+"""
+    filter_and_replace_ghost(indices,gids)
+
+Replace ghost ids in `indices` with the ghost ids within `gids`.
+
+NOTE: The issue is that `replace_ghost` does not check if all gids are ghosts or whether
+they are repeated. It also shifts ownership of the ghosts. Its all quite messy and not what we
+would need. TODO: Make me better.
+"""
+function filter_and_replace_ghost(indices,gids)
+  owners = find_owner(indices,gids)
+  new_indices = map(indices,gids,owners) do indices, gids, owners
+    ghost_gids, ghost_owners = _filter_ghost(indices,gids,owners)
+    replace_ghost(indices, ghost_gids, ghost_owners)
+  end
+  return new_indices
+end
+
+# Block variant: union a slice of distributed gid arrays (e.g. I[i,:] or J[:,j])
+# before calling the standard filter_and_replace_ghost.
+function filter_and_replace_ghost(gids_block::Vector{<:AbstractArray}, ids)
+  union_gids = map(to_parray_of_arrays(gids_block)) do chunks
+    vcat(chunks...)
+  end
+  filter_and_replace_ghost(map(unpermute,ids), union_gids)
+end
+
+# Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that 
+# belong to `indices`. This could eventually be a flag in the original function.
+function _filter_ghost(indices,gids,owners)
+  ghosts = Set{Int}()
+  part_owner = part_id(indices)
+
+  n_ghost = 0
+  for (gid,owner) in zip(gids,owners)
+    if gid < 1
+      continue
+    end
+    if (owner != part_owner) && !(gid in ghosts)
+      n_ghost += 1
+      push!(ghosts,gid)
+    end
+  end
+
+  new_ghost_to_global = zeros(Int,n_ghost)
+  new_ghost_to_owner = zeros(Int32,n_ghost)
+
+  empty!(ghosts)
+  n_ghost = 0
+  for (gid,owner) in zip(gids,owners)
+    if gid < 1
+      continue
+    end
+    if (owner != part_owner) && !(gid in ghosts)
+      n_ghost += 1
+      new_ghost_to_global[n_ghost] = gid
+      new_ghost_to_owner[n_ghost] = owner
+      push!(ghosts,gid)
+    end
+  end
+
+  return new_ghost_to_global, new_ghost_to_owner
+end
+
+# function PArrays.remove_ghost(indices::PermutedLocalIndices)
+#   n_global = global_length(indices) 
+#   own = OwnIndices(n_global,part_id(indices),own_to_global(indices))
+#   ghost = GhostIndices(n_global,Int[],Int32[])
+#   OwnAndGhostIndices(own,ghost)
+# end
+
+function PArrays.remove_ghost(indices::PermutedLocalIndices)
+  @check indices.perm[own_to_local(indices)] == 1:own_length(indices)
+  return remove_ghost(unpermute(indices))
+end
+
+# This function computes a mapping among the local identifiers of a and b
+# for which the corresponding global identifiers are both in a and b. 
+# Note that the haskey check is necessary because in the general case 
+# there might be gids in b which are not present in a
+function local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices)
+  a_lid_to_b_lid = fill(zero(Int32),local_length(a))
+
+  b_local_to_global  = local_to_global(b)
+  a_global_to_local  = global_to_local(a)
+  for b_lid in 1:local_length(b)
+    gid = b_local_to_global[b_lid]
+    a_lid = a_global_to_local[gid]
+    if !iszero(a_lid)
+      a_lid_to_b_lid[a_lid] = b_lid
+    end  
+  end
+  a_lid_to_b_lid
+end
+
+# SubSparseMatrix extensions
+
+function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
+  I,J,V = findnz(A.parent)
+  rowmap, colmap = A.inv_indices
+  for k in eachindex(I)
+    I[k] = rowmap[I[k]]
+    J[k] = colmap[J[k]]
+  end
+  mask = map((i,j) -> (i > 0 && j > 0), I, J)
+  return I[mask], J[mask], V[mask]
+end
+
+# Async tasks
+
+const empty_async_task = PartitionedArrays.FakeTask(() -> nothing)
+
+Base.copy(a::MPIArray) = map(copy, a)
+
+# Linear algebra
+
+function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
+  @check matching_local_indices(partition(axes(x,1)),partition(axes(y,1)))
+  foreach(partition(x),partition(y)) do x,y
+    LinearAlgebra.axpy!(α,x,y)
+  end
+  consistent!(y) |> wait
+  return y
+end
+
+
+function Algebra.axpy_entries!(
+  α::Number, A::PSparseMatrix, B::PSparseMatrix;
+  check::Bool=true
+)
+# We should definitely check here that the index partitions are the same. 
+# However: Because the different matrices are assembled separately, the objects are not the 
+# same (i.e can't use ===). Checking the index partitions would then be costly...
+  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
+  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
+  foreach(partition(A),partition(B)) do A, B
+    Algebra.axpy_entries!(α,A,B;check)
+  end
+  return B
+end
+
+
+# Array of PArrays -> PArray of Arrays 
+# TODO: I think this is now implemented in PartitionedArrays.jl (check)
+function to_parray_of_arrays(a::AbstractArray{<:MPIArray})
+  indices = linear_indices(first(a))
+  map(indices) do i
+    map(a) do aj
+      PartitionedArrays.getany(aj)
+    end
+  end
+end
+
+function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
+  indices = linear_indices(first(a))
+  map(indices) do i
+    map(a) do aj
+      aj.items[i]
+    end
+  end
+end
+
+# To local/to global for blocks 
+
+function to_local_indices!(I,ids::PRange;kwargs...)
+  foreach(to_local!,I,partition(ids))
+  return I
+end
+
+function to_global_indices!(I,ids::PRange;kwargs...)
+  foreach(to_global!,I,partition(ids))
+  return I
+end
+
+# This type is required because MPIArray from PArrays 
+# cannot be instantiated with a NULL communicator
+struct MPIVoidVector{T} <: AbstractVector{T}
+  comm::MPI.Comm
+  function MPIVoidVector(::Type{T}) where {T}
+    new{T}(MPI.COMM_NULL)
+  end
+end
+
+Base.size(a::MPIVoidVector) = (0,)
+Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear()
+function Base.getindex(a::MPIVoidVector,i::Int)
+  error("Indexing of MPIVoidVector not possible.")
+end
+function Base.setindex!(a::MPIVoidVector,v,i::Int)
+  error("Indexing of MPIVoidVector not possible.")
+end
+function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector)
+  println(io,"MPIVoidVector")
+end
+
+# Communication extras, subpartitioning extras
+
+function num_parts(comm::MPI.Comm)
+  if comm != MPI.COMM_NULL
+    nparts = MPI.Comm_size(comm)
+  else
+    nparts = -1
+  end
+  nparts
+end
+@inline num_parts(comm::MPIArray) = num_parts(comm.comm)
+@inline num_parts(comm::DebugArray) = length(comm.items)
+@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm)
+
+function get_part_id(comm::MPI.Comm)
+  if comm != MPI.COMM_NULL
+    id = MPI.Comm_rank(comm)+1
+  else
+    id = -1
+  end
+  id
+end
+@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm)
+@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm)
+
+"""
+    i_am_in(comm) -> Bool
+
+Return `true` if the current MPI rank belongs to the (sub-)communicator `comm`.
+In `DebugArray` mode always returns `true`.
+
+Use this to guard code that should only run on a subset of ranks:
+
+```julia
+coarse_ranks = generate_subparts(ranks, 2)
+if i_am_in(coarse_ranks)
+  # only executed on ranks 1–2
+end
+```
+"""
+function i_am_in(comm::MPI.Comm)
+  get_part_id(comm) >=0
+end
+@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm)
+@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm)
+@inline i_am_in(comm::DebugArray) = true
+
+function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing)
+  x_new = map(new_parts) do p
+    if isa(x,MPIArray)
+      PartitionedArrays.getany(x)
+    elseif isa(x,DebugArray) && (p <= length(x.items))
+      x.items[p]
+    else
+      default
+    end
+  end
+  return x_new
+end
+
+"""
+    generate_subparts(parts, new_comm_size) -> AbstractArray
+
+Return a new distributed array of rank indices with `new_comm_size` entries,
+formed from the first `new_comm_size` ranks of `parts`. Ranks outside the new
+sub-communicator receive an `MPIVoidVector` (a null communicator placeholder).
+
+Use [`i_am_in`](@ref) to check whether the current rank belongs to the returned
+sub-communicator before executing sub-communicator-specific code.
+"""
+function generate_subparts(parts::MPIArray,new_comm_size)
+  root_comm = parts.comm
+  root_size = MPI.Comm_size(root_comm)
+  rank = MPI.Comm_rank(root_comm)
+
+  @static if isdefined(MPI,:MPI_UNDEFINED)
+    mpi_undefined = MPI.MPI_UNDEFINED[]
+  else
+    mpi_undefined = MPI.API.MPI_UNDEFINED[]
+  end
+  
+  if root_size == new_comm_size
+    return parts
+  else
+    if rank < new_comm_size
+      comm = MPI.Comm_split(root_comm,0,0)
+      return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false)
+    else
+      comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined)
+      return MPIVoidVector(eltype(parts))
+    end
+  end
+end
+
+function generate_subparts(parts::DebugArray,new_comm_size)
+  DebugArray(LinearIndices((new_comm_size,)))
+end
+
+# TODO: This comes from master! Still needed? Probably not. 
 # This is copied from PArrays v0.5, which is not yet supported. It is necessary to get 
 # more than one layers of ghosts.
 
diff --git a/src/Partitions.jl b/src/Partitions.jl
new file mode 100644
index 00000000..fe327267
--- /dev/null
+++ b/src/Partitions.jl
@@ -0,0 +1,630 @@
+
+# Cell-wise communication helpers
+
+function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids)
+  foreach(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids
+    cache  = array_cache(cell_to_ldofs)
+    for cell in cell_ids
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,ldof) in enumerate(ldofs)
+        if ldof > 0
+          cwv.data[i+p] = dwv[ldof]
+        end
+      end
+    end
+  end
+  return cell_wise_vector
+end
+
+function posneg_wise_to_cell_wise!(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids)
+  foreach(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids) do cwv,pwv,nwv,cell_to_posneg,cell_ids
+    cache  = array_cache(cell_to_posneg)
+    for cell in cell_ids
+      lids = getindex!(cache,cell_to_posneg,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,lid) in enumerate(lids)
+        if lid > 0
+          cwv.data[i+p] = pwv[lid]
+        elseif lid < 0
+          cwv.data[i+p] = nwv[-lid]
+        end
+      end
+    end
+  end
+  return cell_wise_vector
+end
+
+function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids)
+  foreach(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids
+    cache = array_cache(cell_to_ldofs)
+    for cell in cell_ids
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,ldof) in enumerate(ldofs)
+        if ldof > 0
+          dwv[ldof] = cwv.data[i+p]
+        end
+      end
+    end
+  end
+  return dof_wise_vector
+end
+
+function cell_wise_to_posneg_wise!(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids)
+  foreach(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids) do pwv,nwv,cwv,cell_to_posneg,cell_ids
+    cache = array_cache(cell_to_posneg)
+    for cell in cell_ids
+      lids = getindex!(cache,cell_to_posneg,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,lid) in enumerate(lids)
+        if lid > 0
+          pwv[lid] = cwv.data[i+p]
+        elseif lid < 0
+          nwv[-lid] = cwv.data[i+p]
+        end
+      end
+    end
+  end
+  return pos_wise_vector, neg_wise_vector
+end
+
+function allocate_cell_wise_vector(T, cell_to_lids)
+  map(cell_to_lids) do cell_to_lids
+    ptrs = Arrays.generate_ptrs(cell_to_lids)
+    data = zeros(T,ptrs[end]-1)
+    JaggedArray(data,ptrs)
+  end
+end
+
+function dof_wise_to_cell_wise(
+  dof_wise_vector, cell_to_ldofs, cell_ids;
+  T = eltype(eltype(dof_wise_vector))
+)
+  cwv = allocate_cell_wise_vector(T,cell_to_ldofs)
+  dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids)
+  return cwv
+end
+
+function fetch_vector_ghost_values_cache(vector_partition,partition)
+  cache = PArrays.p_vector_cache(vector_partition,partition)
+  reverse(cache)
+end
+
+function fetch_vector_ghost_values!(vector_partition,cache)
+  assemble!((a,b)->b, vector_partition, cache)
+end
+
+# PRange generation from cell meshes
+
+"""
+    generate_gids(cell_gids::PRange, cell_to_lids, nlids) -> PRange
+
+Given a set of cell global ids, a distributed array of local tables mapping cells to local dof ids,
+and a distributed array with the number of local dofs in each partition, this function
+generates the global dof ids and returns them as a `PRange` of `PermutedLocalIndices`.
+
+Ignores negative local dof ids (usually used for Dirichlet dofs).
+"""
+function generate_gids(
+  cell_range::PRange,
+  cell_to_ldofs::AbstractArray{<:AbstractArray},
+  nldofs::AbstractArray{<:Integer}
+)
+  ranks = linear_indices(partition(cell_range))
+  cell_ldofs_to_data = allocate_cell_wise_vector(Int, cell_to_ldofs)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_data,partition(cell_range))
+
+  # Find and count number owned dofs
+  ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs
+    ldof_to_owner = fill(Int32(0),nldofs)
+    cache = array_cache(cell_to_ldofs)
+    for (cell, owner) in enumerate(local_to_owner(indices))
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      for ldof in ldofs
+        if ldof > 0
+          # NOTE: this approach concentrates dofs in the last processor
+          ldof_to_owner[ldof] = max(owner,ldof_to_owner[ldof])
+        end
+      end
+    end
+    nodofs = count(isequal(part_id(indices)),ldof_to_owner)
+    ldof_to_owner, nodofs
+  end |> tuple_of_arrays
+
+  # Find the global range of owned dofs
+  first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs)))
+
+  # Exchange the dof owners. Cell owner always has correct dof owner.
+  cell_ldofs_to_owner = dof_wise_to_cell_wise!(
+    cell_ldofs_to_data,ldof_to_owner,cell_to_ldofs,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_ldofs_to_owner,cache_fetch) |> wait
+  cell_wise_to_dof_wise!(
+    ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range)
+  )
+
+  # Fill owned gids
+  ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner
+    offset = first_gdof-1
+    ldof_to_gdof = zeros(Int,length(ldof_to_owner))
+    odof = 0
+    for (ldof,owner) in enumerate(ldof_to_owner)
+      if owner == rank
+        odof += 1
+        ldof_to_gdof[ldof] = odof + offset
+      end
+    end
+    ldof_to_gdof
+  end
+
+  # Exchange gids
+  cell_to_gdofs = dof_wise_to_cell_wise!(
+    cell_ldofs_to_data,ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
+
+  # Fill ghost gids with exchanged information
+  foreach(
+    cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)
+  ) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices
+    cache = array_cache(cell_to_ldofs)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
+      p = cell_to_gdofs.ptrs[cell]-1
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      cell_owner = lcell_to_owner[cell]
+      for (i,ldof) in enumerate(ldofs)
+        if (ldof > 0) && isequal(ldof_to_owner[ldof],cell_owner)
+          ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
+        end
+      end
+    end
+  end
+
+  dof_wise_to_cell_wise!(cell_to_gdofs,ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range))
+  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
+  cell_wise_to_dof_wise!(ldof_to_gdof,cell_to_gdofs,cell_to_ldofs,ghost_to_local(cell_range))
+
+  # Setup DoFs LocalIndices
+  local_indices = permuted_variable_partition(
+    nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof
+  )
+
+  return PRange(local_indices)
+end
+
+"""
+    generate_posneg_gids(cell_gids::PRange, cell_to_lposneg, nlpos, nlneg) -> (PRange,PRange)
+
+Similar to `generate_gids`, but also handles negative local dof ids. Returns two sets of
+global ids: one for positive local ids and another for negative local ids.
+This can be used to generate simultaneously free and dirichlet dof global ids.
+"""
+function generate_posneg_gids(
+  cell_range::PRange,
+  cell_to_lposneg::AbstractArray{<:AbstractArray},
+  nlpos::AbstractArray{<:Integer},
+  nlneg::AbstractArray{<:Integer}
+)
+  ranks = linear_indices(partition(cell_range))
+  cell_lids_to_data = allocate_cell_wise_vector(Int, cell_to_lposneg)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_lids_to_data,partition(cell_range))
+
+  # Find and count number owned dofs
+  lpos_to_owner, lneg_to_owner, nopos, noneg = map(
+    partition(cell_range),cell_to_lposneg,nlpos,nlneg
+  ) do indices,cell_to_lposneg,nlpos,nlneg
+    lpos_to_owner = fill(zero(Int32),nlpos)
+    lneg_to_owner = fill(zero(Int32),nlneg)
+    cache = array_cache(cell_to_lposneg)
+    for (cell, owner) in enumerate(local_to_owner(indices))
+      lids = getindex!(cache,cell_to_lposneg,cell)
+      for lid in lids
+        if lid > 0
+          lpos_to_owner[lid] = max(owner,lpos_to_owner[lid])
+        elseif lid < 0
+          lneg_to_owner[-lid] = max(owner,lneg_to_owner[-lid])
+        end
+      end
+    end
+    rank = part_id(indices)
+    nopos = count(isequal(rank),lpos_to_owner)
+    noneg = count(isequal(rank),lneg_to_owner)
+    return lpos_to_owner, lneg_to_owner, nopos, noneg
+  end |> tuple_of_arrays
+
+  # Find the global range of owned dofs
+  first_gpos = scan(+,nopos,type=:exclusive,init=1)
+  first_gneg = scan(+,noneg,type=:exclusive,init=1)
+
+  # Exchange the dof owners. Cell owner always has correct dof owner.
+  cell_ldofs_to_owner = posneg_wise_to_cell_wise!(
+    cell_lids_to_data,lpos_to_owner,lneg_to_owner,cell_to_lposneg,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_ldofs_to_owner,cache_fetch) |> wait
+  cell_wise_to_posneg_wise!(
+    lpos_to_owner,lneg_to_owner,cell_ldofs_to_owner,cell_to_lposneg,ghost_to_local(cell_range)
+  )
+
+  # Fill owned gids
+  lpos_to_gpos = map(ranks,first_gpos,lpos_to_owner) do rank,first_gpos,lpos_to_owner
+    offset = first_gpos-1
+    lpos_to_gpos = zeros(Int,length(lpos_to_owner))
+    opos = 0
+    for (lpos,owner) in enumerate(lpos_to_owner)
+      if owner == rank
+        opos += 1
+        lpos_to_gpos[lpos] = opos + offset
+      end
+    end
+    lpos_to_gpos
+  end
+
+  lneg_to_gneg = map(ranks,first_gneg,lneg_to_owner) do rank,first_gneg,lneg_to_owner
+    offset = first_gneg-1
+    lneg_to_gneg = zeros(Int,length(lneg_to_owner))
+    oneg = 0
+    for (lneg,owner) in enumerate(lneg_to_owner)
+      if owner == rank
+        oneg += 1
+        lneg_to_gneg[lneg] = oneg + offset
+      end
+    end
+    lneg_to_gneg
+  end
+
+  # Exchange gids
+  cell_to_gposneg = posneg_wise_to_cell_wise!(
+    cell_lids_to_data,lpos_to_gpos,lneg_to_gneg,cell_to_lposneg,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
+
+  # Fill ghost gids with exchanged information
+  foreach(
+    cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,partition(cell_range)
+  ) do cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,indices
+    cache = array_cache(cell_to_lposneg)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
+      p = cell_to_gposneg.ptrs[cell]-1
+      lids = getindex!(cache,cell_to_lposneg,cell)
+      cell_owner = lcell_to_owner[cell]
+      for (i,lid) in enumerate(lids)
+        if (lid > 0) && isequal(lpos_to_owner[lid],cell_owner)
+          lpos_to_gpos[lid] = cell_to_gposneg.data[i+p]
+        elseif (lid < 0) && isequal(lneg_to_owner[-lid],cell_owner)
+          lneg_to_gneg[-lid] = cell_to_gposneg.data[i+p]
+        end
+      end
+    end
+  end
+
+  posneg_wise_to_cell_wise!(cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,cell_to_lposneg,own_to_local(cell_range))
+  fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
+  cell_wise_to_posneg_wise!(lpos_to_gpos,lneg_to_gneg,cell_to_gposneg,cell_to_lposneg,ghost_to_local(cell_range))
+
+  # Setup DoFs LocalIndices
+  local_indices_pos = permuted_variable_partition(nopos,lpos_to_gpos,lpos_to_owner;start=first_gpos)
+  local_indices_neg = permuted_variable_partition(noneg,lneg_to_gneg,lneg_to_owner;start=first_gneg)
+
+  return PRange(local_indices_pos), PRange(local_indices_neg)
+end
+
+"""
+    generate_gids_by_color(cell_gids::PRange, cell_to_lids, lid_to_color, ncolors)
+
+Similar to `generate_gids`, but uses a global partition given by `lid_to_color` to generate
+a different set of global ids per color. Returns:
+
+- a tuple with a `PRange` of `PermutedLocalIndices` per color.
+- a mapping `lid_to_clid` that maps local ids to local color ids
+- a mapping `color_to_clid_to_lid` that maps, for each color, local color ids to local ids.
+"""
+function generate_gids_by_color(
+  cell_range::PRange,
+  cell_to_lids::AbstractArray{<:AbstractArray},
+  lid_to_color::AbstractArray,
+  ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
+)
+  cell_lids_to_data = allocate_cell_wise_vector(Int, cell_to_lids)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_lids_to_data,partition(cell_range))
+
+  lid_to_owner, color_to_noids = map(
+    partition(cell_range),cell_to_lids,lid_to_color
+  ) do indices, cell_to_lids, lid_to_color
+    lid_to_owner = fill(zero(Int32),length(lid_to_color))
+    cache = array_cache(cell_to_lids)
+    for (cell, owner) in enumerate(local_to_owner(indices))
+      lids = getindex!(cache,cell_to_lids,cell)
+      for lid in lids
+        (lid < 0) && continue
+        lid_to_owner[lid] = max(owner,lid_to_owner[lid])
+      end
+    end
+
+    rank = part_id(indices)
+    color_to_noids = zeros(Int,ncolors)
+    for (owner, color) in zip(lid_to_owner,lid_to_color)
+      color_to_noids[color] += isequal(owner,rank)
+    end
+
+    return lid_to_owner, color_to_noids
+  end |> tuple_of_arrays
+
+  # Find first gid per color
+  color_to_fgid = map(1:ncolors) do c
+    scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
+  end |> to_parray_of_arrays
+
+  # Start exchanging dof owners
+  cell_lids_to_owner = dof_wise_to_cell_wise!(
+    cell_lids_to_data,lid_to_owner,cell_to_lids,own_to_local(cell_range)
+  )
+  t1 = fetch_vector_ghost_values!(cell_lids_to_owner,cache_fetch)
+
+  # Note: lid_to_owner is still not consistent, but owned data is correct
+  lid_to_gid, lid_to_clid, color_to_clid_to_lid = map(
+    partition(cell_range), lid_to_color, lid_to_owner, color_to_fgid
+  ) do indices, lid_to_color, lid_to_owner, color_to_fgid
+
+    # Count number of dofs per color
+    rank = part_id(indices)
+    color_to_nldofs = zeros(Int,ncolors)
+    lid_to_clid = zeros(Int32,length(lid_to_color))
+    lid_to_gid = zeros(Int,length(lid_to_color))
+    for (lid, (color, owner)) in enumerate(zip(lid_to_color,lid_to_owner))
+      color_to_nldofs[color] += 1
+      lid_to_clid[lid] = color_to_nldofs[color]
+      if isequal(owner,rank)
+        lid_to_gid[lid] = color_to_fgid[color]
+        color_to_fgid[color] += 1
+      end
+    end
+
+    # Find clid to lid mapping
+    color_to_clid_to_lid = [fill(zero(Int32),n) for n in color_to_nldofs]
+    for (lid, (color, clid)) in enumerate(zip(lid_to_color,lid_to_clid))
+      color_to_clid_to_lid[color][clid] = lid
+    end
+    return lid_to_gid, lid_to_clid, color_to_clid_to_lid
+  end |> tuple_of_arrays
+
+  # Finish exchanging the dof owners.
+  wait(t1)
+  cell_wise_to_dof_wise!(
+    lid_to_owner,cell_lids_to_owner,cell_to_lids,ghost_to_local(cell_range)
+  )
+
+  # Start exchanging gids
+  cell_to_gids = dof_wise_to_cell_wise!(
+    cell_lids_to_data,lid_to_gid,cell_to_lids,own_to_local(cell_range)
+  )
+  t2 = fetch_vector_ghost_values!(cell_to_gids,cache_fetch)
+
+  # Finish exchanging the gids.
+  wait(t2)
+  map(
+    cell_to_lids,cell_to_gids,lid_to_gid,lid_to_owner,partition(cell_range)
+  ) do cell_to_lids,cell_to_gids,lid_to_gid,lid_to_owner,indices
+    cache = array_cache(cell_to_lids)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
+      p = cell_to_gids.ptrs[cell]-1
+      lids = getindex!(cache,cell_to_lids,cell)
+      cell_owner = lcell_to_owner[cell]
+      for (i,lid) in enumerate(lids)
+        if (lid > 0) && isequal(lid_to_owner[lid],cell_owner)
+          lid_to_gid[lid] = cell_to_gids.data[i+p]
+        end
+      end
+    end
+  end
+
+  dof_wise_to_cell_wise!(cell_to_gids,lid_to_gid,cell_to_lids,own_to_local(cell_range))
+  fetch_vector_ghost_values!(cell_to_gids,cache_fetch) |> wait
+  cell_wise_to_dof_wise!(lid_to_gid,cell_to_gids,cell_to_lids,ghost_to_local(cell_range))
+
+  # Setup DoFs indices per color
+  color_to_indices = ntuple(ncolors) do c
+    c_noids  = map(Base.Fix2(getindex,c), color_to_noids)
+    c_fgid   = color_to_fgid[c]
+    c_gids   = map(getindex, lid_to_gid,   color_to_clid_to_lid[c])
+    c_owners = map(getindex, lid_to_owner, color_to_clid_to_lid[c])
+    PRange(permuted_variable_partition(c_noids, c_gids, c_owners; start=c_fgid))
+  end
+
+  return color_to_indices, lid_to_clid, color_to_clid_to_lid
+end
+
+"""
+    split_gids_by_color(gids::PRange, lid_to_color::AbstractArray, ncolors) -> NTuple{ncolors,PRange}
+
+Given a set of global ids and a mapping from local ids to colors, this function splits
+the global ids into different sets of global ids per color. Returns a tuple with a `PRange`
+of `PermutedLocalIndices` per color.
+"""
+function split_gids_by_color(
+  gids::PRange, lid_to_color::AbstractArray,
+  ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
+)
+  color_to_nlids, color_to_noids = map(partition(gids), lid_to_color) do ids, lid_to_color
+    rank = part_id(ids)
+    lid_to_owner = local_to_owner(ids)
+    color_to_nlids = zeros(Int,ncolors)
+    color_to_noids = zeros(Int,ncolors)
+    for (owner,color) in zip(lid_to_owner,lid_to_color)
+      color_to_nlids[color] += 1
+      color_to_noids[color] += isequal(owner,rank)
+    end
+    return color_to_nlids, color_to_noids
+  end |> tuple_of_arrays
+
+  # I wish we could reduce arrays directly...
+  color_to_fgid = map(1:ncolors) do c
+    scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
+  end |> to_parray_of_arrays
+
+  lid_to_cgid = map(partition(gids), lid_to_color, color_to_fgid) do ids, lid_to_color, color_to_fgid
+    rank = part_id(ids)
+    lid_to_cgid = zeros(Int,length(lid_to_color))
+    color_to_offset = zeros(Int,ncolors)
+    for (lid,(owner,color)) in enumerate(zip(local_to_owner(ids),lid_to_color))
+      if isequal(owner,rank)
+        lid_to_cgid[lid] = color_to_fgid[color] + color_to_offset[color]
+        color_to_offset[color] += 1
+      end
+    end
+    return lid_to_cgid
+  end
+  consistent!(PVector(lid_to_cgid,partition(gids))) |> wait
+
+  # Pack per-color gid/owner arrays in a single pass over local dofs
+  all_c_gids, all_c_owners = map(
+    lid_to_color, lid_to_cgid, partition(gids), color_to_nlids
+  ) do lid_to_color, lid_to_cgid, ids, color_to_nlids
+    c_gids   = [zeros(Int,   n) for n in color_to_nlids]
+    c_owners = [zeros(Int32, n) for n in color_to_nlids]
+    offsets = zeros(Int, ncolors)
+    for (color, cgid, owner) in zip(lid_to_color, lid_to_cgid, local_to_owner(ids))
+      offsets[color] += 1
+      lid = offsets[color]
+      c_gids[color][lid]   = cgid
+      c_owners[color][lid] = owner
+    end
+    return c_gids, c_owners
+  end |> tuple_of_arrays
+
+  return ntuple(ncolors) do c
+    c_noids  = map(Base.Fix2(getindex,c), color_to_noids)
+    c_fgid   = color_to_fgid[c]
+    c_gids   = map(Base.Fix2(getindex,c), all_c_gids)
+    c_owners = map(Base.Fix2(getindex,c), all_c_owners)
+    PRange(permuted_variable_partition(c_noids, c_gids, c_owners; start=c_fgid))
+  end
+end
+
+# PRange restriction
+
+function restrict_gids(gids::PRange, new_to_old_lid::AbstractArray)
+
+  n_own = map(partition(gids), new_to_old_lid) do ids, n2o_lid
+    rank = part_id(ids)
+    return count(isequal(rank), view(local_to_owner(ids), n2o_lid))
+  end
+
+  # Assign global ids to owned lids
+  first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
+
+  old_lid_to_new_gid = map(first_gid,new_to_old_lid,partition(gids)) do first_gid, n2o_lid, ids
+    old_lid_to_new_gid = zeros(Int,local_length(ids))
+    old_lid_to_owner = local_to_owner(ids)
+    rank = part_id(ids)
+    gid = first_gid
+    for old in n2o_lid
+      if old_lid_to_owner[old] == rank
+        old_lid_to_new_gid[old] = gid
+        gid += 1
+      end
+    end
+    return old_lid_to_new_gid
+  end
+
+  consistent!(PVector(old_lid_to_new_gid,partition(gids))) |> wait
+
+  # Prepare new partition
+  n_gids     = reduction(+,n_own,destination=:all,init=zero(eltype(n_own)))
+  new_gids   = map(getindex, old_lid_to_new_gid, new_to_old_lid)
+  new_owners = map(getindex, local_to_owner(gids), new_to_old_lid)
+
+  new_indices = permuted_variable_partition(
+    n_own, new_gids, new_owners; n_global=n_gids, start=first_gid
+  )
+
+  p_snd, p_rcv = assembly_neighbors(partition(gids))
+  assembly_neighbors(new_indices; neighbors=ExchangeGraph(p_snd, p_rcv))
+
+  return PRange(new_indices)
+end
+
+# Multi-field PRange concatenation
+
+"""
+    vcat_gids(f_p_flid_lid, f_frange) -> PRange
+
+Concatenate N per-field PRanges into a single multi-field PRange.
+
+- `f_p_flid_lid`: for each field f, a distributed array mapping single-field local ids to multi-field local ids.
+- `f_frange`: per-field PRanges.
+"""
+function vcat_gids(
+  f_p_flid_lid::AbstractVector{<:AbstractArray{<:AbstractVector}},
+  f_frange::AbstractVector{<:PRange}
+)
+  f_p_fiset = map(local_views,f_frange)
+
+  v(x...) = collect(x)
+  p_f_fiset    = map(v,f_p_fiset...)
+  p_f_flid_lid = map(v,f_p_flid_lid...)
+
+  # Per-part own DOF count and first global ID
+  p_noids    = map(f_fiset->sum(map(own_length,f_fiset)),p_f_fiset)
+  p_firstgid = scan(+,p_noids,type=:exclusive,init=one(eltype(p_noids)))
+
+  # Fill owned gids and owners
+  p_lid_gid, p_lid_part = map(p_f_flid_lid, p_f_fiset, p_firstgid) do f_flid_lid, f_fiset, firstgid
+    nlids    = sum(length, f_flid_lid)
+    lid_gid  = zeros(Int,   nlids)
+    lid_part = zeros(Int32, nlids)
+    gid = firstgid
+    for (flid_lid, fiset) in zip(f_flid_lid, f_fiset)
+      part = part_id(fiset)
+      for flid in own_to_local(fiset)
+        lid_gid[flid_lid[flid]]  = gid
+        lid_part[flid_lid[flid]] = part
+        gid += 1
+      end
+    end
+    return lid_gid, lid_part
+  end |> tuple_of_arrays
+
+  # Propagate gid and owner to ghost DOFs via each field's communicator
+  f_aux_gids = map(frange->PVector{Vector{eltype(eltype(p_lid_gid))}}(undef,partition(frange)),f_frange)
+  f_aux_part = map(frange->PVector{Vector{eltype(eltype(p_lid_part))}}(undef,partition(frange)),f_frange)
+  _vcat_propagate_ghost!(p_lid_gid,f_aux_gids,f_p_flid_lid,f_p_fiset)
+  _vcat_propagate_ghost!(p_lid_part,f_aux_part,f_p_flid_lid,f_p_fiset)
+
+  # Build permuted partition
+  p_iset = permuted_variable_partition(p_noids,p_lid_gid,p_lid_part;start=p_firstgid)
+
+  # Merge assembly neighbors from all fields
+  f_p_parts_snd, f_p_parts_rcv = map(assembly_neighbors ∘ partition, f_frange) |> tuple_of_arrays
+  merge_neigs(f_neigs...) = sort(unique(vcat(f_neigs...)))
+  p_neigs_snd = map(merge_neigs,f_p_parts_snd...)
+  p_neigs_rcv = map(merge_neigs,f_p_parts_rcv...)
+
+  exchange_graph = ExchangeGraph(p_neigs_snd,p_neigs_rcv)
+  assembly_neighbors(p_iset; neighbors=exchange_graph)
+
+  return PRange(p_iset)
+end
+
+function _vcat_propagate_ghost!(
+  p_lid_gid, f_gids, f_p_flid_lid, f_p_fiset
+)
+  for (gids, p_flid_lid, p_fiset) in zip(f_gids, f_p_flid_lid, f_p_fiset)
+    p_flid_gid = local_views(gids)
+    foreach(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
+      for flid in own_to_local(fiset)
+        flid_gid[flid] = lid_gid[flid_lid[flid]]
+      end
+    end
+    cache = fetch_vector_ghost_values_cache(partition(gids), p_fiset)
+    fetch_vector_ghost_values!(partition(gids), cache) |> wait
+    foreach(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
+      for flid in ghost_to_local(fiset)
+        lid_gid[flid_lid[flid]] = flid_gid[flid]
+      end
+    end
+  end
+end
diff --git a/src/Pullbacks.jl b/src/Pullbacks.jl
index 420f89fe..eded2413 100644
--- a/src/Pullbacks.jl
+++ b/src/Pullbacks.jl
@@ -127,7 +127,7 @@ function FESpaces.compute_facet_owners(model::DistributedDiscreteModel)
   end
 
   # Map local owners to global ids
-  map(facet_to_owner, cell_ids) do facet_to_owner, cell_ids
+  foreach(facet_to_owner, cell_ids) do facet_to_owner, cell_ids
     lid_to_gid = local_to_global(cell_ids)
     for f in eachindex(facet_to_owner)
       lid = facet_to_owner[f]
@@ -138,9 +138,9 @@ function FESpaces.compute_facet_owners(model::DistributedDiscreteModel)
   # Communicate true owners across processes
   wait(consistent!(PVector(facet_to_owner, facet_ids)))
 
-  # Non-local owners will be set to zero, which 
+  # Non-local owners will be set to zero, which
   # will trigger a sign flip (which is the correct behaviour)
-  map(facet_to_owner, cell_ids) do facet_to_owner, cell_ids
+  foreach(facet_to_owner, cell_ids) do facet_to_owner, cell_ids
     gid_to_lid = global_to_local(cell_ids)
     for f in eachindex(facet_to_owner)
       gid = facet_to_owner[f]
diff --git a/src/Redistribution.jl b/src/Redistribution.jl
index f3d8a8f2..a20e2723 100644
--- a/src/Redistribution.jl
+++ b/src/Redistribution.jl
@@ -649,7 +649,11 @@ function p_vector_redistribution_cache(values, indices, indices_red)
   buffers_snd, buffers_rcv = map(
     PartitionedArrays.assembly_buffers,values,lids_snd,lids_rcv
   ) |> tuple_of_arrays
-  caches = map(PartitionedArrays.VectorAssemblyCache,nbors_snd, nbors_rcv,lids_snd, lids_rcv,buffers_snd,buffers_rcv)
+  graph = ExchangeGraph(nbors_snd, nbors_rcv)
+  exchange_setup = PartitionedArrays.setup_exchange(buffers_rcv, buffers_snd, graph)
+  caches = map(nbors_snd, nbors_rcv, lids_snd, lids_rcv, buffers_snd, buffers_rcv) do ns, nr, ls, lr, bs, br
+    PartitionedArrays.VectorAssemblyCache(ns, nr, ls, lr, bs, br, exchange_setup)
+  end
   return caches
 end
 
diff --git a/src/Visualization.jl b/src/Visualization.jl
index cfaafb34..71f2b527 100644
--- a/src/Visualization.jl
+++ b/src/Visualization.jl
@@ -216,6 +216,14 @@ function Visualization.createvtk(
   )
 end
 
+"""
+    DistributedPvd{T<:AbstractArray}
+
+Handle for a distributed ParaView Data (PVD) collection used to write time-series
+VTK output from a parallel simulation. The actual file is only written by rank 0.
+
+Created by `createpvd(parts, filename)` and closed by `savepvd(pvd)`.
+"""
 struct DistributedPvd{T<:AbstractArray}
   pvds::T
 end
diff --git a/test/AdaptivityCartesianTests.jl b/test/AdaptivityCartesianTests.jl
index b35ae70d..fde21bb1 100644
--- a/test/AdaptivityCartesianTests.jl
+++ b/test/AdaptivityCartesianTests.jl
@@ -11,7 +11,7 @@ using GridapDistributed
 using PartitionedArrays
 
 using GridapDistributed: i_am_in, generate_subparts
-using GridapDistributed: find_local_to_local_map
+using GridapDistributed: local_to_local_map
 using GridapDistributed: DistributedAdaptedDiscreteModel, redistribute, redistribute_cartesian
 using GridapDistributed: RedistributeGlue, redistribute_cell_dofs, redistribute_fe_function, redistribute_free_values
 
diff --git a/test/AdaptivityMultiFieldTests.jl b/test/AdaptivityMultiFieldTests.jl
index 5596d5d9..51c90d2e 100644
--- a/test/AdaptivityMultiFieldTests.jl
+++ b/test/AdaptivityMultiFieldTests.jl
@@ -11,7 +11,7 @@ using GridapDistributed
 using PartitionedArrays
 
 using GridapDistributed: i_am_in, generate_subparts
-using GridapDistributed: find_local_to_local_map
+using GridapDistributed: local_to_local_map
 using GridapDistributed: DistributedAdaptedDiscreteModel
 using GridapDistributed: RedistributeGlue, redistribute_cell_dofs, redistribute_fe_function, redistribute_free_values
 
@@ -58,8 +58,8 @@ function get_redistribute_glue(old_parts,new_parts::DebugArray,old_cell_to_part,
       old_partition = partition(get_cell_gids(model)).items[p]
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> lazy_map(Reindex(global_to_local(old_partition)),gids),gids_snd_by_part)
-      old2new = replace(find_local_to_local_map(old_partition,new_partition), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_partition,old_partition), -1 => 0)
+      old2new = replace(local_to_local_map(old_partition,new_partition), -1 => 0)
+      new2old = replace(local_to_local_map(new_partition,old_partition), -1 => 0)
     else
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> fill(Int32(0),length(gids)),gids_snd_by_part)
@@ -90,8 +90,8 @@ function get_redistribute_glue(old_parts,new_parts::MPIArray,old_cell_to_part,ne
       old_partition = PartitionedArrays.getany(partition(get_cell_gids(model)))
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> lazy_map(Reindex(global_to_local(old_partition)),gids),gids_snd_by_part)
-      old2new = replace(find_local_to_local_map(old_partition,new_partition), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_partition,old_partition), -1 => 0)
+      old2new = replace(local_to_local_map(old_partition,new_partition), -1 => 0)
+      new2old = replace(local_to_local_map(new_partition,old_partition), -1 => 0)
     else
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> fill(Int32(0),length(gids)),gids_snd_by_part)
diff --git a/test/AdaptivityTests.jl b/test/AdaptivityTests.jl
index 3d35263b..3bde688a 100644
--- a/test/AdaptivityTests.jl
+++ b/test/AdaptivityTests.jl
@@ -11,7 +11,7 @@ using GridapDistributed
 using PartitionedArrays
 
 using GridapDistributed: i_am_in, generate_subparts
-using GridapDistributed: find_local_to_local_map
+using GridapDistributed: local_to_local_map
 using GridapDistributed: DistributedAdaptedDiscreteModel
 using GridapDistributed: RedistributeGlue, redistribute_cell_dofs, redistribute_fe_function, redistribute_free_values
 
@@ -58,8 +58,8 @@ function get_redistribute_glue(old_parts,new_parts::DebugArray,old_cell_to_part,
       old_partition = partition(get_cell_gids(model)).items[p]
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> lazy_map(Reindex(global_to_local(old_partition)),gids),gids_snd_by_part)
-      old2new = replace(find_local_to_local_map(old_partition,new_partition), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_partition,old_partition), -1 => 0)
+      old2new = replace(local_to_local_map(old_partition,new_partition), -1 => 0)
+      new2old = replace(local_to_local_map(new_partition,old_partition), -1 => 0)
     else
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> fill(Int32(0),length(gids)),gids_snd_by_part)
@@ -90,8 +90,8 @@ function get_redistribute_glue(old_parts,new_parts::MPIArray,old_cell_to_part,ne
       old_partition = PartitionedArrays.getany(partition(get_cell_gids(model)))
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> lazy_map(Reindex(global_to_local(old_partition)),gids),gids_snd_by_part)
-      old2new = replace(find_local_to_local_map(old_partition,new_partition), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_partition,old_partition), -1 => 0)
+      old2new = replace(local_to_local_map(old_partition,new_partition), -1 => 0)
+      new2old = replace(local_to_local_map(new_partition,old_partition), -1 => 0)
     else
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> fill(Int32(0),length(gids)),gids_snd_by_part)
diff --git a/test/AutodiffTests.jl b/test/AutodiffTests.jl
index f86a79d8..ab61fe6c 100644
--- a/test/AutodiffTests.jl
+++ b/test/AutodiffTests.jl
@@ -7,6 +7,32 @@ using PartitionedArrays
 using SparseArrays
 using ForwardDiff
 
+function same_matrix(A,B;debug=false)
+  same_own = map(≈, own_own_values(A), own_own_values(B))
+  same_ghost = map(≈, own_ghost_values(A), own_ghost_values(B))
+
+  if debug
+    so = reduce(&, same_own)
+    println("Same own values: ", so)
+    if !so 
+      display(same_own)
+      map(own_own_values(A),own_own_values(B)) do a, b
+        display(a - b)
+      end
+    end
+    sg = reduce(&, same_ghost)
+    println("Same ghost values: ", sg)
+    if !sg
+      display(same_ghost)
+      map(own_ghost_values(A),own_ghost_values(B)) do a, b
+        display(a - b)
+      end
+    end
+  end
+
+  return reduce(&, same_own) && reduce(&, same_ghost)
+end
+
 function main_sf(distribute,parts)
   ranks = distribute(LinearIndices((prod(parts),)))
 
@@ -35,7 +61,7 @@ function main_sf(distribute,parts)
   uh = interpolate(1.0,U)
   A = jacobian(op,uh)
   A_AD = jacobian(op_AD,uh)
-  @test reduce(&,map(≈,partition(A),partition(A_AD)))
+  @test same_matrix(A,A_AD)
 
   g(v) = ∫(0.5*v⋅v)dΩ
   dg(v) = ∫(uh⋅v)dΩ
@@ -95,7 +121,7 @@ function main_mf(distribute,parts)
   uh, ph = xh
   A = jacobian(op,xh)
   A_AD = jacobian(op_AD,xh)
-  @test reduce(&,map(≈,partition(A),partition(A_AD)))
+  @test same_matrix(A,A_AD)
 
   g((v,q)) = ∫(0.5*v⋅v + 0.5*q*q)dΩ
   dg((v,q)) = ∫(uh⋅v + ph*q)dΩ
@@ -166,7 +192,7 @@ function mf_different_fespace_trians(distribute,parts)
     op = FEOperator(f2,f2_jac,X,X)
     J_fwd = jacobian(op,uh)
 
-    @test reduce(&,map(≈,partition(J),partition(J_fwd)))
+    @test same_matrix(J,J_fwd)
   end
 end
 
@@ -202,7 +228,7 @@ function skeleton_mf_different_fespace_trians(distribute,parts)
     op = FEOperator(f2,f2_jac,X,X)
     J_fwd = jacobian(op,uh)
 
-    @test reduce(&,map(≈,partition(J),partition(J_fwd)))
+    @test same_matrix(J,J_fwd)
   end
 end
 
diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl
index ec7bd60e..826b1195 100644
--- a/test/BlockPartitionedArraysTests.jl
+++ b/test/BlockPartitionedArraysTests.jl
@@ -77,4 +77,4 @@ function main(distribute, parts)
   @test norm(v4) ≈ 6.0 * sqrt(2*n_global)
 end
 
-end # module
+end
\ No newline at end of file
diff --git a/test/BlockSparseMatrixAssemblersTests.jl b/test/BlockSparseMatrixAssemblersTests.jl
index 13eb3d7e..54ce34d2 100644
--- a/test/BlockSparseMatrixAssemblersTests.jl
+++ b/test/BlockSparseMatrixAssemblersTests.jl
@@ -52,7 +52,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   matdata = collect_cell_matrix(X,Y,biform(u,v))
   vecdata = collect_cell_vector(Y,liform(v))  
 
-  assem = SparseMatrixAssembler(X,Y,FullyAssembledRows())
+  assem = SparseMatrixAssembler(X,Y,LocallyAssembled())
   A1 = assemble_matrix(assem,matdata)
   b1 = assemble_vector(assem,vecdata)
   A2,b2 = assemble_matrix_and_vector(assem,data);
@@ -68,7 +68,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   bmatdata = collect_cell_matrix(Xb,Yb,biform(ub,vb))
   bvecdata = collect_cell_vector(Yb,liform(vb))
 
-  assem_blocks = SparseMatrixAssembler(Xb,Yb,FullyAssembledRows())
+  assem_blocks = SparseMatrixAssembler(Xb,Yb,LocallyAssembled())
   A1_blocks = assemble_matrix(assem_blocks,bmatdata);
   b1_blocks = assemble_vector(assem_blocks,bvecdata);
   @test is_same_vector(b1_blocks,b1,Yb,Y)
diff --git a/test/FESpacesTests.jl b/test/FESpacesTests.jl
index 76cf57cb..627f5512 100644
--- a/test/FESpacesTests.jl
+++ b/test/FESpacesTests.jl
@@ -9,6 +9,44 @@ using Test
 
 u((x,y)) = x+y
 
+function gid_generation_tests(distribute,parts)
+  ranks = distribute(LinearIndices((prod(parts),)))
+
+  model = CartesianDiscreteModel(ranks,parts,(0,1,0,1),(8,4))
+  reffe = ReferenceFE(lagrangian,Float64,1)
+  cgids = GridapDistributed.get_cell_gids(model)
+
+  V0 = FESpace(model,reffe)
+  I0 = partition(get_free_dof_ids(V0))
+
+  V1 = FESpace(model,reffe;dirichlet_tags=["tag_1","tag_2","tag_3","tag_4","tag_5","tag_6"])
+  I1 = partition(get_free_dof_ids(V1))
+
+  fgids_1, dgids_1 = GridapDistributed.generate_posneg_gids(
+    cgids,get_cell_dof_ids(V1),map(num_free_dofs,local_views(V1)),map(num_dirichlet_dofs,local_views(V1))
+  )
+
+  # 1 -> free, 2 -> dirichlet
+  lid_to_color = map(local_views(V0),local_views(V1)) do V0, V1
+    lid_to_color = zeros(Int16,num_free_dofs(V0))
+    for (I,J) in zip(get_cell_dof_ids(V0),get_cell_dof_ids(V1))
+      lid_to_color[I] .= map(j -> ifelse(j > 0, 1, 2), J)
+    end
+    return lid_to_color
+  end
+
+  fgids_2, dgids_2 = GridapDistributed.split_gids_by_color(
+    get_free_dof_ids(V0), lid_to_color
+  )
+
+  (fgids_3, dgids_3), _, _ = GridapDistributed.generate_gids_by_color(
+    cgids, get_cell_dof_ids(V0), lid_to_color
+  )
+
+  @test fgids_1 == fgids_2 == fgids_3
+  @test dgids_1 == dgids_2 == dgids_3
+end
+
 function assemble_tests(das,dΩ,dΩass,U,V)
   # Assembly
   dv = get_fe_basis(V)
@@ -71,8 +109,8 @@ function assemble_tests(das,dΩ,dΩass,U,V)
 end
 
 function main(distribute,parts)
-  main(distribute,parts,SubAssembledRows())
-  main(distribute,parts,FullyAssembledRows())
+  main(distribute,parts,Assembled())
+  main(distribute,parts,LocallyAssembled())
 end
 
 function main(distribute,parts,das)
diff --git a/test/HeatEquationTests.jl b/test/HeatEquationTests.jl
index fa96dc11..89077ecb 100644
--- a/test/HeatEquationTests.jl
+++ b/test/HeatEquationTests.jl
@@ -37,7 +37,7 @@ function main(distribute,parts)
 
   op = TransientFEOperator(res,jac,jac_t,U,V0)
 
-  assembler = SparseMatrixAssembler(U,V0,SubAssembledRows())
+  assembler = SparseMatrixAssembler(U,V0,Assembled())
   op_constant = TransientLinearFEOperator(
     (a,m),b,U,V0,constant_forms=(true,true),assembler=assembler
   )
diff --git a/test/PLaplacianTests.jl b/test/PLaplacianTests.jl
index a20a2683..bf501359 100644
--- a/test/PLaplacianTests.jl
+++ b/test/PLaplacianTests.jl
@@ -8,8 +8,8 @@ using Test
 using SparseArrays
 
 function main(distribute,parts)
-  main(distribute,parts,FullyAssembledRows(),SparseMatrixCSR{0,Float64,Int},false)
-  main(distribute,parts,SubAssembledRows(),SparseMatrixCSC{Float64,Int},true)
+  main(distribute,parts,LocallyAssembled(),SparseMatrixCSR{0,Float64,Int},false)
+  main(distribute,parts,Assembled(),SparseMatrixCSC{Float64,Int},true)
 end
 
 function main(distribute,parts,strategy,local_matrix_type,autodiff)
diff --git a/test/PeriodicBCsTests.jl b/test/PeriodicBCsTests.jl
index 6177752d..247cb6fc 100644
--- a/test/PeriodicBCsTests.jl
+++ b/test/PeriodicBCsTests.jl
@@ -35,4 +35,6 @@ function main(distribute,parts)
 
 end
 
+main(DebugArray,(2,2))
+
 end # module
diff --git a/test/PullbackTests.jl b/test/PullbackTests.jl
index 3c12f37d..3e970819 100644
--- a/test/PullbackTests.jl
+++ b/test/PullbackTests.jl
@@ -116,7 +116,7 @@ function main(distribute,nranks)
 
   model = GridapDistributed.DistributedDiscreteModel(models,gids)
 
-  das = FullyAssembledRows()
+  das = LocallyAssembled()
   trian = Triangulation(das,model)
 
   reffe = ReferenceFE(raviart_thomas,Float64,0)
diff --git a/test/mpi/AdaptivityCartesianTests.jl b/test/mpi/AdaptivityCartesianTests.jl
new file mode 100644
index 00000000..6d37f685
--- /dev/null
+++ b/test/mpi/AdaptivityCartesianTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AdaptivityCartesianTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/AdaptivityMultiFieldTests.jl b/test/mpi/AdaptivityMultiFieldTests.jl
new file mode 100644
index 00000000..88e55359
--- /dev/null
+++ b/test/mpi/AdaptivityMultiFieldTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AdaptivityMultiFieldTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/AdaptivityTests.jl b/test/mpi/AdaptivityTests.jl
new file mode 100644
index 00000000..89a8c60f
--- /dev/null
+++ b/test/mpi/AdaptivityTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AdaptivityTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/AdaptivityUnstructuredTests.jl b/test/mpi/AdaptivityUnstructuredTests.jl
new file mode 100644
index 00000000..8c453c0b
--- /dev/null
+++ b/test/mpi/AdaptivityUnstructuredTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AdaptivityUnstructuredTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/AutodiffTests.jl b/test/mpi/AutodiffTests.jl
new file mode 100644
index 00000000..ecd6c67a
--- /dev/null
+++ b/test/mpi/AutodiffTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AutodiffTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/BlockPartitionedArraysTests.jl b/test/mpi/BlockPartitionedArraysTests.jl
new file mode 100644
index 00000000..22baf713
--- /dev/null
+++ b/test/mpi/BlockPartitionedArraysTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "BlockPartitionedArraysTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/BlockSparseMatrixAssemblersTests.jl b/test/mpi/BlockSparseMatrixAssemblersTests.jl
new file mode 100644
index 00000000..309c8056
--- /dev/null
+++ b/test/mpi/BlockSparseMatrixAssemblersTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "BlockSparseMatrixAssemblersTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/CellDataTests.jl b/test/mpi/CellDataTests.jl
new file mode 100644
index 00000000..415466fa
--- /dev/null
+++ b/test/mpi/CellDataTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "CellDataTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/ConstantFESpacesTests.jl b/test/mpi/ConstantFESpacesTests.jl
new file mode 100644
index 00000000..68be3c3a
--- /dev/null
+++ b/test/mpi/ConstantFESpacesTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "ConstantFESpacesTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/FESpacesTests.jl b/test/mpi/FESpacesTests.jl
new file mode 100644
index 00000000..de718e04
--- /dev/null
+++ b/test/mpi/FESpacesTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "FESpacesTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/GeometryTests.jl b/test/mpi/GeometryTests.jl
new file mode 100644
index 00000000..fc2db280
--- /dev/null
+++ b/test/mpi/GeometryTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "GeometryTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/HcurlProjectionTests.jl b/test/mpi/HcurlProjectionTests.jl
new file mode 100644
index 00000000..323ffa14
--- /dev/null
+++ b/test/mpi/HcurlProjectionTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "HcurlProjectionTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/HeatEquationTests.jl b/test/mpi/HeatEquationTests.jl
new file mode 100644
index 00000000..417de562
--- /dev/null
+++ b/test/mpi/HeatEquationTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "HeatEquationTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/MacroDiscreteModelsTests.jl b/test/mpi/MacroDiscreteModelsTests.jl
new file mode 100644
index 00000000..c3a7e0e0
--- /dev/null
+++ b/test/mpi/MacroDiscreteModelsTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "MacroDiscreteModelsTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/MultiFieldTests.jl b/test/mpi/MultiFieldTests.jl
new file mode 100644
index 00000000..c9dc8241
--- /dev/null
+++ b/test/mpi/MultiFieldTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "MultiFieldTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/PLaplacianTests.jl b/test/mpi/PLaplacianTests.jl
new file mode 100644
index 00000000..fa361328
--- /dev/null
+++ b/test/mpi/PLaplacianTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "PLaplacianTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/PeriodicBCsTests.jl b/test/mpi/PeriodicBCsTests.jl
new file mode 100644
index 00000000..6c6d2ed8
--- /dev/null
+++ b/test/mpi/PeriodicBCsTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "PeriodicBCsTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/PoissonTests.jl b/test/mpi/PoissonTests.jl
new file mode 100644
index 00000000..e8da2e38
--- /dev/null
+++ b/test/mpi/PoissonTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "PoissonTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/PolytopalCoarseningTests.jl b/test/mpi/PolytopalCoarseningTests.jl
new file mode 100644
index 00000000..8eef0370
--- /dev/null
+++ b/test/mpi/PolytopalCoarseningTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "PolytopalCoarseningTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/StokesHdivDGTests.jl b/test/mpi/StokesHdivDGTests.jl
new file mode 100644
index 00000000..be60561f
--- /dev/null
+++ b/test/mpi/StokesHdivDGTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "StokesHdivDGTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/StokesOpenBoundaryTests.jl b/test/mpi/StokesOpenBoundaryTests.jl
new file mode 100644
index 00000000..6f6a4f41
--- /dev/null
+++ b/test/mpi/StokesOpenBoundaryTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "StokesOpenBoundaryTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/SurfaceCouplingTests.jl b/test/mpi/SurfaceCouplingTests.jl
new file mode 100644
index 00000000..c6ab8156
--- /dev/null
+++ b/test/mpi/SurfaceCouplingTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "SurfaceCouplingTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/TransientDistributedCellFieldTests.jl b/test/mpi/TransientDistributedCellFieldTests.jl
new file mode 100644
index 00000000..ef96b0d8
--- /dev/null
+++ b/test/mpi/TransientDistributedCellFieldTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "TransientDistributedCellFieldTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/TransientMultiFieldDistributedCellFieldTests.jl b/test/mpi/TransientMultiFieldDistributedCellFieldTests.jl
new file mode 100644
index 00000000..3c780107
--- /dev/null
+++ b/test/mpi/TransientMultiFieldDistributedCellFieldTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "TransientMultiFieldDistributedCellFieldTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/VisualizationTests.jl b/test/mpi/VisualizationTests.jl
new file mode 100644
index 00000000..f97402cd
--- /dev/null
+++ b/test/mpi/VisualizationTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "VisualizationTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/ZeroMeanFESpacesTests.jl b/test/mpi/ZeroMeanFESpacesTests.jl
new file mode 100644
index 00000000..c0bf280a
--- /dev/null
+++ b/test/mpi/ZeroMeanFESpacesTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "ZeroMeanFESpacesTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/drivers/AdaptivityCartesianTests.jl b/test/mpi/drivers/AdaptivityCartesianTests.jl
new file mode 100644
index 00000000..50193fd2
--- /dev/null
+++ b/test/mpi/drivers/AdaptivityCartesianTests.jl
@@ -0,0 +1,14 @@
+module AdaptivityCartesianMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AdaptivityCartesianTests.jl"))
+
+with_mpi() do distribute
+  AdaptivityCartesianTests.main(distribute)
+end
+
+end # module
diff --git a/test/mpi/drivers/AdaptivityMultiFieldTests.jl b/test/mpi/drivers/AdaptivityMultiFieldTests.jl
new file mode 100644
index 00000000..98d4a0fc
--- /dev/null
+++ b/test/mpi/drivers/AdaptivityMultiFieldTests.jl
@@ -0,0 +1,14 @@
+module AdaptivityMultiFieldMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AdaptivityMultiFieldTests.jl"))
+
+with_mpi() do distribute
+  AdaptivityMultiFieldTests.main(distribute)
+end
+
+end # module
diff --git a/test/mpi/drivers/AdaptivityTests.jl b/test/mpi/drivers/AdaptivityTests.jl
new file mode 100644
index 00000000..795d40d1
--- /dev/null
+++ b/test/mpi/drivers/AdaptivityTests.jl
@@ -0,0 +1,14 @@
+module AdaptivityMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AdaptivityTests.jl"))
+
+with_mpi() do distribute
+  AdaptivityTests.main(distribute)
+end
+
+end # module
diff --git a/test/mpi/drivers/AdaptivityUnstructuredTests.jl b/test/mpi/drivers/AdaptivityUnstructuredTests.jl
new file mode 100644
index 00000000..f9cd1c05
--- /dev/null
+++ b/test/mpi/drivers/AdaptivityUnstructuredTests.jl
@@ -0,0 +1,14 @@
+module AdaptivityUnstructuredMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AdaptivityUnstructuredTests.jl"))
+
+with_mpi() do distribute
+  AdaptivityUnstructuredTests.main(distribute)
+end
+
+end # module
diff --git a/test/mpi/drivers/AutodiffTests.jl b/test/mpi/drivers/AutodiffTests.jl
new file mode 100644
index 00000000..81404068
--- /dev/null
+++ b/test/mpi/drivers/AutodiffTests.jl
@@ -0,0 +1,16 @@
+module AutodiffMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AutodiffTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  AutodiffTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/BlockPartitionedArraysTests.jl b/test/mpi/drivers/BlockPartitionedArraysTests.jl
new file mode 100644
index 00000000..5cccec53
--- /dev/null
+++ b/test/mpi/drivers/BlockPartitionedArraysTests.jl
@@ -0,0 +1,16 @@
+module BlockPartitionedArraysMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "BlockPartitionedArraysTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  BlockPartitionedArraysTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl b/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl
new file mode 100644
index 00000000..ba27fa3b
--- /dev/null
+++ b/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl
@@ -0,0 +1,16 @@
+module BlockSparseMatrixAssemblersMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "BlockSparseMatrixAssemblersTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  BlockSparseMatrixAssemblersTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/CellDataTests.jl b/test/mpi/drivers/CellDataTests.jl
new file mode 100644
index 00000000..d01e7c24
--- /dev/null
+++ b/test/mpi/drivers/CellDataTests.jl
@@ -0,0 +1,16 @@
+module CellDataMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "CellDataTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  CellDataTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/ConstantFESpacesTests.jl b/test/mpi/drivers/ConstantFESpacesTests.jl
new file mode 100644
index 00000000..4166326c
--- /dev/null
+++ b/test/mpi/drivers/ConstantFESpacesTests.jl
@@ -0,0 +1,14 @@
+module ConstantFESpacesMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "ConstantFESpacesTests.jl"))
+
+with_mpi() do distribute
+  ConstantFESpacesTests.main(distribute, (2, 2))
+end
+
+end # module
diff --git a/test/mpi/drivers/FESpacesTests.jl b/test/mpi/drivers/FESpacesTests.jl
new file mode 100644
index 00000000..e3caf521
--- /dev/null
+++ b/test/mpi/drivers/FESpacesTests.jl
@@ -0,0 +1,16 @@
+module FESpacesMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "FESpacesTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  FESpacesTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/GeometryTests.jl b/test/mpi/drivers/GeometryTests.jl
new file mode 100644
index 00000000..2cac1822
--- /dev/null
+++ b/test/mpi/drivers/GeometryTests.jl
@@ -0,0 +1,16 @@
+module GeometryMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "GeometryTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  GeometryTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/HcurlProjectionTests.jl b/test/mpi/drivers/HcurlProjectionTests.jl
new file mode 100644
index 00000000..d580f1f1
--- /dev/null
+++ b/test/mpi/drivers/HcurlProjectionTests.jl
@@ -0,0 +1,16 @@
+module HcurlProjectionMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "HcurlProjectionTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  HcurlProjectionTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/HeatEquationTests.jl b/test/mpi/drivers/HeatEquationTests.jl
new file mode 100644
index 00000000..dfb31c8c
--- /dev/null
+++ b/test/mpi/drivers/HeatEquationTests.jl
@@ -0,0 +1,16 @@
+module HeatEquationMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "HeatEquationTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  HeatEquationTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/MacroDiscreteModelsTests.jl b/test/mpi/drivers/MacroDiscreteModelsTests.jl
new file mode 100644
index 00000000..543feb1c
--- /dev/null
+++ b/test/mpi/drivers/MacroDiscreteModelsTests.jl
@@ -0,0 +1,14 @@
+module MacroDiscreteModelsMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "MacroDiscreteModelsTests.jl"))
+
+with_mpi() do distribute
+  MacroDiscreteModelsTests.main(distribute, (2, 2))
+end
+
+end # module
diff --git a/test/mpi/drivers/MultiFieldTests.jl b/test/mpi/drivers/MultiFieldTests.jl
new file mode 100644
index 00000000..12a49fb3
--- /dev/null
+++ b/test/mpi/drivers/MultiFieldTests.jl
@@ -0,0 +1,16 @@
+module MultiFieldMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "MultiFieldTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  MultiFieldTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/PLaplacianTests.jl b/test/mpi/drivers/PLaplacianTests.jl
new file mode 100644
index 00000000..bf5ec3f0
--- /dev/null
+++ b/test/mpi/drivers/PLaplacianTests.jl
@@ -0,0 +1,16 @@
+module PLaplacianMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "PLaplacianTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  PLaplacianTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/PeriodicBCsTests.jl b/test/mpi/drivers/PeriodicBCsTests.jl
new file mode 100644
index 00000000..8221b3df
--- /dev/null
+++ b/test/mpi/drivers/PeriodicBCsTests.jl
@@ -0,0 +1,16 @@
+module PeriodicBCsMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "PeriodicBCsTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  PeriodicBCsTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/PoissonTests.jl b/test/mpi/drivers/PoissonTests.jl
new file mode 100644
index 00000000..f644a4d4
--- /dev/null
+++ b/test/mpi/drivers/PoissonTests.jl
@@ -0,0 +1,16 @@
+module PoissonMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "PoissonTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  PoissonTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/PolytopalCoarseningTests.jl b/test/mpi/drivers/PolytopalCoarseningTests.jl
new file mode 100644
index 00000000..97bd6163
--- /dev/null
+++ b/test/mpi/drivers/PolytopalCoarseningTests.jl
@@ -0,0 +1,14 @@
+module PolytopalCoarseningMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "PolytopalCoarseningTests.jl"))
+
+with_mpi() do distribute
+  PolytopalCoarseningTests.main(distribute, (2, 2))
+end
+
+end # module
diff --git a/test/mpi/drivers/StokesHdivDGTests.jl b/test/mpi/drivers/StokesHdivDGTests.jl
new file mode 100644
index 00000000..687e16af
--- /dev/null
+++ b/test/mpi/drivers/StokesHdivDGTests.jl
@@ -0,0 +1,16 @@
+module StokesHdivDGMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "StokesHdivDGTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  StokesHdivDGTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/StokesOpenBoundaryTests.jl b/test/mpi/drivers/StokesOpenBoundaryTests.jl
new file mode 100644
index 00000000..d1c9d707
--- /dev/null
+++ b/test/mpi/drivers/StokesOpenBoundaryTests.jl
@@ -0,0 +1,16 @@
+module StokesOpenBoundaryMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "StokesOpenBoundaryTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  StokesOpenBoundaryTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/SurfaceCouplingTests.jl b/test/mpi/drivers/SurfaceCouplingTests.jl
new file mode 100644
index 00000000..b79ce487
--- /dev/null
+++ b/test/mpi/drivers/SurfaceCouplingTests.jl
@@ -0,0 +1,16 @@
+module SurfaceCouplingMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "SurfaceCouplingTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  SurfaceCouplingTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/TransientDistributedCellFieldTests.jl b/test/mpi/drivers/TransientDistributedCellFieldTests.jl
new file mode 100644
index 00000000..e28fa5fe
--- /dev/null
+++ b/test/mpi/drivers/TransientDistributedCellFieldTests.jl
@@ -0,0 +1,16 @@
+module TransientDistributedCellFieldMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "TransientDistributedCellFieldTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  TransientDistributedCellFieldTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl b/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl
new file mode 100644
index 00000000..ffa2f2bd
--- /dev/null
+++ b/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl
@@ -0,0 +1,16 @@
+module TransientMultiFieldDistributedCellFieldMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "TransientMultiFieldDistributedCellFieldTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  TransientMultiFieldDistributedCellFieldTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/VisualizationTests.jl b/test/mpi/drivers/VisualizationTests.jl
new file mode 100644
index 00000000..49bf425c
--- /dev/null
+++ b/test/mpi/drivers/VisualizationTests.jl
@@ -0,0 +1,14 @@
+module VisualizationMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "VisualizationTests.jl"))
+
+with_mpi() do distribute
+  VisualizationTests.main(distribute, (2, 2))
+end
+
+end # module
diff --git a/test/mpi/drivers/ZeroMeanFESpacesTests.jl b/test/mpi/drivers/ZeroMeanFESpacesTests.jl
new file mode 100644
index 00000000..81ccf552
--- /dev/null
+++ b/test/mpi/drivers/ZeroMeanFESpacesTests.jl
@@ -0,0 +1,16 @@
+module ZeroMeanFESpacesMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "ZeroMeanFESpacesTests.jl"))
+
+with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
+  ZeroMeanFESpacesTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/run_mpi_driver.jl b/test/mpi/run_mpi_driver.jl
new file mode 100644
index 00000000..34ba6ff9
--- /dev/null
+++ b/test/mpi/run_mpi_driver.jl
@@ -0,0 +1,12 @@
+using MPI
+using Test
+function run_mpi_driver(file; procs)
+  repodir = normpath(joinpath(@__DIR__, "..", ".."))
+  cmd = mpiexec()
+  if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
+    run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --project=$repodir $(file)`)
+  else
+    run(`$cmd -n $procs $(Base.julia_cmd()) --project=$repodir $(file)`)
+  end
+  @test true
+end
diff --git a/test/mpi/runtests.jl b/test/mpi/runtests.jl
index 80f1cca4..7c97eb9f 100644
--- a/test/mpi/runtests.jl
+++ b/test/mpi/runtests.jl
@@ -1,24 +1,59 @@
 module MPITests
 
-using MPI
 using Test
+using MPI
+
+TESTCASE = get(ENV, "TESTCASE", "all")
+@info "Running MPI tests with TESTCASE=$TESTCASE"
+
+if TESTCASE ∈ ("all", "mpi-geometry")
+  @time @testset "Geometry" begin include("GeometryTests.jl") end
+  @time @testset "CellData" begin include("CellDataTests.jl") end
+end
+
+if TESTCASE ∈ ("all", "mpi-fespaces")
+  @time @testset "FESpaces"         begin include("FESpacesTests.jl") end
+  @time @testset "MultiField"       begin include("MultiFieldTests.jl") end
+  @time @testset "ZeroMeanFESpaces" begin include("ZeroMeanFESpacesTests.jl") end
+  @time @testset "PeriodicBCs"      begin include("PeriodicBCsTests.jl") end
+  @time @testset "ConstantFESpaces" begin include("ConstantFESpacesTests.jl") end
+end
 
-mpidir  = @__DIR__
-testdir = joinpath(mpidir, "..")
-repodir = joinpath(testdir, "..")
-
-function run_driver(procs, file)
-  mpiexec() do cmd
-    if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
-      run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --code-coverage=user --project=$repodir $(joinpath(mpidir, file))`)
-    else
-      run(`$cmd -n $procs $(Base.julia_cmd()) --code-coverage=user --project=$repodir $(joinpath(mpidir, file))`)
-    end
-    @test true
+if TESTCASE ∈ ("all", "mpi-physics")
+  @time @testset "Poisson"            begin include("PoissonTests.jl") end
+  @time @testset "PLaplacian"         begin include("PLaplacianTests.jl") end
+  @time @testset "SurfaceCoupling"    begin include("SurfaceCouplingTests.jl") end
+  @time @testset "StokesOpenBoundary" begin include("StokesOpenBoundaryTests.jl") end
+  @time @testset "StokesHdivDG"       begin include("StokesHdivDGTests.jl") end
+  @time @testset "HcurlProjection"    begin include("HcurlProjectionTests.jl") end
+end
+
+if TESTCASE ∈ ("all", "mpi-transient")
+  @time @testset "TransientDistributedCellField" begin
+    include("TransientDistributedCellFieldTests.jl")
   end
+  @time @testset "TransientMultiFieldDistributedCellField" begin
+    include("TransientMultiFieldDistributedCellFieldTests.jl")
+  end
+  @time @testset "HeatEquation" begin include("HeatEquationTests.jl") end
 end
 
-run_driver(4, "runtests_np4.jl")
-run_driver(1, "runtests_np4.jl") # Check that the degenerated case works
+if TESTCASE ∈ ("all", "mpi-adaptivity")
+  @time @testset "AdaptivityTests"             begin include("AdaptivityTests.jl") end
+  @time @testset "AdaptivityCartesianTests"    begin include("AdaptivityCartesianTests.jl") end
+  @time @testset "AdaptivityMultiFieldTests"   begin include("AdaptivityMultiFieldTests.jl") end
+  @time @testset "AdaptivityUnstructuredTests" begin include("AdaptivityUnstructuredTests.jl") end
+  @time @testset "PolytopalCoarsening"         begin include("PolytopalCoarseningTests.jl") end
+end
+
+if TESTCASE ∈ ("all", "mpi-misc")
+  @time @testset "BlockSparseMatrixAssemblers" begin
+    include("BlockSparseMatrixAssemblersTests.jl")
+  end
+  @time @testset "BlockPartitionedArrays" begin include("BlockPartitionedArraysTests.jl") end
+  @time @testset "Visualization"          begin include("VisualizationTests.jl") end
+  @time @testset "Autodiff"               begin include("AutodiffTests.jl") end
+  @time @testset "MacroDiscreteModels"    begin include("MacroDiscreteModelsTests.jl") end
+end
 
 end # module
diff --git a/test/mpi/runtests_np4.jl b/test/mpi/runtests_np4.jl
deleted file mode 100644
index d2946152..00000000
--- a/test/mpi/runtests_np4.jl
+++ /dev/null
@@ -1,54 +0,0 @@
-module NP4
-# All tests running on either 1 or 4 procs here
-
-using GridapDistributed
-using PartitionedArrays
-const PArrays = PartitionedArrays
-using MPI
-
-if !MPI.Initialized()
-  MPI.Init()
-end
-
-include("../CellDataTests.jl")
-include("../FESpacesTests.jl")
-include("../GeometryTests.jl")
-include("../MultiFieldTests.jl")
-include("../PLaplacianTests.jl")
-include("../PoissonTests.jl")
-include("../PeriodicBCsTests.jl")
-include("../SurfaceCouplingTests.jl")
-include("../TransientDistributedCellFieldTests.jl")
-include("../TransientMultiFieldDistributedCellFieldTests.jl")
-include("../ZeroMeanFESpacesTests.jl")
-include("../HeatEquationTests.jl")
-include("../StokesOpenBoundaryTests.jl")
-include("../StokesHdivDGTests.jl")
-include("../AdaptivityTests.jl")
-include("../AdaptivityCartesianTests.jl")
-include("../AdaptivityUnstructuredTests.jl")
-include("../AdaptivityMultiFieldTests.jl")
-include("../PolytopalCoarseningTests.jl")
-include("../BlockSparseMatrixAssemblersTests.jl")
-include("../BlockPartitionedArraysTests.jl")
-include("../VisualizationTests.jl")
-include("../AutodiffTests.jl")
-include("../ConstantFESpacesTests.jl")
-include("../MacroDiscreteModelsTests.jl")
-include("../HcurlProjectionTests.jl")
-
-include("runtests_np4_body.jl")
-
-if MPI.Comm_size(MPI.COMM_WORLD) == 4
-  with_mpi() do distribute
-    all_tests(distribute, (2,2))
-  end
-elseif MPI.Comm_size(MPI.COMM_WORLD) == 1
-  with_mpi() do distribute
-    all_tests(distribute, (1,1))
-  end
-else
-  MPI.Abort(MPI.COMM_WORLD, 0)
-end
-
-end #module
diff --git a/test/mpi/runtests_np4_body.jl b/test/mpi/runtests_np4_body.jl
deleted file mode 100644
index cc3753ed..00000000
--- a/test/mpi/runtests_np4_body.jl
+++ /dev/null
@@ -1,99 +0,0 @@
-function all_tests(distribute, parts)
-  TESTCASE = get(ENV, "TESTCASE", "all")
-
-  ranks = distribute(LinearIndices((prod(parts),)))
-
-  t = PArrays.PTimer(ranks, verbose=true)
-  PArrays.tic!(t)
-
-  if TESTCASE ∈ ("all", "mpi-geometry")
-    GeometryTests.main(distribute, parts)
-    PArrays.toc!(t, "Geometry")
-
-    CellDataTests.main(distribute, parts)
-    PArrays.toc!(t, "CellData")
-  end
-
-  if TESTCASE ∈ ("all", "mpi-fespaces")
-    FESpacesTests.main(distribute, parts)
-    PArrays.toc!(t, "FESpaces")
-
-    MultiFieldTests.main(distribute, parts)
-    PArrays.toc!(t, "MultiField")
-
-    ZeroMeanFESpacesTests.main(distribute, parts)
-    PArrays.toc!(t, "ZeroMeanFESpaces")
-
-    PeriodicBCsTests.main(distribute, parts)
-    PArrays.toc!(t, "PeriodicBCs")
-
-    if prod(parts) == 4
-      ConstantFESpacesTests.main(distribute, parts)
-      PArrays.toc!(t, "ConstantFESpaces")
-    end
-  end
-
-  if TESTCASE ∈ ("all", "mpi-physics")
-    PoissonTests.main(distribute, parts)
-    PArrays.toc!(t, "Poisson")
-
-    PLaplacianTests.main(distribute, parts)
-    PArrays.toc!(t, "PLaplacian")
-
-    SurfaceCouplingTests.main(distribute, parts)
-    PArrays.toc!(t, "SurfaceCoupling")
-
-    StokesOpenBoundaryTests.main(distribute, parts)
-    PArrays.toc!(t, "StokesOpenBoundary")
-
-    StokesHdivDGTests.main(distribute, parts)
-    PArrays.toc!(t, "StokesHdivDG")
-
-    HcurlProjectionTests.main(distribute, parts)
-    PArrays.toc!(t, "HcurlProjection")
-    
-  end
-
-  if TESTCASE ∈ ("all", "mpi-transient")
-    TransientDistributedCellFieldTests.main(distribute, parts)
-    PArrays.toc!(t, "TransientDistributedCellField")
-
-    TransientMultiFieldDistributedCellFieldTests.main(distribute, parts)
-    PArrays.toc!(t, "TransientMultiFieldDistributedCellField")
-
-    HeatEquationTests.main(distribute, parts)
-    PArrays.toc!(t, "HeatEquation")
-  end
-
-  if TESTCASE ∈ ("all", "mpi-adaptivity")
-    if prod(parts) == 4
-      AdaptivityTests.main(distribute)
-      AdaptivityCartesianTests.main(distribute)
-      AdaptivityMultiFieldTests.main(distribute)
-      AdaptivityUnstructuredTests.main(distribute)
-      PolytopalCoarseningTests.main(distribute, parts)
-      PArrays.toc!(t, "Adaptivity")
-    end
-  end
-
-  if TESTCASE ∈ ("all", "mpi-misc")
-    BlockSparseMatrixAssemblersTests.main(distribute, parts)
-    BlockPartitionedArraysTests.main(distribute, parts)
-    PArrays.toc!(t, "BlockSparseMatrixAssemblers")
-
-    if prod(parts) == 4
-      VisualizationTests.main(distribute, parts)
-      PArrays.toc!(t, "Visualization")
-    end
-
-    AutodiffTests.main(distribute, parts)
-    PArrays.toc!(t, "Autodiff")
-
-    if prod(parts) == 4
-      MacroDiscreteModelsTests.main(distribute, parts)
-      PArrays.toc!(t, "MacroDiscreteModels")
-    end
-  end
-
-  isempty(t.timings) || display(t)
-end
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
new file mode 100644
index 00000000..ec106730
--- /dev/null
+++ b/test/parrays_update.jl
@@ -0,0 +1,64 @@
+
+using Gridap
+using PartitionedArrays
+using GridapDistributed
+
+using Gridap.FESpaces, Gridap.Algebra
+
+np = (1,2)
+ranks = with_debug() do distribute
+  distribute(LinearIndices((prod(np),)))
+end
+
+nc = (2,4)
+serial_model = CartesianDiscreteModel((0,1,0,1),nc)
+dist_model = CartesianDiscreteModel(ranks,np,(0,1,0,1),nc)
+
+cids = get_cell_gids(dist_model)
+
+reffe = ReferenceFE(lagrangian,Float64,1)
+serial_V = TestFESpace(serial_model,reffe)
+dist_V = TestFESpace(dist_model,reffe)
+
+serial_ids = get_free_dof_ids(serial_V)
+dist_ids = get_free_dof_ids(dist_V)
+
+dof_ids = get_free_dof_ids(dist_V)
+
+serial_Ω = Triangulation(serial_model)
+serial_dΩ = Measure(serial_Ω,2)
+
+dist_Ω = Triangulation(dist_model)
+dist_dΩ = Measure(dist_Ω,2)
+
+dist_Ωg = Triangulation(GridapDistributed.with_ghost,dist_model)
+dist_dΩg = Measure(dist_Ωg,2)
+
+serial_a(u,v) = ∫(u⋅v)*serial_dΩ
+dist_a(u,v) = ∫(u⋅v)*dist_dΩ
+dist_ag(u,v) = ∫(u⋅v)*dist_dΩg
+
+serial_A = assemble_matrix(serial_a,serial_V,serial_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.Assembled())
+dist_A_AS = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.LocallyAssembled())
+dist_A_LA = assemble_matrix(dist_ag,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.SubAssembled())
+dist_A_SA = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+all(centralize(dist_A_AS) - serial_A .< 1e-10)
+
+x_AS = prand(partition(axes(dist_A_AS,2)))
+x_LA = GridapDistributed.change_ghost(x_AS,axes(dist_A_LA,2))
+x_SA = GridapDistributed.change_ghost(x_AS,axes(dist_A_SA,2))
+
+norm(dist_A_AS*x_AS - dist_A_LA*x_LA)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+############################################################################################
diff --git a/test/sequential/runtests.jl b/test/sequential/runtests.jl
index aee13a6d..90370037 100644
--- a/test/sequential/runtests.jl
+++ b/test/sequential/runtests.jl
@@ -3,6 +3,7 @@ module SequentialTests
 using Test
 
 TESTCASE = get(ENV, "TESTCASE", "all")
+@info "Running SEQUENTIAL tests with TESTCASE=$TESTCASE"
 
 if TESTCASE ∈ ("all", "seq-geometry")
   @time @testset "Geometry" begin include("GeometryTests.jl") end
@@ -19,11 +20,11 @@ if TESTCASE ∈ ("all", "seq-fespaces")
 end
 
 if TESTCASE ∈ ("all", "seq-physics")
-  @time @testset "Poisson"                     begin include("PoissonTests.jl") end
-  @time @testset "PLaplacian"                  begin include("PLaplacianTests.jl") end
-  @time @testset "SurfaceCouplingTests"        begin include("SurfaceCouplingTests.jl") end
-  @time @testset "StokesHdivDGTests"           begin include("StokesHdivDGTests.jl") end
-  @time @testset "StokesOpenBoundary"          begin include("StokesOpenBoundaryTests.jl") end
+  @time @testset "Poisson"              begin include("PoissonTests.jl") end
+  @time @testset "PLaplacian"           begin include("PLaplacianTests.jl") end
+  @time @testset "SurfaceCouplingTests" begin include("SurfaceCouplingTests.jl") end
+  @time @testset "StokesHdivDGTests"    begin include("StokesHdivDGTests.jl") end
+  @time @testset "StokesOpenBoundary"   begin include("StokesOpenBoundaryTests.jl") end
 end
 
 if TESTCASE ∈ ("all", "seq-transient")