From fb0de46755e8776ac8dd965b61c6934844deca20 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Fri, 30 Aug 2024 12:26:12 +1000
Subject: [PATCH 01/48] Updated to PartitionedArrays v0.5

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 1dc2bd64..494e3732 100644
--- a/Project.toml
+++ b/Project.toml
@@ -20,7 +20,7 @@ FillArrays = "0.8.4,1"
 Gridap = "0.18.3"
 LinearAlgebra = "1.3"
 MPI = "0.16, 0.17, 0.18, 0.19, 0.20"
-PartitionedArrays = "0.3.3"
+PartitionedArrays = "0.5"
 SparseArrays = "1.3"
 SparseMatricesCSR = "0.6.6"
 WriteVTK = "1.12.0"

From 9678cbd448f874513af8f99af1ee53089fbaa390 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Fri, 30 Aug 2024 12:26:34 +1000
Subject: [PATCH 02/48] Assembly working, no reuse

---
 src/Algebra.jl                      | 174 ++++++++++++++--------------
 src/BlockPartitionedArrays.jl       |   4 +-
 src/FESpaces.jl                     |   4 +-
 test/BlockPartitionedArraysTests.jl |   2 +-
 test/parrays_update.jl              |  36 ++++++
 5 files changed, 128 insertions(+), 92 deletions(-)
 create mode 100644 test/parrays_update.jl

diff --git a/src/Algebra.jl b/src/Algebra.jl
index 57bcbac3..11af9149 100644
--- a/src/Algebra.jl
+++ b/src/Algebra.jl
@@ -62,8 +62,8 @@ function Algebra.axpy_entries!(
 # We should definitely check here that the index partitions are the same. 
 # However: Because the different matrices are assembled separately, the objects are not the 
 # same (i.e can't use ===). Checking the index partitions would then be costly...
-  @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
-  @assert reduce(&,map(PartitionedArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
+  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
+  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
   map(partition(A),partition(B)) do A, B
     Algebra.axpy_entries!(α,A,B;check)
   end
@@ -394,13 +394,14 @@ struct PSparseMatrixBuilderCOO{T,B}
 end
 
 function Algebra.nz_counter(
-  builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}) where A
-  test_dofs_gids_prange, trial_dofs_gids_prange = axs
-  counters = map(partition(test_dofs_gids_prange),partition(trial_dofs_gids_prange)) do r,c
+  builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}
+) where A
+  rows, cols = axs # test ids, trial ids
+  counters = map(partition(rows),partition(cols)) do r,c
     axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c)))
     Algebra.CounterCOO{A}(axs)
   end
-  DistributedCounterCOO(builder.par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange)
+  DistributedCounterCOO(builder.par_strategy,counters,rows,cols)
 end
 
 function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv
@@ -414,56 +415,58 @@ end
 struct DistributedCounterCOO{A,B,C,D} <: GridapType
   par_strategy::A
   counters::B
-  test_dofs_gids_prange::C
-  trial_dofs_gids_prange::D
+  test_gids::C
+  trial_gids::D
   function DistributedCounterCOO(
     par_strategy,
     counters::AbstractArray{<:Algebra.CounterCOO},
-    test_dofs_gids_prange::PRange,
-    trial_dofs_gids_prange::PRange)
+    test_gids::PRange,
+    trial_gids::PRange)
     A = typeof(par_strategy)
     B = typeof(counters)
-    C = typeof(test_dofs_gids_prange)
-    D = typeof(trial_dofs_gids_prange)
-    new{A,B,C,D}(par_strategy,counters,test_dofs_gids_prange,trial_dofs_gids_prange)
+    C = typeof(test_gids)
+    D = typeof(trial_gids)
+    new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids)
   end
 end
 
 function local_views(a::DistributedCounterCOO)
-  a.counters
+  return a.counters
 end
 
-function local_views(a::DistributedCounterCOO,test_dofs_gids_prange,trial_dofs_gids_prange)
-  @check test_dofs_gids_prange === a.test_dofs_gids_prange
-  @check trial_dofs_gids_prange === a.trial_dofs_gids_prange
-  a.counters
+function local_views(a::DistributedCounterCOO,test_gids,trial_gids)
+  @check PArrays.matching_local_indices(test_gids,a.test_gids)
+  @check PArrays.matching_local_indices(trial_gids,a.trial_gids)
+  return a.counters
 end
 
 function Algebra.nz_allocation(a::DistributedCounterCOO)
   allocs = map(nz_allocation,a.counters)
-  DistributedAllocationCOO(a.par_strategy,allocs,a.test_dofs_gids_prange,a.trial_dofs_gids_prange)
+  DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids)
 end
 
 struct DistributedAllocationCOO{A,B,C,D} <: GridapType
   par_strategy::A
   allocs::B
-  test_dofs_gids_prange::C
-  trial_dofs_gids_prange::D
+  test_gids::C
+  trial_gids::D
   function DistributedAllocationCOO(
     par_strategy,
     allocs::AbstractArray{<:Algebra.AllocationCOO},
-    test_dofs_gids_prange::PRange,
-    trial_dofs_gids_prange::PRange)
+    test_gids::PRange,
+    trial_gids::PRange)
     A = typeof(par_strategy)
     B = typeof(allocs)
-    C = typeof(test_dofs_gids_prange)
-    D = typeof(trial_dofs_gids_prange)
-    new{A,B,C,D}(par_strategy,allocs,test_dofs_gids_prange,trial_dofs_gids_prange)
+    C = typeof(test_gids)
+    D = typeof(trial_gids)
+    new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids)
   end
 end
 
-function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange},
-                     axes::Tuple{<:PRange,<:PRange}) where {A,B}
+function change_axes(
+  a::DistributedAllocationCOO{A,B,<:PRange,<:PRange},
+  axes::Tuple{<:PRange,<:PRange}
+) where {A,B}
   local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols
     (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols)))
   end
@@ -471,8 +474,10 @@ function change_axes(a::DistributedAllocationCOO{A,B,<:PRange,<:PRange},
   DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2])
 end
 
-function change_axes(a::MatrixBlock{<:DistributedAllocationCOO},
-                     axes::Tuple{<:Vector,<:Vector})
+function change_axes(
+  a::MatrixBlock{<:DistributedAllocationCOO},
+  axes::Tuple{<:Vector,<:Vector}
+)
   block_ids  = CartesianIndices(a.array)
   rows, cols = axes
   array = map(block_ids) do I
@@ -485,9 +490,9 @@ function local_views(a::DistributedAllocationCOO)
   a.allocs
 end
 
-function local_views(a::DistributedAllocationCOO,test_dofs_gids_prange,trial_dofs_gids_prange)
-  @check test_dofs_gids_prange === a.test_dofs_gids_prange
-  @check trial_dofs_gids_prange === a.trial_dofs_gids_prange
+function local_views(a::DistributedAllocationCOO,test_gids,trial_gids)
+  @check test_gids === a.test_gids
+  @check trial_gids === a.trial_gids
   a.allocs
 end
 
@@ -508,8 +513,8 @@ function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO})
   return tuple_of_array_of_parrays
 end
 
-get_test_gids(a::DistributedAllocationCOO)  = a.test_dofs_gids_prange
-get_trial_gids(a::DistributedAllocationCOO) = a.trial_dofs_gids_prange
+get_test_gids(a::DistributedAllocationCOO)  = a.test_gids
+get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids
 get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO})  = map(get_test_gids,diag(a.array))
 get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array))
 
@@ -536,31 +541,24 @@ function _fa_create_from_nz_with_callback(callback,a)
 
   # Recover some data
   I,J,V = get_allocations(a)
-  test_dofs_gids_prange  = get_test_gids(a)
-  trial_dofs_gids_prange = get_trial_gids(a)
+  test_gids  = get_test_gids(a)
+  trial_gids = get_trial_gids(a)
 
-  rows = _setup_prange(test_dofs_gids_prange,I;ghost=false,ax=:rows)
+  rows = _setup_prange(test_gids,I;ghost=false,ax=:rows)
   b = callback(rows)
 
   # convert I and J to global dof ids
-  to_global_indices!(I,test_dofs_gids_prange;ax=:rows)
-  to_global_indices!(J,trial_dofs_gids_prange;ax=:cols)
+  to_global_indices!(I,test_gids;ax=:rows)
+  to_global_indices!(J,trial_gids;ax=:cols)
 
   # Create the range for cols
-  cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols)
+  cols = _setup_prange(trial_gids,J;ax=:cols)
 
   # Convert again I,J to local numeration
   to_local_indices!(I,rows;ax=:rows)
   to_local_indices!(J,cols;ax=:cols)
 
-  # Adjust local matrix size to linear system's index sets
-  asys = change_axes(a,(rows,cols))
-
-  # Compress local portions
-  values = map(create_from_nz,local_views(asys))
-
-  # Finally build the matrix
-  A = _setup_matrix(values,rows,cols)
+  A = _setup_matrix(I,J,V,rows,cols)
   return A, b
 end
 
@@ -579,19 +577,19 @@ end
 function _sa_create_from_nz_with_callback(callback,async_callback,a,b)
   # Recover some data
   I,J,V = get_allocations(a)
-  test_dofs_gids_prange = get_test_gids(a)
-  trial_dofs_gids_prange = get_trial_gids(a)
+  test_gids = get_test_gids(a)
+  trial_gids = get_trial_gids(a)
 
   # convert I and J to global dof ids
-  to_global_indices!(I,test_dofs_gids_prange;ax=:rows)
-  to_global_indices!(J,trial_dofs_gids_prange;ax=:cols)
+  to_global_indices!(I,test_gids;ax=:rows)
+  to_global_indices!(J,trial_gids;ax=:cols)
 
   # Create the Prange for the rows
-  rows = _setup_prange(test_dofs_gids_prange,I;ax=:rows)
+  rows = _setup_prange(test_gids,I;ax=:rows)
   
   # Move (I,J,V) triplets to the owner process of each row I.
   # The triplets are accompanyed which Jo which is the process column owner
-  Jo = get_gid_owners(J,trial_dofs_gids_prange;ax=:cols)
+  Jo = get_gid_owners(J,trial_gids;ax=:cols)
   t  = _assemble_coo!(I,J,V,rows;owners=Jo)
 
   # Here we can overlap computations
@@ -606,7 +604,7 @@ function _sa_create_from_nz_with_callback(callback,async_callback,a,b)
   wait(t)
 
   # Create the Prange for the cols
-  cols = _setup_prange(trial_dofs_gids_prange,J;ax=:cols,owners=Jo)
+  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
 
   # Overlap rhs communications with CSC compression
   t2 = async_callback(b)
@@ -615,19 +613,13 @@ function _sa_create_from_nz_with_callback(callback,async_callback,a,b)
   to_local_indices!(I,rows;ax=:rows)
   to_local_indices!(J,cols;ax=:cols)
 
-  # Adjust local matrix size to linear system's index sets
-  asys = change_axes(a,(rows,cols))
-
-  # Compress the local matrices
-  values = map(create_from_nz,local_views(asys))
+  A = _setup_matrix(I,J,V,rows,cols)
 
   # Wait the transfer to finish
   if !isa(t2,Nothing)
     wait(t2)
   end
-
-  # Finally build the matrix
-  A = _setup_matrix(values,rows,cols)
+  
   return A, b
 end
 
@@ -657,7 +649,7 @@ end
 struct PVectorCounter{A,B,C}
   par_strategy::A
   counters::B
-  test_dofs_gids_prange::C
+  test_gids::C
 end
 
 Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
@@ -667,12 +659,12 @@ function local_views(a::PVectorCounter)
 end
 
 function local_views(a::PVectorCounter,rows)
-  @check rows === a.test_dofs_gids_prange
+  @check rows === a.test_gids
   a.counters
 end
 
 function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows})
-  dofs = a.test_dofs_gids_prange
+  dofs = a.test_gids
   values = map(nz_allocation,a.counters)
   PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs)
 end
@@ -680,7 +672,7 @@ end
 struct PVectorAllocationTrackOnlyValues{A,B,C}
   par_strategy::A
   values::B
-  test_dofs_gids_prange::C
+  test_gids::C
 end
 
 function local_views(a::PVectorAllocationTrackOnlyValues)
@@ -688,12 +680,12 @@ function local_views(a::PVectorAllocationTrackOnlyValues)
 end
 
 function local_views(a::PVectorAllocationTrackOnlyValues,rows)
-  @check rows === a.test_dofs_gids_prange
+  @check rows === a.test_gids
   a.values
 end
 
 function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows})
-  rows = _setup_prange_without_ghosts(a.test_dofs_gids_prange)
+  rows = _setup_prange_without_ghosts(a.test_gids)
   _rhs_callback(a,rows)
 end
 
@@ -707,7 +699,7 @@ function _rhs_callback(row_partitioned_vector_partition,rows)
   # The ghost values in row_partitioned_vector_partition are 
   # aligned with the FESpace but not with the ghost values in the rows of A
   b_fespace = PVector(row_partitioned_vector_partition.values,
-                      partition(row_partitioned_vector_partition.test_dofs_gids_prange))
+                      partition(row_partitioned_vector_partition.test_gids))
 
   # This one is aligned with the rows of A
   b = similar(b_fespace,eltype(b_fespace),(rows,))
@@ -758,7 +750,7 @@ end
 struct PVectorAllocationTrackTouchedAndValues{A,B,C}
   allocations::A
   values::B
-  test_dofs_gids_prange::C
+  test_gids::C
 end
 
 function Algebra.create_from_nz(
@@ -787,7 +779,7 @@ Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Grida
 
 
 function local_views(a::PVectorAllocationTrackTouchedAndValues,rows)
-  @check rows === a.test_dofs_gids_prange
+  @check rows === a.test_gids
   a.allocations
 end
 
@@ -835,7 +827,7 @@ end
 function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows},
                               b::PVectorCounter{<:SubAssembledRows})
   A      = nz_allocation(a)
-  dofs   = b.test_dofs_gids_prange
+  dofs   = b.test_gids
   values = map(nz_allocation,b.counters)
   touched,allocations=_setup_touched_and_allocations_arrays(values)
   B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
@@ -843,7 +835,7 @@ function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows},
 end
 
 function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows})
-  dofs = a.test_dofs_gids_prange
+  dofs = a.test_gids
   values = map(nz_allocation,a.counters)
   touched,allocations=_setup_touched_and_allocations_arrays(values)
   return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
@@ -857,7 +849,7 @@ function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedA
 
   # Find the ghost rows
   allocations = local_views(a.allocations)
-  indices = partition(a.test_dofs_gids_prange)
+  indices = partition(a.test_gids)
   I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices
     dofs_lids_touched = findall(allocation.touched)
     loc_to_gho = local_to_ghost(indices)
@@ -877,7 +869,7 @@ function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedA
   ghost_to_global, ghost_to_owner = map(
     find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays
 
-  ngids = length(a.test_dofs_gids_prange)
+  ngids = length(a.test_gids)
   _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner)
 end
 
@@ -996,15 +988,23 @@ end
 
 # _setup_matrix : local matrices + global PRanges -> Global matrix
 
-function _setup_matrix(values,rows::PRange,cols::PRange)
-  return PSparseMatrix(values,partition(rows),partition(cols))
-end
-
-function _setup_matrix(values,rows::Vector{<:PRange},cols::Vector{<:PRange})
-  block_ids  = CartesianIndices((length(rows),length(cols)))
-  block_mats = map(block_ids) do I
-    block_values = map(v -> blocks(v)[I],values)
-    return _setup_matrix(block_values,rows[I[1]],cols[I[2]])
+function _setup_matrix(I,J,V,rows::PRange,cols::PRange)
+  assembled_rows = map(PArrays.remove_ghost,partition(rows))
+  psparse(
+    I,J,V,assembled_rows,partition(cols);
+    split_format=true,
+    assembled=true,
+    assemble=true,
+    indices=:local,
+    reuse=false,
+  ) |> fetch
+end
+
+function _setup_matrix(I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange})
+  block_ids  = CartesianIndices(I)
+  block_mats = map(block_ids) do id
+    i = id[1]; j = id[2];
+    _setup_matrix(I[i,j],J[i,j],V[i,j],rows[i],cols[j])
   end
   return mortar(block_mats)
 end
@@ -1116,7 +1116,7 @@ function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown)
   end
 end
 
-# dofs_gids_prange can be either test_dofs_gids_prange or trial_dofs_gids_prange
+# dofs_gids_prange can be either test_gids or trial_gids
 # In the former case, gids is a vector of global test dof identifiers, while in the 
 # latter, a vector of global trial dof identifiers
 function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...)
diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl
index 41e3f6dc..942a7c28 100644
--- a/src/BlockPartitionedArrays.jl
+++ b/src/BlockPartitionedArrays.jl
@@ -271,8 +271,8 @@ function PartitionedArrays.partition(a::BlockPArray)
   return map(mortar,vals)
 end
 
-function PartitionedArrays.to_trivial_partition(a::BlockPArray)
-  vals = map(PartitionedArrays.to_trivial_partition,blocks(a))
+function PartitionedArrays.centralize(a::BlockPArray)
+  vals = map(PartitionedArrays.centralize,blocks(a))
   return mortar(vals)
 end
 
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index c2b09475..844ec9e6 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -118,7 +118,7 @@ end
 
 function fetch_vector_ghost_values_cache(vector_partition,partition)
   cache = PArrays.p_vector_cache(vector_partition,partition)
-  map(reverse,cache)
+  reverse(cache)
 end
 
 function fetch_vector_ghost_values!(vector_partition,cache)
@@ -163,7 +163,7 @@ function generate_gids(
   #         swapped in the AssemblyCache of partition(cell_range)
 
   # Exchange the dof owners
-  cache_fetch=fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range))
+  cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range))
   fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait
   
   cell_wise_to_dof_wise!(ldof_to_owner,
diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl
index 3e810085..5ec6579c 100644
--- a/test/BlockPartitionedArraysTests.jl
+++ b/test/BlockPartitionedArraysTests.jl
@@ -76,7 +76,7 @@ t = assemble!(__v)
 assemble!(__m) |> wait
 fetch(t);
 
-PartitionedArrays.to_trivial_partition(m)
+PartitionedArrays.centralize(m)
 
 partition(v)
 partition(m)
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
new file mode 100644
index 00000000..951a68d7
--- /dev/null
+++ b/test/parrays_update.jl
@@ -0,0 +1,36 @@
+
+using Gridap
+using PartitionedArrays
+using GridapDistributed
+
+np = (1,2)
+ranks = with_debug() do distribute
+  distribute(LinearIndices((prod(np),)))
+end
+
+nc = (2,4)
+serial_model = CartesianDiscreteModel((0,1,0,1),nc)
+dist_model = CartesianDiscreteModel(ranks,np,(0,1,0,1),nc)
+
+cids = get_cell_gids(dist_model)
+
+reffe = ReferenceFE(lagrangian,Float64,1)
+serial_V = TestFESpace(serial_model,reffe)
+dist_V = TestFESpace(dist_model,reffe)
+
+serial_ids = get_free_dof_ids(serial_V)
+dist_ids = get_free_dof_ids(dist_V)
+
+serial_Ω = Triangulation(serial_model)
+serial_dΩ = Measure(serial_Ω,2)
+
+dist_Ω = Triangulation(dist_model)
+dist_dΩ = Measure(dist_Ω,2)
+
+serial_a1(u,v) = ∫(u⋅v)*serial_dΩ
+serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V)
+
+dist_a1(u,v) = ∫(u⋅v)*dist_dΩ
+dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V)
+all(centralize(dist_A1) - serial_A1 .< 1e-10)
+

From e17e85ce107030fcfdc3fafd8ae386a46176eb8f Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Mon, 2 Sep 2024 10:37:17 +1000
Subject: [PATCH 03/48] FESpace gids are now permuted
 LocalIndicesWithVariableBlockSize

---
 src/DistributedUtils.jl  |  44 +++++++
 src/FESpaces.jl          | 240 ++++++++++++++++++---------------------
 src/GridapDistributed.jl |   6 +-
 3 files changed, 156 insertions(+), 134 deletions(-)
 create mode 100644 src/DistributedUtils.jl

diff --git a/src/DistributedUtils.jl b/src/DistributedUtils.jl
new file mode 100644
index 00000000..0b34a0a1
--- /dev/null
+++ b/src/DistributedUtils.jl
@@ -0,0 +1,44 @@
+
+function permuted_variable_partition(
+  n_own::AbstractArray{<:Integer},
+  gids::AbstractArray{<:AbstractArray{<:Integer}},
+  owners::AbstractArray{<:AbstractArray{<:Integer}};
+  n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))),
+  start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
+)
+  ranks = linear_indices(n_own)
+  np = length(ranks)
+  map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners
+    n_local = length(gids)
+    n_ghost = n_local - n_own
+    perm = fill(zero(Int32),n_local)
+    ghost_gids = fill(zero(Int),n_ghost)
+    ghost_owners = fill(zero(Int32),n_ghost)
+
+    n_ghost = 0
+    for (lid,(gid,owner)) in enumerate(zip(gids,owners))
+      if owner == rank
+        perm[lid] = gid-start+1
+      else
+        n_ghost += 1
+        ghost_gids[n_ghost] = gid
+        ghost_owners[n_ghost] = owner
+        perm[lid] = n_own + n_ghost
+      end
+    end
+    @assert n_ghost == n_local - n_own
+
+    ghost = GhostIndices(n_global,ghost_gids,ghost_owners)
+    dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize(
+      CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost
+    )
+    permute_indices(dof_ids,perm)
+  end
+end
+
+function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T
+  ptrs = Vector{Int32}(undef,length(vv)+1)
+  Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv)
+  Arrays.length_to_ptrs!(ptrs)
+  return ptrs
+end
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 844ec9e6..02530cfa 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -56,36 +56,25 @@ function FESpaces.gather_free_and_dirichlet_values(f::DistributedFESpace,cell_va
   return free_values, dirichlet_values
 end
 
-function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_prange)
-  map(cell_wise_vector,
-          dof_wise_vector,
-          cell_to_ldofs,
-          partition(cell_prange)) do cwv,dwv,cell_to_ldofs,indices
+function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids)
+  map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids
     cache  = array_cache(cell_to_ldofs)
-    ncells = length(cell_to_ldofs)
-    ptrs = cwv.ptrs
-    data = cwv.data
-    cell_own_to_local = own_to_local(indices)
-    for cell in cell_own_to_local
+    for cell in cell_ids
       ldofs = getindex!(cache,cell_to_ldofs,cell)
-      p = ptrs[cell]-1
+      p = cwv.ptrs[cell]-1
       for (i,ldof) in enumerate(ldofs)
         if ldof > 0
-          data[i+p] = dwv[ldof]
+          cwv.data[i+p] = dwv[ldof]
         end
       end
     end
   end
 end
 
-function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_range)
-  map(dof_wise_vector,
-      cell_wise_vector,
-      cell_to_ldofs,
-      partition(cell_range)) do dwv,cwv,cell_to_ldofs,indices
+function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids)
+  map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids
     cache = array_cache(cell_to_ldofs)
-    cell_ghost_to_local = ghost_to_local(indices)
-    for cell in cell_ghost_to_local
+    for cell in cell_ids
       ldofs = getindex!(cache,cell_to_ldofs,cell)
       p = cwv.ptrs[cell]-1
       for (i,ldof) in enumerate(ldofs)
@@ -97,23 +86,14 @@ function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,c
   end
 end
 
-function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_prange)
-    cwv=map(cell_to_ldofs) do cell_to_ldofs
-      cache = array_cache(cell_to_ldofs)
-      ncells = length(cell_to_ldofs)
-      ptrs = Vector{Int32}(undef,ncells+1)
-      for cell in 1:ncells
-        ldofs = getindex!(cache,cell_to_ldofs,cell)
-        ptrs[cell+1] = length(ldofs)
-      end
-      PArrays.length_to_ptrs!(ptrs)
-      ndata = ptrs[end]-1
-      data = Vector{Int}(undef,ndata)
-      data .= -1
-      JaggedArray(data,ptrs)
-    end
-    dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_prange)
-    cwv
+function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_ids)
+  cwv = map(cell_to_ldofs) do cell_to_ldofs
+    ptrs = generate_ptrs(cell_to_ldofs)
+    data = fill(-1,ptrs[end]-1)
+    JaggedArray(data,ptrs)
+  end
+  dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids)
+  return cwv
 end
 
 function fetch_vector_ghost_values_cache(vector_partition,partition)
@@ -128,13 +108,15 @@ end
 function generate_gids(
   cell_range::PRange,
   cell_to_ldofs::AbstractArray{<:AbstractArray},
-  nldofs::AbstractArray{<:Integer})
+  nldofs::AbstractArray{<:Integer}
+)
+  ranks = linear_indices(partition(cell_range))
 
   # Find and count number owned dofs
   ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs
     ldof_to_owner = fill(Int32(0),nldofs)
-    cache = array_cache(cell_to_ldofs)
     lcell_to_owner = local_to_owner(indices)
+    cache = array_cache(cell_to_ldofs)
     for cell in 1:length(cell_to_ldofs)
       owner = lcell_to_owner[cell]
       ldofs = getindex!(cache,cell_to_ldofs,cell)
@@ -151,102 +133,54 @@ function generate_gids(
     ldof_to_owner, nodofs
   end |> tuple_of_arrays
 
-  cell_ldofs_to_part = dof_wise_to_cell_wise(ldof_to_owner,
-                                              cell_to_ldofs,
-                                              cell_range)
-
-  # Note1 : this call potentially updates cell_prange with the 
-  #         info required to exchange info among nearest neighbours
-  #         so that it can be re-used in the future for other exchanges
-
-  # Note2 : we need to call reverse() as the senders and receivers are 
-  #         swapped in the AssemblyCache of partition(cell_range)
-
-  # Exchange the dof owners
-  cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range))
-  fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait
-  
-  cell_wise_to_dof_wise!(ldof_to_owner,
-                         cell_ldofs_to_part,
-                         cell_to_ldofs,
-                         cell_range)
-
   # Find the global range of owned dofs
   first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs)))
+
+  # Exchange the dof owners. Cell owner always has correct dof owner.
+  cell_ldofs_to_owner = dof_wise_to_cell_wise(ldof_to_owner,cell_to_ldofs,own_to_local(cell_range))
+  consistent!(PVector(cell_ldofs_to_owner,partition(cell_range))) |> wait
+  cell_wise_to_dof_wise!(ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range))
   
-  # Distribute gdofs to owned ones
-  ldof_to_gdof = map(first_gdof,ldof_to_owner,partition(cell_range)) do first_gdof,ldof_to_owner,indices
-    me = part_id(indices)
+  # Fill owned gids
+  ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner
+    ldof_to_gdof = fill(0,length(ldof_to_owner))
     offset = first_gdof-1
-    ldof_to_gdof = Vector{Int}(undef,length(ldof_to_owner))
     odof = 0
     for (ldof,owner) in enumerate(ldof_to_owner)
-      if owner == me
+      if owner == rank
         odof += 1
-        ldof_to_gdof[ldof] = odof
-      else
-        ldof_to_gdof[ldof] = 0
-      end
-    end
-    for (ldof,owner) in enumerate(ldof_to_owner)
-      if owner == me
-        ldof_to_gdof[ldof] += offset
+        ldof_to_gdof[ldof] = odof + offset
       end
     end
     ldof_to_gdof
   end
 
-  # Create cell-wise global dofs
-  cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof,
-                                        cell_to_ldofs,
-                                        cell_range)
+  # Exchange gids
+  cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range))
+  consistent!(PVector(cell_to_gdofs,partition(cell_range))) |> wait
 
-  # Exchange the global dofs
-  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
-
-  # Distribute global dof ids also to ghost
+  # Fill ghost gids with exchanged information
   map(cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices
-    gdof = 0
     cache = array_cache(cell_to_ldofs)
-    cell_ghost_to_local = ghost_to_local(indices)
     cell_local_to_owner = local_to_owner(indices)
-    for cell in cell_ghost_to_local
+    for cell in ghost_to_local(indices)
       ldofs = getindex!(cache,cell_to_ldofs,cell)
+      cell_owner = cell_local_to_owner[cell]
       p = cell_to_gdofs.ptrs[cell]-1
       for (i,ldof) in enumerate(ldofs)
-        if ldof > 0 && ldof_to_owner[ldof] == cell_local_to_owner[cell]
+        dof_owner = ldof_to_owner[ldof]
+        if ldof > 0 && (dof_owner == cell_owner)
           ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
         end
       end
     end
   end
 
-  dof_wise_to_cell_wise!(cell_to_gdofs,
-                         ldof_to_gdof,
-                         cell_to_ldofs,
-                         cell_range)
-
-  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
-
-  cell_wise_to_dof_wise!(ldof_to_gdof,
-                         cell_to_gdofs,
-                         cell_to_ldofs,
-                         cell_range)
-
-  # Setup DoFs LocalIndices
-  ngdofs = reduction(+,nodofs,destination=:all,init=zero(eltype(nodofs)))
-  local_indices = map(ngdofs,partition(cell_range),ldof_to_gdof,ldof_to_owner) do ngdofs,
-                                                                                  indices,
-                                                                                  ldof_to_gdof,
-                                                                                  ldof_to_owner
-     me = part_id(indices)
-     LocalIndices(ngdofs,me,ldof_to_gdof,ldof_to_owner)
-  end
-
-  # Setup dof range
-  dofs_range = PRange(local_indices)
-
-  return dofs_range
+  # Setup DoFs AbstractLocalIndices
+  dof_ids = permuted_variable_partition(
+    nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof
+  )
+  return PRange(dof_ids)
 end
 
 # FEFunction related
@@ -576,12 +510,14 @@ end
 function FESpaces.collect_cell_matrix(
   trial::DistributedFESpace,
   test::DistributedFESpace,
-  a::DistributedDomainContribution)
+  a::DistributedDomainContribution
+)
   map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a))
 end
 
 function FESpaces.collect_cell_vector(
-  test::DistributedFESpace, a::DistributedDomainContribution)
+  test::DistributedFESpace, a::DistributedDomainContribution
+)
   map(collect_cell_vector,local_views(test),local_views(a))
 end
 
@@ -589,12 +525,14 @@ function FESpaces.collect_cell_matrix_and_vector(
   trial::DistributedFESpace,
   test::DistributedFESpace,
   biform::DistributedDomainContribution,
-  liform::DistributedDomainContribution)
+  liform::DistributedDomainContribution
+)
   map(collect_cell_matrix_and_vector,
     local_views(trial),
     local_views(test),
     local_views(biform),
-    local_views(liform))
+    local_views(liform)
+  )
 end
 
 function FESpaces.collect_cell_matrix_and_vector(
@@ -602,17 +540,20 @@ function FESpaces.collect_cell_matrix_and_vector(
   test::DistributedFESpace,
   biform::DistributedDomainContribution,
   liform::DistributedDomainContribution,
-  uhd)
+  uhd
+)
   map(collect_cell_matrix_and_vector,
     local_views(trial),
     local_views(test),
     local_views(biform),
     local_views(liform),
-    local_views(uhd))
+    local_views(uhd)
+  )
 end
 
 function FESpaces.collect_cell_vector(
-  test::DistributedFESpace,l::Number)
+  test::DistributedFESpace,l::Number
+)
   map(local_views(test)) do s
     collect_cell_vector(s,l)
   end
@@ -622,7 +563,8 @@ function FESpaces.collect_cell_matrix_and_vector(
   trial::DistributedFESpace,
   test::DistributedFESpace,
   mat::DistributedDomainContribution,
-  l::Number)
+  l::Number
+)
   map(local_views(trial),local_views(test),local_views(mat)) do u,v,m
     collect_cell_matrix_and_vector(u,v,m,l)
   end
@@ -633,26 +575,29 @@ function FESpaces.collect_cell_matrix_and_vector(
   test::DistributedFESpace,
   mat::DistributedDomainContribution,
   l::Number,
-  uhd)
+  uhd
+)
   map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f
     collect_cell_matrix_and_vector(u,v,m,l,f)
   end
 end
+
 """
 """
-struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+struct DistributedSparseMatrixAssembler{A,B,C,D,E,F,G} <: SparseMatrixAssembler
   strategy::A
   assems::B
   matrix_builder::C
   vector_builder::D
-  test_dofs_gids_prange::E
-  trial_dofs_gids_prange::F
+  test_gids::E
+  trial_gids::F
+  caches::G
 end
 
 local_views(a::DistributedSparseMatrixAssembler) = a.assems
 
-FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_dofs_gids_prange
-FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_dofs_gids_prange
+FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
+FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
 FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
 FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
 FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
@@ -694,20 +639,43 @@ function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrix
   map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)
 end
 
+function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrixAssembler,matdata)
+  @check !isnothing(a.caches) "Assembler is not reusable!"
+  @check haskey(a.caches,objectid(mat)) "Cache not found!"
+  cache = a.caches[objectid(mat)]
+
+  LinearAlgebra.fillstored!(mat,zero(eltype(mat)))
+
+  counter = nz_counter(get_matrix_builder(a),(get_rows(a),get_cols(a)))
+  map(local_views(counter),partition(mat)) do counter, mat
+    counter.nnz = nnz(mat)
+  end
+  allocs = nz_allocation(counter)
+  numeric_loop_matrix!(allocs,a,matdata)
+
+  create_from_nz!(mat,allocs,cache)
+end
+
+function create_from_nz!(A,a::DistributedAllocationCOO{<:Assembled},cache)
+  _,_,V = get_allocations(a)
+  psparse!(A,V,cache) |> wait
+  return A
+end
+
 # Parallel Assembly strategies
 
-function local_assembly_strategy(::SubAssembledRows,rows,cols)
+function local_assembly_strategy(::Assembled,rows,cols)
   DefaultAssemblyStrategy()
 end
 
 # When using this one, make sure that you also loop over ghost cells.
 # This is at your own risk.
-function local_assembly_strategy(::FullyAssembledRows,rows,cols)
+function local_assembly_strategy(::LocallyAssembled,rows,cols)
   rows_local_to_ghost = local_to_ghost(rows)
   GenericAssemblyStrategy(
     identity,
     identity,
-    row->rows_local_to_ghost[row]==0,
+    row->iszero(rows_local_to_ghost[row]),
     col->true
   )
 end
@@ -718,7 +686,8 @@ function FESpaces.SparseMatrixAssembler(
   local_vec_type,
   rows::PRange,
   cols::PRange,
-  par_strategy=SubAssembledRows()
+  par_strategy=Assembled();
+  reuse_caches=false
 )
   assems = map(partition(rows),partition(cols)) do rows,cols
     local_strategy = local_assembly_strategy(par_strategy,rows,cols)
@@ -732,7 +701,12 @@ function FESpaces.SparseMatrixAssembler(
   end
   mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy)
   vec_builder = PVectorBuilder(local_vec_type,par_strategy)
-  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
+  if reuse_caches
+    caches = Dict{UInt64,Any}()
+  else
+    caches = nothing
+  end
+  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols,caches)
 end
 
 function FESpaces.SparseMatrixAssembler(
@@ -740,22 +714,24 @@ function FESpaces.SparseMatrixAssembler(
   local_vec_type,
   trial::DistributedFESpace,
   test::DistributedFESpace,
-  par_strategy=SubAssembledRows()
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
 )
   rows = get_free_dof_ids(test)
   cols = get_free_dof_ids(trial)
-  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy)
+  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...)
 end
 
 function FESpaces.SparseMatrixAssembler(
   trial::DistributedFESpace,
   test::DistributedFESpace,
-  par_strategy=SubAssembledRows()
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
 )
   Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial)))
   T  = eltype(Tv)
   Tm = SparseMatrixCSC{T,Int}
-  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy)
+  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...)
 end
 
 # ZeroMean FESpace
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index 8c20a975..7edf2a48 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -31,8 +31,8 @@ import Base: inv, abs, abs2, *, +, -, /, adjoint, transpose, real, imag, conj, g
 import Gridap.Fields: grad2curl
 import Gridap.CellData: Interpolable
 
-export FullyAssembledRows
-export SubAssembledRows
+export LocallyAssembled
+export Assembled
 
 export get_cell_gids
 export get_face_gids
@@ -42,6 +42,8 @@ export with_ghost, no_ghost
 
 export redistribute
 
+include("DistributedUtils.jl")
+
 include("BlockPartitionedArrays.jl")
 
 include("Algebra.jl")

From aab58d445844909236f3b3f1852261f334295fbf Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Sat, 7 Sep 2024 00:18:42 +1000
Subject: [PATCH 04/48] Started re-structuring assembly

---
 src/Algebra.jl                           | 1143 --------------------
 src/Assembly.jl                          |  199 ++++
 src/DistributedUtils.jl                  |   44 -
 src/Geometry.jl                          |    4 +-
 src/GridapDistributed.jl                 |    5 +-
 src/GridapExtras.jl                      |   55 +
 src/MultiField.jl                        |    2 +-
 src/PArraysExtras.jl                     |  210 ++++
 src/_Algebra_old.jl                      | 1238 ++++++++++++++++++++++
 test/BlockSparseMatrixAssemblersTests.jl |    4 +-
 test/DivConformingTests.jl               |    2 +-
 test/FESpacesTests.jl                    |    4 +-
 test/HeatEquationTests.jl                |    2 +-
 test/PLaplacianTests.jl                  |    4 +-
 test/parrays_update.jl                   |   51 +
 15 files changed, 1768 insertions(+), 1199 deletions(-)
 create mode 100644 src/Assembly.jl
 delete mode 100644 src/DistributedUtils.jl
 create mode 100644 src/GridapExtras.jl
 create mode 100644 src/PArraysExtras.jl
 create mode 100644 src/_Algebra_old.jl

diff --git a/src/Algebra.jl b/src/Algebra.jl
index 11af9149..3fa0b502 100644
--- a/src/Algebra.jl
+++ b/src/Algebra.jl
@@ -29,192 +29,6 @@ function Algebra.allocate_in_domain(matrix::BlockPMatrix)
   allocate_in_domain(BlockPVector{V},matrix)
 end
 
-# PSparseMatrix copy
-
-function Base.copy(a::PSparseMatrix)
-  mats = map(copy,partition(a))
-  cache = map(PartitionedArrays.copy_cache,a.cache)
-  return PSparseMatrix(mats,partition(axes(a,1)),partition(axes(a,2)),cache)
-end
-
-# PartitionedArrays extras
-
-function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
-  @check partition(axes(x,1)) === partition(axes(y,1))
-  map(partition(x),partition(y)) do x,y
-    LinearAlgebra.axpy!(α,x,y)
-  end
-  consistent!(y) |> wait
-  return y
-end
-
-function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
-  map(blocks(x),blocks(y)) do x,y
-    LinearAlgebra.axpy!(α,x,y)
-  end
-  return y
-end
-
-function Algebra.axpy_entries!(
-  α::Number, A::PSparseMatrix, B::PSparseMatrix;
-  check::Bool=true
-)
-# We should definitely check here that the index partitions are the same. 
-# However: Because the different matrices are assembled separately, the objects are not the 
-# same (i.e can't use ===). Checking the index partitions would then be costly...
-  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
-  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
-  map(partition(A),partition(B)) do A, B
-    Algebra.axpy_entries!(α,A,B;check)
-  end
-  return B
-end
-
-function Algebra.axpy_entries!(
-  α::Number, A::BlockPMatrix, B::BlockPMatrix;
-  check::Bool=true
-)
-  map(blocks(A),blocks(B)) do A, B
-    Algebra.axpy_entries!(α,A,B;check)
-  end
-  return B
-end
-
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.ArrayCounter,axes)
-  @notimplemented
-end
-
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A}
-  b = Algebra.CounterCOO{T}(axes)
-  b.nnz = a.nnz
-  b
-end
-
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A}
-  counter = change_axes(a.counter,axes)
-  Algebra.AllocationCOO(counter,a.I,a.J,a.V)
-end
-
-# Array of PArrays -> PArray of Arrays 
-function to_parray_of_arrays(a::AbstractArray{<:MPIArray})
-  indices = linear_indices(first(a))
-  map(indices) do i
-    map(a) do aj
-      PartitionedArrays.getany(aj)
-    end
-  end
-end
-
-function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
-  indices = linear_indices(first(a))
-  map(indices) do i
-    map(a) do aj
-      aj.items[i]
-    end
-  end
-end
-
-# This type is required because MPIArray from PArrays 
-# cannot be instantiated with a NULL communicator
-struct MPIVoidVector{T} <: AbstractVector{T}
-  comm::MPI.Comm
-  function MPIVoidVector(::Type{T}) where {T}
-    new{T}(MPI.COMM_NULL)
-  end
-end
-
-Base.size(a::MPIVoidVector) = (0,)
-Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear()
-function Base.getindex(a::MPIVoidVector,i::Int)
-  error("Indexing of MPIVoidVector not possible.")
-end
-function Base.setindex!(a::MPIVoidVector,v,i::Int)
-  error("Indexing of MPIVoidVector not possible.")
-end
-function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector)
-  println(io,"MPIVoidVector")
-end
-
-function num_parts(comm::MPI.Comm)
-  if comm != MPI.COMM_NULL
-    nparts = MPI.Comm_size(comm)
-  else
-    nparts = -1
-  end
-  nparts
-end
-@inline num_parts(comm::MPIArray) = num_parts(comm.comm)
-@inline num_parts(comm::DebugArray) = length(comm.items)
-@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm)
-
-function get_part_id(comm::MPI.Comm)
-  if comm != MPI.COMM_NULL
-    id = MPI.Comm_rank(comm)+1
-  else
-    id = -1
-  end
-  id
-end
-@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm)
-@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm)
-
-"""
-    i_am_in(comm::MPIArray)
-    i_am_in(comm::DebugArray)
-  
-  Returns `true` if the processor is part of the subcommunicator `comm`.
-"""
-function i_am_in(comm::MPI.Comm)
-  get_part_id(comm) >=0
-end
-@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm)
-@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm)
-@inline i_am_in(comm::DebugArray) = true
-
-function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing)
-  x_new = map(new_parts) do p
-    if isa(x,MPIArray)
-      PartitionedArrays.getany(x)
-    elseif isa(x,DebugArray) && (p <= length(x.items))
-      x.items[p]
-    else
-      default
-    end
-  end
-  return x_new
-end
-
-function generate_subparts(parts::MPIArray,new_comm_size)
-  root_comm = parts.comm
-  root_size = MPI.Comm_size(root_comm)
-  rank = MPI.Comm_rank(root_comm)
-
-  @static if isdefined(MPI,:MPI_UNDEFINED)
-    mpi_undefined = MPI.MPI_UNDEFINED[]
-  else
-    mpi_undefined = MPI.API.MPI_UNDEFINED[]
-  end
-  
-  if root_size == new_comm_size
-    return parts
-  else
-    if rank < new_comm_size
-      comm = MPI.Comm_split(root_comm,0,0)
-      return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false)
-    else
-      comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined)
-      return MPIVoidVector(eltype(parts))
-    end
-  end
-end
-
-function generate_subparts(parts::DebugArray,new_comm_size)
-  DebugArray(LinearIndices((new_comm_size,)))
-end
-
 # local_views
 
 function local_views(a)
@@ -284,960 +98,3 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_
   end
   return BlockPVector(vals,ids)
 end
-
-# This function computes a mapping among the local identifiers of a and b
-# for which the corresponding global identifiers are both in a and b. 
-# Note that the haskey check is necessary because in the general case 
-# there might be gids in b which are not present in a
-function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices)
-  a_local_to_b_local = fill(Int32(-1),local_length(a))
-  b_local_to_global  = local_to_global(b)
-  a_global_to_local  = global_to_local(a)
-  for blid in 1:local_length(b)
-    gid = b_local_to_global[blid]
-    if a_global_to_local[gid] != zero(eltype(a_global_to_local))
-      alid = a_global_to_local[gid]
-      a_local_to_b_local[alid] = blid
-    end  
-  end
-  a_local_to_b_local
-end
-
-# This type is required in order to be able to access the local portion 
-# of distributed sparse matrices and vectors using local indices from the 
-# distributed test and trial spaces
-struct LocalView{T,N,A,B} <:AbstractArray{T,N}
-  plids_to_value::A
-  d_to_lid_to_plid::B
-  local_size::NTuple{N,Int}
-  function LocalView(
-    plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N}
-    A = typeof(plids_to_value)
-    B = typeof(d_to_lid_to_plid)
-    local_size = map(length,d_to_lid_to_plid)
-    new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size)
-  end
-end
-
-Base.size(a::LocalView) = a.local_size
-Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian()
-function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N}
-  plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid)
-  if all(i->i>0,plids)
-    a.plids_to_value[plids...]
-  else
-    zero(T)
-  end
-end
-function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N}
-  plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid)
-  @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion"
-  a.plids_to_value[plids...] = v
-end
-
-function _lid_to_plid(lid,lid_to_plid)
-  plid = lid_to_plid[lid]
-  plid
-end
-
-function local_views(a::PVector,new_rows::PRange)
-  old_rows = axes(a,1)
-  if partition(old_rows) === partition(new_rows)
-    partition(a)
-  else
-    map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows
-      LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),))
-    end
-  end
-end
-
-function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange)
-  old_rows, old_cols = axes(a)
-  if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) )
-    partition(a)
-  else
-    map(partition(a),
-        partition(old_rows),partition(old_cols),
-        partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols
-      rl2lmap = find_local_to_local_map(new_rows,old_rows)
-      cl2lmap = find_local_to_local_map(new_cols,old_cols)
-      LocalView(matrix_partition,(rl2lmap,cl2lmap))
-    end
-  end
-end
-
-function local_views(a::BlockPVector,new_rows::BlockPRange)
-  vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays
-  return map(mortar,vals)
-end
-
-function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange)
-  vals = map(CartesianIndices(blocksize(a))) do I
-    local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I])
-  end |> to_parray_of_arrays
-  return map(mortar,vals)
-end
-
-# PSparseMatrix assembly
-
-struct FullyAssembledRows end
-struct SubAssembledRows end
-
-# For the moment we use COO format even though
-# it is quite memory consuming.
-# We need to implement communication in other formats in
-# PartitionedArrays.jl
-
-struct PSparseMatrixBuilderCOO{T,B}
-  local_matrix_type::Type{T}
-  par_strategy::B
-end
-
-function Algebra.nz_counter(
-  builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}
-) where A
-  rows, cols = axs # test ids, trial ids
-  counters = map(partition(rows),partition(cols)) do r,c
-    axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c)))
-    Algebra.CounterCOO{A}(axs)
-  end
-  DistributedCounterCOO(builder.par_strategy,counters,rows,cols)
-end
-
-function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv
-  T = eltype(Tv)
-  return PSparseMatrix{T}
-end
-
-
-"""
-"""
-struct DistributedCounterCOO{A,B,C,D} <: GridapType
-  par_strategy::A
-  counters::B
-  test_gids::C
-  trial_gids::D
-  function DistributedCounterCOO(
-    par_strategy,
-    counters::AbstractArray{<:Algebra.CounterCOO},
-    test_gids::PRange,
-    trial_gids::PRange)
-    A = typeof(par_strategy)
-    B = typeof(counters)
-    C = typeof(test_gids)
-    D = typeof(trial_gids)
-    new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids)
-  end
-end
-
-function local_views(a::DistributedCounterCOO)
-  return a.counters
-end
-
-function local_views(a::DistributedCounterCOO,test_gids,trial_gids)
-  @check PArrays.matching_local_indices(test_gids,a.test_gids)
-  @check PArrays.matching_local_indices(trial_gids,a.trial_gids)
-  return a.counters
-end
-
-function Algebra.nz_allocation(a::DistributedCounterCOO)
-  allocs = map(nz_allocation,a.counters)
-  DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids)
-end
-
-struct DistributedAllocationCOO{A,B,C,D} <: GridapType
-  par_strategy::A
-  allocs::B
-  test_gids::C
-  trial_gids::D
-  function DistributedAllocationCOO(
-    par_strategy,
-    allocs::AbstractArray{<:Algebra.AllocationCOO},
-    test_gids::PRange,
-    trial_gids::PRange)
-    A = typeof(par_strategy)
-    B = typeof(allocs)
-    C = typeof(test_gids)
-    D = typeof(trial_gids)
-    new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids)
-  end
-end
-
-function change_axes(
-  a::DistributedAllocationCOO{A,B,<:PRange,<:PRange},
-  axes::Tuple{<:PRange,<:PRange}
-) where {A,B}
-  local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols
-    (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols)))
-  end
-  allocs = map(change_axes,a.allocs,local_axes)
-  DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2])
-end
-
-function change_axes(
-  a::MatrixBlock{<:DistributedAllocationCOO},
-  axes::Tuple{<:Vector,<:Vector}
-)
-  block_ids  = CartesianIndices(a.array)
-  rows, cols = axes
-  array = map(block_ids) do I
-    change_axes(a[I],(rows[I[1]],cols[I[2]]))
-  end
-  return ArrayBlock(array,a.touched)
-end
-
-function local_views(a::DistributedAllocationCOO)
-  a.allocs
-end
-
-function local_views(a::DistributedAllocationCOO,test_gids,trial_gids)
-  @check test_gids === a.test_gids
-  @check trial_gids === a.trial_gids
-  a.allocs
-end
-
-function local_views(a::MatrixBlock{<:DistributedAllocationCOO})
-  array = map(local_views,a.array) |> to_parray_of_arrays
-  return map(ai -> ArrayBlock(ai,a.touched),array)
-end
-
-function get_allocations(a::DistributedAllocationCOO)
-  I,J,V = map(local_views(a)) do alloc
-    alloc.I, alloc.J, alloc.V
-  end |> tuple_of_arrays
-  return I,J,V
-end
-
-function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO})
-  tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays
-  return tuple_of_array_of_parrays
-end
-
-get_test_gids(a::DistributedAllocationCOO)  = a.test_gids
-get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids
-get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO})  = map(get_test_gids,diag(a.array))
-get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array))
-
-function Algebra.create_from_nz(a::PSparseMatrix)
-  # For FullyAssembledRows the underlying Exchanger should
-  # not have ghost layer making assemble! do nothing (TODO check)
-  assemble!(a) |> wait
-  a
-end
-
-function Algebra.create_from_nz(a::DistributedAllocationCOO{<:FullyAssembledRows})
-  f(x) = nothing
-  A, = _fa_create_from_nz_with_callback(f,a)
-  return A
-end
-
-function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:FullyAssembledRows}})
-  f(x) = nothing
-  A, = _fa_create_from_nz_with_callback(f,a)
-  return A
-end
-
-function _fa_create_from_nz_with_callback(callback,a)
-
-  # Recover some data
-  I,J,V = get_allocations(a)
-  test_gids  = get_test_gids(a)
-  trial_gids = get_trial_gids(a)
-
-  rows = _setup_prange(test_gids,I;ghost=false,ax=:rows)
-  b = callback(rows)
-
-  # convert I and J to global dof ids
-  to_global_indices!(I,test_gids;ax=:rows)
-  to_global_indices!(J,trial_gids;ax=:cols)
-
-  # Create the range for cols
-  cols = _setup_prange(trial_gids,J;ax=:cols)
-
-  # Convert again I,J to local numeration
-  to_local_indices!(I,rows;ax=:rows)
-  to_local_indices!(J,cols;ax=:cols)
-
-  A = _setup_matrix(I,J,V,rows,cols)
-  return A, b
-end
-
-function Algebra.create_from_nz(a::DistributedAllocationCOO{<:SubAssembledRows})
-  f(x) = nothing
-  A, = _sa_create_from_nz_with_callback(f,f,a,nothing)
-  return A
-end
-
-function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:SubAssembledRows}})
-  f(x) = nothing
-  A, = _sa_create_from_nz_with_callback(f,f,a,nothing)
-  return A
-end
-
-function _sa_create_from_nz_with_callback(callback,async_callback,a,b)
-  # Recover some data
-  I,J,V = get_allocations(a)
-  test_gids = get_test_gids(a)
-  trial_gids = get_trial_gids(a)
-
-  # convert I and J to global dof ids
-  to_global_indices!(I,test_gids;ax=:rows)
-  to_global_indices!(J,trial_gids;ax=:cols)
-
-  # Create the Prange for the rows
-  rows = _setup_prange(test_gids,I;ax=:rows)
-  
-  # Move (I,J,V) triplets to the owner process of each row I.
-  # The triplets are accompanyed which Jo which is the process column owner
-  Jo = get_gid_owners(J,trial_gids;ax=:cols)
-  t  = _assemble_coo!(I,J,V,rows;owners=Jo)
-
-  # Here we can overlap computations
-  # This is a good place to overlap since
-  # sending the matrix rows is a lot of data
-  if !isa(b,Nothing)
-    bprange=_setup_prange_from_pvector_allocation(b)
-    b = callback(bprange)
-  end
-
-  # Wait the transfer to finish
-  wait(t)
-
-  # Create the Prange for the cols
-  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
-
-  # Overlap rhs communications with CSC compression
-  t2 = async_callback(b)
-
-  # Convert again I,J to local numeration
-  to_local_indices!(I,rows;ax=:rows)
-  to_local_indices!(J,cols;ax=:cols)
-
-  A = _setup_matrix(I,J,V,rows,cols)
-
-  # Wait the transfer to finish
-  if !isa(t2,Nothing)
-    wait(t2)
-  end
-  
-  return A, b
-end
-
-
-# PVector assembly 
-
-struct PVectorBuilder{T,B}
-  local_vector_type::Type{T}
-  par_strategy::B
-end
-
-function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange})
-  T = builder.local_vector_type
-  rows, = axs
-  counters = map(partition(rows)) do rows
-    axs = (Base.OneTo(local_length(rows)),)
-    nz_counter(ArrayBuilder(T),axs)
-  end
-  PVectorCounter(builder.par_strategy,counters,rows)
-end
-
-function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
-  T = eltype(Tv)
-  return PVector{T}
-end
-
-struct PVectorCounter{A,B,C}
-  par_strategy::A
-  counters::B
-  test_gids::C
-end
-
-Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
-
-function local_views(a::PVectorCounter)
-  a.counters
-end
-
-function local_views(a::PVectorCounter,rows)
-  @check rows === a.test_gids
-  a.counters
-end
-
-function Arrays.nz_allocation(a::PVectorCounter{<:FullyAssembledRows})
-  dofs = a.test_gids
-  values = map(nz_allocation,a.counters)
-  PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs)
-end
-
-struct PVectorAllocationTrackOnlyValues{A,B,C}
-  par_strategy::A
-  values::B
-  test_gids::C
-end
-
-function local_views(a::PVectorAllocationTrackOnlyValues)
-  a.values
-end
-
-function local_views(a::PVectorAllocationTrackOnlyValues,rows)
-  @check rows === a.test_gids
-  a.values
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows})
-  rows = _setup_prange_without_ghosts(a.test_gids)
-  _rhs_callback(a,rows)
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:SubAssembledRows})
-  # This point MUST NEVER be reached. If reached there is an inconsistency
-  # in the parallel code in charge of vector assembly
-  @assert false
-end
-
-function _rhs_callback(row_partitioned_vector_partition,rows)
-  # The ghost values in row_partitioned_vector_partition are 
-  # aligned with the FESpace but not with the ghost values in the rows of A
-  b_fespace = PVector(row_partitioned_vector_partition.values,
-                      partition(row_partitioned_vector_partition.test_gids))
-
-  # This one is aligned with the rows of A
-  b = similar(b_fespace,eltype(b_fespace),(rows,))
-
-  # First transfer owned values
-  b .= b_fespace
-
-  # Now transfer ghost
-  function transfer_ghost(b,b_fespace,ids,ids_fespace)
-    num_ghosts_vec = ghost_length(ids)
-    gho_to_loc_vec = ghost_to_local(ids)
-    loc_to_glo_vec = local_to_global(ids)
-    gid_to_lid_fe  = global_to_local(ids_fespace)
-    for ghost_lid_vec in 1:num_ghosts_vec
-      lid_vec     = gho_to_loc_vec[ghost_lid_vec]
-      gid         = loc_to_glo_vec[lid_vec]
-      lid_fespace = gid_to_lid_fe[gid]
-      b[lid_vec] = b_fespace[lid_fespace]
-    end
-  end
-  map(
-    transfer_ghost,
-    partition(b),
-    partition(b_fespace),
-    b.index_partition,
-    b_fespace.index_partition)
-
-  return b
-end
-
-function Algebra.create_from_nz(a::PVector)
-  assemble!(a) |> wait
-  return a
-end
-
-function Algebra.create_from_nz(
-  a::DistributedAllocationCOO{<:FullyAssembledRows},
-  c_fespace::PVectorAllocationTrackOnlyValues{<:FullyAssembledRows})
-
-  function callback(rows)
-    _rhs_callback(c_fespace,rows)
-  end
-
-  A,b = _fa_create_from_nz_with_callback(callback,a)
-  return A,b
-end
-
-struct PVectorAllocationTrackTouchedAndValues{A,B,C}
-  allocations::A
-  values::B
-  test_gids::C
-end
-
-function Algebra.create_from_nz(
-  a::DistributedAllocationCOO{<:SubAssembledRows},
-  b::PVectorAllocationTrackTouchedAndValues)
-
-  function callback(rows)
-    _rhs_callback(b,rows)
-  end
-
-  function async_callback(b)
-    # now we can assemble contributions
-    assemble!(b)
-  end
-
-  A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b)
-  return A,b
-end
-
-struct ArrayAllocationTrackTouchedAndValues{A}
-  touched::Vector{Bool}
-  values::A
-end
-
-Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop()
-
-
-function local_views(a::PVectorAllocationTrackTouchedAndValues,rows)
-  @check rows === a.test_gids
-  a.allocations
-end
-
-@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j)
-  @notimplemented
-end
-@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i)
-  if i>0
-    if !(a.touched[i])
-      a.touched[i]=true
-    end
-    if !isa(v,Nothing)
-      a.values[i]=c(v,a.values[i])
-    end
-  end
-  nothing
-end
-@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j)
-  @notimplemented
-end
-@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i)
-  if !isa(v,Nothing)
-    for (ve,ie) in zip(v,i)
-      Arrays.add_entry!(c,a,ve,ie)
-    end
-  else
-    for ie in i
-      Arrays.add_entry!(c,a,nothing,ie)
-    end
-  end
-  nothing
-end
-
-
-function _setup_touched_and_allocations_arrays(values)
-  touched = map(values) do values
-    fill!(Vector{Bool}(undef,length(values)),false)
-  end
-  allocations = map(values,touched) do values,touched
-   ArrayAllocationTrackTouchedAndValues(touched,values)
-  end
-  touched, allocations
-end
-
-function Arrays.nz_allocation(a::DistributedCounterCOO{<:SubAssembledRows},
-                              b::PVectorCounter{<:SubAssembledRows})
-  A      = nz_allocation(a)
-  dofs   = b.test_gids
-  values = map(nz_allocation,b.counters)
-  touched,allocations=_setup_touched_and_allocations_arrays(values)
-  B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
-  return A,B
-end
-
-function Arrays.nz_allocation(a::PVectorCounter{<:SubAssembledRows})
-  dofs = a.test_gids
-  values = map(nz_allocation,a.counters)
-  touched,allocations=_setup_touched_and_allocations_arrays(values)
-  return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
-end
-
-function local_views(a::PVectorAllocationTrackTouchedAndValues)
-  a.allocations
-end
-
-function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues)
-
-  # Find the ghost rows
-  allocations = local_views(a.allocations)
-  indices = partition(a.test_gids)
-  I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices
-    dofs_lids_touched = findall(allocation.touched)
-    loc_to_gho = local_to_ghost(indices)
-    n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched)
-    I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids)
-    cur = 1
-    for lid in dofs_lids_touched
-      dof_lid = loc_to_gho[lid]
-      if dof_lid != 0
-        I_ghost_lids[cur] = dof_lid
-        cur = cur+1
-      end
-    end
-    I_ghost_lids
-  end
-
-  ghost_to_global, ghost_to_owner = map(
-    find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays
-
-  ngids = length(a.test_gids)
-  _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner)
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues)
-  rows = _setup_prange_from_pvector_allocation(a)
-  b    = _rhs_callback(a,rows)
-  t2   = assemble!(b)
-  # Wait the transfer to finish
-  if t2 !== nothing
-    wait(t2)
-  end
-  return b
-end
-
-# Common Assembly Utilities
-function first_gdof_from_ids(ids)
-  if own_length(ids) == 0
-    return 1
-  end
-  lid_to_gid   = local_to_global(ids) 
-  owned_to_lid = own_to_local(ids)
-  return Int(lid_to_gid[first(owned_to_lid)])
-end
-
-function find_gid_and_owner(ighost_to_jghost,jindices)
-  jghost_to_local  = ghost_to_local(jindices)
-  jlocal_to_global = local_to_global(jindices)
-  jlocal_to_owner  = local_to_owner(jindices)
-  ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost))
-
-  ighost_to_global = jlocal_to_global[ighost_to_jlocal]
-  ighost_to_owner  = jlocal_to_owner[ighost_to_jlocal]
-  return ighost_to_global, ighost_to_owner
-end
-
-# The given ids are assumed to be a sub-set of the lids
-function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer})
-  glo_to_loc = global_to_local(a)
-  loc_to_gho = local_to_ghost(a)
-  
-  # First pass: Allocate
-  i = 0
-  ghost_lids_touched = fill(false,ghost_length(a))
-  for gid in gids
-    lid = glo_to_loc[gid]
-    ghost_lid = loc_to_gho[lid]
-    if ghost_lid > 0 && !ghost_lids_touched[ghost_lid]
-      ghost_lids_touched[ghost_lid] = true
-      i += 1
-    end
-  end
-  gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i)
-
-  # Second pass: fill 
-  i = 1
-  fill!(ghost_lids_touched,false)
-  for gid in gids
-    lid = glo_to_loc[gid]
-    ghost_lid = loc_to_gho[lid]
-    if ghost_lid > 0 && !ghost_lids_touched[ghost_lid]
-      ghost_lids_touched[ghost_lid] = true
-      gids_ghost_lid_to_ghost_lid[i] = ghost_lid
-      i += 1
-    end
-  end
-
-  return gids_ghost_lid_to_ghost_lid
-end
-
-# Find the neighbours of partition1 trying 
-# to use those in partition2 as a hint 
-function _find_neighbours!(partition1, partition2)
-  partition2_snd, partition2_rcv = assembly_neighbors(partition2)
-  partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv)
-  return assembly_neighbors(partition1; neighbors=partition2_graph)
-end
-
-# to_global! & to_local! analogs, needed for dispatching in block assembly
-
-function to_local_indices!(I,ids::PRange;kwargs...)
-  map(to_local!,I,partition(ids))
-end
-
-function to_global_indices!(I,ids::PRange;kwargs...)
-  map(to_global!,I,partition(ids))
-end
-
-function get_gid_owners(I,ids::PRange;kwargs...)
-  map(I,partition(ids)) do I, indices 
-    glo_to_loc = global_to_local(indices) 
-    loc_to_own = local_to_owner(indices)
-    map(x->loc_to_own[glo_to_loc[x]], I)
-  end 
-end
-
-for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners]
-  @eval begin
-    function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...)
-      map($f,I,ids)
-    end
-
-    function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows)
-      @check ax ∈ [:rows,:cols]
-      block_ids = CartesianIndices(I)
-      map(block_ids) do id
-        i = id[1]; j = id[2];
-        if ax == :rows
-          $f(I[i,j],ids[i])
-        else
-          $f(I[i,j],ids[j])
-        end
-      end
-    end
-  end
-end
-
-# _setup_matrix : local matrices + global PRanges -> Global matrix
-
-function _setup_matrix(I,J,V,rows::PRange,cols::PRange)
-  assembled_rows = map(PArrays.remove_ghost,partition(rows))
-  psparse(
-    I,J,V,assembled_rows,partition(cols);
-    split_format=true,
-    assembled=true,
-    assemble=true,
-    indices=:local,
-    reuse=false,
-  ) |> fetch
-end
-
-function _setup_matrix(I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange})
-  block_ids  = CartesianIndices(I)
-  block_mats = map(block_ids) do id
-    i = id[1]; j = id[2];
-    _setup_matrix(I[i,j],J[i,j],V[i,j],rows[i],cols[j])
-  end
-  return mortar(block_mats)
-end
-
-# _assemble_coo! : local coo triplets + global PRange -> Global coo values
-
-function _assemble_coo!(I,J,V,rows::PRange;owners=nothing)
-  if isa(owners,Nothing)
-    PArrays.assemble_coo!(I,J,V,partition(rows))
-  else
-    assemble_coo_with_column_owner!(I,J,V,partition(rows),owners)
-  end
-end
-
-function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing)
-  block_ids = CartesianIndices(I)
-  map(block_ids) do id
-    i = id[1]; j = id[2];
-    if isa(owners,Nothing)
-      _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i])
-    else
-      _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j])
-    end
-  end
-end
-
-function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown)
-  """
-    Returns three JaggedArrays with the coo triplets
-    to be sent to the corresponding owner parts in parts_snd
-  """
-  function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner)
-    global_to_local_row = global_to_local(row_lids)
-    local_row_to_owner = local_to_owner(row_lids)
-    owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) ))
-    ptrs = zeros(Int32,length(parts_snd)+1)
-    k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner
-    for k in 1:length(k_gi)
-      gi = k_gi[k]
-      li = global_to_local_row[gi]
-      owner = local_row_to_owner[li]
-      if owner != part
-        ptrs[owner_to_i[owner]+1] +=1
-      end
-    end
-    PArrays.length_to_ptrs!(ptrs)
-    gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1)
-    gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1)
-    jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1)
-    v_snd_data = zeros(eltype(k_v),ptrs[end]-1)
-    for k in 1:length(k_gi)
-      gi = k_gi[k]
-      li = global_to_local_row[gi]
-      owner = local_row_to_owner[li]
-      if owner != part
-        gj = k_gj[k]
-        v = k_v[k]
-        p = ptrs[owner_to_i[owner]]
-        gi_snd_data[p] = gi
-        gj_snd_data[p] = gj
-        jo_snd_data[p] = k_jo[k]
-        v_snd_data[p]  = v
-        k_v[k] = zero(v)
-        ptrs[owner_to_i[owner]] += 1
-      end
-    end
-    PArrays.rewind_ptrs!(ptrs)
-    gi_snd = JaggedArray(gi_snd_data,ptrs)
-    gj_snd = JaggedArray(gj_snd_data,ptrs)
-    jo_snd = JaggedArray(jo_snd_data,ptrs)
-    v_snd = JaggedArray(v_snd_data,ptrs)
-    gi_snd, gj_snd, jo_snd, v_snd
-  end
-  """
-    Pushes to coo_entries_with_column_owner the tuples 
-    gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes
-  """
-  function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv)
-    k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner
-    current_n = length(k_gi)
-    new_n = current_n + length(gi_rcv.data)
-    resize!(k_gi,new_n)
-    resize!(k_gj,new_n)
-    resize!(k_jo,new_n)
-    resize!(k_v,new_n)
-    for p in 1:length(gi_rcv.data)
-        k_gi[current_n+p] = gi_rcv.data[p]
-        k_gj[current_n+p] = gj_rcv.data[p]
-        k_jo[current_n+p] = jo_rcv.data[p]
-        k_v[current_n+p] = v_rcv.data[p]
-    end
-  end
-  part = linear_indices(row_partition)
-  parts_snd, parts_rcv = assembly_neighbors(row_partition)
-  coo_entries_with_column_owner = map(tuple,I,J,Jown,V)
-  gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays
-  graph = ExchangeGraph(parts_snd,parts_rcv)
-  t1 = exchange(gi_snd,graph)
-  t2 = exchange(gj_snd,graph)
-  t3 = exchange(jo_snd,graph)
-  t4 = exchange(v_snd,graph)
-  @async begin
-      gi_rcv = fetch(t1)
-      gj_rcv = fetch(t2)
-      jo_rcv = fetch(t3)
-      v_rcv = fetch(t4)
-      map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv)
-      I,J,Jown,V
-  end
-end
-
-# dofs_gids_prange can be either test_gids or trial_gids
-# In the former case, gids is a vector of global test dof identifiers, while in the 
-# latter, a vector of global trial dof identifiers
-function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...)
-  if !ghost
-    _setup_prange_without_ghosts(dofs_gids_prange)
-  elseif isa(owners,Nothing)
-    _setup_prange_with_ghosts(dofs_gids_prange,gids)
-  else
-    _setup_prange_with_ghosts(dofs_gids_prange,gids,owners)
-  end
-end
-
-function _setup_prange(
-  dofs_gids_prange::AbstractVector{<:PRange},
-  gids::AbstractMatrix;
-  ax=:rows,ghost=true,owners=nothing
-)
-  @check ax ∈ (:rows,:cols)
-  block_ids = LinearIndices(dofs_gids_prange)
-  pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x))
-
-  gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange
-    gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id]
-    gids_union = pvcat(gids_slice)
-
-    owners_union = nothing
-    if !isnothing(owners)
-      owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id]
-      owners_union = pvcat(owners_slice)
-    end
-
-    return gids_union, owners_union
-  end |> tuple_of_arrays
-  
-  return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union)
-end
-
-# Create PRange for the rows of the linear system
-# without local ghost dofs as per required in the 
-# FullyAssembledRows() parallel assembly strategy 
-function _setup_prange_without_ghosts(dofs_gids_prange::PRange)
-  ngdofs = length(dofs_gids_prange)
-  indices = map(partition(dofs_gids_prange)) do dofs_indices 
-    owner = part_id(dofs_indices)
-    own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
-    ghost_indices = GhostIndices(ngdofs,Int64[],Int32[])
-    OwnAndGhostIndices(own_indices,ghost_indices)
-  end
-  return PRange(indices)
-end
-
-# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition
-# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system.
-# The precondition required for the consistency of any parallel assembly process in GridapDistributed 
-# is that each processor can determine locally with a single layer of ghost cells the global indices and associated 
-# processor owners of the rows that it touches after assembly of integration terms posed on locally-owned entities 
-# (i.e., either cells or faces). 
-function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids)
-  ngdofs = length(dofs_gids_prange)
-  dofs_gids_partition = partition(dofs_gids_prange)
-
-  # Selected ghost ids -> dof PRange ghost ids
-  gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids)
-
-  # Selected ghost ids -> [global dof ids, owner processor ids]
-  gids_ghost_to_global, gids_ghost_to_owner = map(
-    find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition) |> tuple_of_arrays
-
-  return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner)
-end
-
-# Here we cannot assume that the sparse communication graph underlying 
-# trial_dofs_gids_partition is a superset of the one underlying indices.
-# Here we chould check whether it is included and call _find_neighbours!()
-# if this is the case. At present, we are not taking advantage of this, 
-# but let the parallel scalable algorithm to compute the reciprocal to do the job. 
-function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners)
-  ngdofs = length(dofs_gids_prange)
-  dofs_gids_partition = partition(dofs_gids_prange)
-
-  # Selected ghost ids -> [global dof ids, owner processor ids]
-  gids_ghost_to_global, gids_ghost_to_owner = map(
-    gids,owners,dofs_gids_partition) do gids, owners, indices
-    ghost_touched   = Dict{Int,Bool}()
-    ghost_to_global = Int64[] 
-    ghost_to_owner  = Int64[]
-    me = part_id(indices)
-    for (j,jo) in zip(gids,owners)
-      if jo != me
-        if !haskey(ghost_touched,j)
-          push!(ghost_to_global,j)
-          push!(ghost_to_owner,jo)
-          ghost_touched[j] = true
-        end
-      end
-    end
-    ghost_to_global, ghost_to_owner
-  end |> tuple_of_arrays
-
-  return _setup_prange_impl_(ngdofs,
-                             dofs_gids_partition,
-                             gids_ghost_to_global,
-                             gids_ghost_to_owner;
-                             discover_neighbours=false)
-end 
-
-function _setup_prange_impl_(ngdofs,
-                             dofs_gids_partition,
-                             gids_ghost_to_global,
-                             gids_ghost_to_owner;
-                             discover_neighbours=true)
-  indices = map(dofs_gids_partition, 
-                gids_ghost_to_global, 
-                gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner 
-    owner = part_id(dofs_indices)
-    own_indices   = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
-    ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner)
-    OwnAndGhostIndices(own_indices,ghost_indices)
-  end
-  if discover_neighbours
-    _find_neighbours!(indices,dofs_gids_partition)
-  end
-  return PRange(indices)
-end
diff --git a/src/Assembly.jl b/src/Assembly.jl
new file mode 100644
index 00000000..25e2411f
--- /dev/null
+++ b/src/Assembly.jl
@@ -0,0 +1,199 @@
+
+# Parallel assembly strategies
+
+struct Assembled <: FESpaces.AssemblyStrategy end
+struct SubAssembled <: FESpaces.AssemblyStrategy end
+struct LocallyAssembled <: FESpaces.AssemblyStrategy end
+
+# PSparseMatrix and PVector builders
+
+struct PSparseMatrixBuilder{T,B}
+  local_matrix_type::Type{T}
+  strategy::B
+end
+
+function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv
+  T = eltype(Tv)
+  return PSparseMatrix{T}
+end
+
+struct PVectorBuilder{T,B}
+  local_vector_type::Type{T}
+  strategy::B
+end
+
+function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
+  T = eltype(Tv)
+  return PVector{T}
+end
+
+# Distributed counters and allocators
+
+"""
+    DistributedCounter{S,T,N} <: GridapType
+
+Distributed N-dimensional counter, with local counters of type T.
+Follows assembly strategy S.
+"""
+struct DistributedCounter{S,T,N,A,B} <: GridapType
+  counters :: A
+  axes     :: B
+  strategy :: S
+  function DistributedCounter(
+    counters :: AbstractArray{T},
+    axes     :: NTuple{N,<:PRange},
+    strategy :: Algebra.AssemblyStrategy
+  ) where {T,N}
+    A, B, S = typeof(counters), typeof(axes), typeof(strategy)
+    new{S,T,N,A,B}(counters,axes,strategy)
+  end
+end
+
+Base.axes(a::DistributedCounter) = a.axes
+local_views(a::DistributedCounter) = a.counters
+
+const PVectorCounter{S,T} = DistributedCounter{S,T<:Algebra.ArrayCounter,1}
+Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
+
+const PSparseMatrixCounter{S,T} = DistributedCounter{S,T<:Algebra.CounterCOO,2}
+Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop()
+
+"""
+    DistributedAllocation{S,T,N} <: GridapType
+
+Distributed N-dimensional allocator, with local allocators of type T. 
+Follows assembly strategy S.
+"""
+struct DistributedAllocation{S,T,N,A,B} <: GridapType
+  allocs   :: A
+  axes     :: B
+  strategy :: S
+  function DistributedAllocation(
+    allocs   :: AbstractArray{T},
+    axes     :: NTuple{N,<:PRange},
+    strategy :: Algebra.AssemblyStrategy
+  ) where {T,N}
+    A, B, S = typeof(allocs), typeof(axes), typeof(strategy)
+    new{S,T,N,A,B}(allocs,axes,strategy)
+  end
+end
+
+Base.axes(a::DistributedAllocation) = a.axes
+local_views(a::DistributedAllocation) = a.allocs
+
+const PVectorAllocation{S,T} = DistributedAllocation{S,T,1}
+const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2}
+
+# PSparseMatrix assembly chain
+#
+#   1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter
+#   2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation
+#   3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix
+
+function Algebra.nz_counter(builder::PSparseMatrixBuilder,axs::NTuple{2,<:PRange})
+  rows, cols = axs # test ids, trial ids
+  counters = map(partition(rows),partition(cols)) do rows,cols
+    axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols)))
+    Algebra.CounterCOO{A}(axs)
+  end
+  DistributedCounter(builder.par_strategy,counters,rows,cols)
+end
+
+function Algebra.nz_allocation(a::PSparseMatrixCounter)
+  allocs = map(nz_allocation,local_views(a))
+  DistributedAllocation(allocs,a.axes,a.strategy)
+end
+
+# PVector assembly chain:
+#
+#   1 - nz_counter(PVectorBuilder) -> PVectorCounter
+#   2 - nz_allocation(PVectorCounter) -> PVectorAllocation
+#   3 - create_from_nz(PVectorAllocation) -> PVector
+
+function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) where VT
+  rows, = axs
+  counters = map(partition(rows)) do rows
+    axs = (Base.OneTo(local_length(rows)),)
+    nz_counter(ArrayBuilder(VT),axs)
+  end
+  DistributedCounter(counters,axs,builder.strategy)
+end
+
+function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled})
+  allocs = map(nz_allocation,local_views(a))
+  DistributedAllocation(allocs,a.axes,a.strategy)
+end
+
+function Arrays.nz_allocation(a::PVectorCounter{<:Assembled})
+  values = map(nz_allocation,local_views(a))
+  allocs = map(TrackedArrayAllocation,values)
+  return DistributedAllocation(allocs,a.axes,a.strategy)
+end
+
+function Algebra.create_from_nz(a::PVector)
+  assemble!(a) |> wait
+  return a
+end
+
+function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVector})
+  # This point MUST NEVER be reached. If reached there is an inconsistency
+  # in the parallel code in charge of vector assembly
+  @assert false
+end
+
+function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector})
+  rows = _setup_prange_without_ghosts(axes(a,1))
+  _rhs_callback(a,rows)
+end
+
+function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S
+  rows = _setup_prange_from_pvector_allocation(a)
+  b    = _rhs_callback(a,rows)
+  t2   = assemble!(b)
+  if t2 !== nothing
+    wait(t2)
+  end
+  return b
+end
+
+# PSystem assembly chain:
+#
+# When assembling a full system (matrix + vector), it is more efficient to 
+# overlap communications the assembly of the matrix and the vector.
+# Not only it is faster, but also necessary to ensure identical ghost indices 
+# in both the matrix and vector rows.
+# This is done by using the following specializations chain:
+
+function Arrays.nz_allocation(
+  a::PSparseMatrixCounter{<:Assembled},
+  b::PVectorCounter{<:Assembled}
+)
+  A = nz_allocation(a) # PSparseMatrixAllocation
+  B = nz_allocation(b) # PVectorAllocation{<:Assembled,<:TrackedArrayAllocation}
+  return A, B
+end
+
+function Algebra.create_from_nz(
+  a::PSparseMatrixAllocation{<:LocallyAssembled},
+  b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}
+)
+  function callback(rows)
+    _rhs_callback(b,rows)
+  end
+  A, B = _fa_create_from_nz_with_callback(callback,a)
+  return A, B
+end
+
+function Algebra.create_from_nz(
+  a::PSparseMatrixAllocation{<:Assembled},
+  b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation}
+)
+  function callback(rows)
+    _rhs_callback(b,rows)
+  end
+  function async_callback(b)
+    assemble!(b)
+  end
+  A, B = _sa_create_from_nz_with_callback(callback,async_callback,a,b)
+  return A, B
+end
diff --git a/src/DistributedUtils.jl b/src/DistributedUtils.jl
deleted file mode 100644
index 0b34a0a1..00000000
--- a/src/DistributedUtils.jl
+++ /dev/null
@@ -1,44 +0,0 @@
-
-function permuted_variable_partition(
-  n_own::AbstractArray{<:Integer},
-  gids::AbstractArray{<:AbstractArray{<:Integer}},
-  owners::AbstractArray{<:AbstractArray{<:Integer}};
-  n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))),
-  start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
-)
-  ranks = linear_indices(n_own)
-  np = length(ranks)
-  map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners
-    n_local = length(gids)
-    n_ghost = n_local - n_own
-    perm = fill(zero(Int32),n_local)
-    ghost_gids = fill(zero(Int),n_ghost)
-    ghost_owners = fill(zero(Int32),n_ghost)
-
-    n_ghost = 0
-    for (lid,(gid,owner)) in enumerate(zip(gids,owners))
-      if owner == rank
-        perm[lid] = gid-start+1
-      else
-        n_ghost += 1
-        ghost_gids[n_ghost] = gid
-        ghost_owners[n_ghost] = owner
-        perm[lid] = n_own + n_ghost
-      end
-    end
-    @assert n_ghost == n_local - n_own
-
-    ghost = GhostIndices(n_global,ghost_gids,ghost_owners)
-    dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize(
-      CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost
-    )
-    permute_indices(dof_ids,perm)
-  end
-end
-
-function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T
-  ptrs = Vector{Int32}(undef,length(vv)+1)
-  Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv)
-  Arrays.length_to_ptrs!(ptrs)
-  return ptrs
-end
diff --git a/src/Geometry.jl b/src/Geometry.jl
index 4c788242..0f814b3c 100644
--- a/src/Geometry.jl
+++ b/src/Geometry.jl
@@ -582,7 +582,7 @@ function filter_cells_when_needed(
 end
 
 function filter_cells_when_needed(
-  portion::FullyAssembledRows,
+  portion::LocallyAssembled,
   cell_gids::AbstractLocalIndices,
   trian::Triangulation)
 
@@ -590,7 +590,7 @@ function filter_cells_when_needed(
 end
 
 function filter_cells_when_needed(
-  portion::SubAssembledRows,
+  portion::Assembled,
   cell_gids::AbstractLocalIndices,
   trian::Triangulation)
 
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index 7edf2a48..b55ddfe8 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -42,12 +42,15 @@ export with_ghost, no_ghost
 
 export redistribute
 
-include("DistributedUtils.jl")
+include("GridapExtras.jl")
+include("PArraysExtras.jl")
 
 include("BlockPartitionedArrays.jl")
 
 include("Algebra.jl")
 
+include("Assembly.jl")
+
 include("Geometry.jl")
 
 include("CellData.jl")
diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl
new file mode 100644
index 00000000..df5b1186
--- /dev/null
+++ b/src/GridapExtras.jl
@@ -0,0 +1,55 @@
+
+# Extensions to Gridap/Arrays/Tables.jl
+
+function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T
+  ptrs = Vector{Int32}(undef,length(vv)+1)
+  Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv)
+  Arrays.length_to_ptrs!(ptrs)
+  return ptrs
+end
+
+# New type of Vector allocation
+
+"""
+    TrackedArrayAllocation{T} <: GridapType
+
+Array that keeps track of which entries have been touched. For assembly purposes.
+"""
+struct TrackedArrayAllocation{T}
+  values  :: Vector{T}
+  touched :: Vector{Bool}
+end
+
+function TrackedArrayAllocation(values)
+  touched = fill(false,length(values))
+  TrackedArrayAllocation(values,touched)
+end
+
+Algebra.LoopStyle(::Type{<:TrackedArrayAllocation}) = Algebra.Loop()
+Algebra.create_from_nz(a::TrackedArrayAllocation) = a.values
+
+@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i)
+  if i > 0
+    a.touched[i] = true
+  end
+  nothing
+end
+@inline function Arrays.add_entry!(combine::Function,a::TrackedArrayAllocation,v,i)
+  if i > 0
+    a.touched[i] = true
+    a.values[i] = combine(v,a.values[i])
+  end
+  nothing
+end
+@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v::Nothing,i)
+  for ie in i
+    Arrays.add_entry!(combine,a,nothing,ie)
+  end
+  nothing
+end
+@inline function Arrays.add_entries!(combine::Function,a::TrackedArrayAllocation,v,i)
+  for (ve,ie) in zip(v,i)
+    Arrays.add_entry!(combine,a,ve,ie)
+  end
+  nothing
+end
diff --git a/src/MultiField.jl b/src/MultiField.jl
index 689a0621..75e55d83 100644
--- a/src/MultiField.jl
+++ b/src/MultiField.jl
@@ -453,7 +453,7 @@ function FESpaces.SparseMatrixAssembler(
   local_vec_type,
   trial::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}},
   test::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle{NB,SB,P}},
-  par_strategy=SubAssembledRows()) where {NB,SB,P}
+  par_strategy=Assembled()) where {NB,SB,P}
 
   block_idx  = CartesianIndices((NB,NB))
   block_rows = blocks(test.gids)
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
new file mode 100644
index 00000000..f4a5ae2e
--- /dev/null
+++ b/src/PArraysExtras.jl
@@ -0,0 +1,210 @@
+
+"""
+    permuted_variable_partition(n_own,gids,owners; n_global, start)
+
+Create indices which are a permuted version of a variable_partition.
+The advantage of this wrt the `LocalIndices` type, is that we can compute 
+dof owners with minimal communications, since we only need the size of the blocks.
+
+NOTE: This is the type for our FESpace dof_ids.
+"""
+function permuted_variable_partition(
+  n_own::AbstractArray{<:Integer},
+  gids::AbstractArray{<:AbstractArray{<:Integer}},
+  owners::AbstractArray{<:AbstractArray{<:Integer}};
+  n_global=reduction(+,n_own,destination=:all,init=zero(eltype(n_own))),
+  start=scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
+)
+  ranks = linear_indices(n_own)
+  np = length(ranks)
+  map(ranks,n_own,n_global,start,gids,owners) do rank,n_own,n_global,start,gids,owners
+    n_local = length(gids)
+    n_ghost = n_local - n_own
+    perm = fill(zero(Int32),n_local)
+    ghost_gids = fill(zero(Int),n_ghost)
+    ghost_owners = fill(zero(Int32),n_ghost)
+
+    n_ghost = 0
+    for (lid,(gid,owner)) in enumerate(zip(gids,owners))
+      if owner == rank
+        perm[lid] = gid-start+1
+      else
+        n_ghost += 1
+        ghost_gids[n_ghost] = gid
+        ghost_owners[n_ghost] = owner
+        perm[lid] = n_own + n_ghost
+      end
+    end
+    @assert n_ghost == n_local - n_own
+
+    ghost = GhostIndices(n_global,ghost_gids,ghost_owners)
+    dof_ids = PartitionedArrays.LocalIndicesWithVariableBlockSize(
+      CartesianIndex((rank,)),(np,),(n_global,),((1:n_own).+(start-1),),ghost
+    )
+    permute_indices(dof_ids,perm)
+  end
+end
+
+# Linear algebra
+
+function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
+  @check partition(axes(x,1)) === partition(axes(y,1))
+  map(partition(x),partition(y)) do x,y
+    LinearAlgebra.axpy!(α,x,y)
+  end
+  consistent!(y) |> wait
+  return y
+end
+
+function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
+  map(blocks(x),blocks(y)) do x,y
+    LinearAlgebra.axpy!(α,x,y)
+  end
+  return y
+end
+
+function Algebra.axpy_entries!(
+  α::Number, A::PSparseMatrix, B::PSparseMatrix;
+  check::Bool=true
+)
+# We should definitely check here that the index partitions are the same. 
+# However: Because the different matrices are assembled separately, the objects are not the 
+# same (i.e can't use ===). Checking the index partitions would then be costly...
+  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
+  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
+  map(partition(A),partition(B)) do A, B
+    Algebra.axpy_entries!(α,A,B;check)
+  end
+  return B
+end
+
+function Algebra.axpy_entries!(
+  α::Number, A::BlockPMatrix, B::BlockPMatrix;
+  check::Bool=true
+)
+  map(blocks(A),blocks(B)) do A, B
+    Algebra.axpy_entries!(α,A,B;check)
+  end
+  return B
+end
+
+# Array of PArrays -> PArray of Arrays 
+# TODO: I think this is now implemented in PartitionedArrays.jl (check)
+function to_parray_of_arrays(a::AbstractArray{<:MPIArray})
+  indices = linear_indices(first(a))
+  map(indices) do i
+    map(a) do aj
+      PartitionedArrays.getany(aj)
+    end
+  end
+end
+
+function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
+  indices = linear_indices(first(a))
+  map(indices) do i
+    map(a) do aj
+      aj.items[i]
+    end
+  end
+end
+
+# This type is required because MPIArray from PArrays 
+# cannot be instantiated with a NULL communicator
+struct MPIVoidVector{T} <: AbstractVector{T}
+  comm::MPI.Comm
+  function MPIVoidVector(::Type{T}) where {T}
+    new{T}(MPI.COMM_NULL)
+  end
+end
+
+Base.size(a::MPIVoidVector) = (0,)
+Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear()
+function Base.getindex(a::MPIVoidVector,i::Int)
+  error("Indexing of MPIVoidVector not possible.")
+end
+function Base.setindex!(a::MPIVoidVector,v,i::Int)
+  error("Indexing of MPIVoidVector not possible.")
+end
+function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector)
+  println(io,"MPIVoidVector")
+end
+
+# Communication extras, subpartitioning extras
+
+function num_parts(comm::MPI.Comm)
+  if comm != MPI.COMM_NULL
+    nparts = MPI.Comm_size(comm)
+  else
+    nparts = -1
+  end
+  nparts
+end
+@inline num_parts(comm::MPIArray) = num_parts(comm.comm)
+@inline num_parts(comm::DebugArray) = length(comm.items)
+@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm)
+
+function get_part_id(comm::MPI.Comm)
+  if comm != MPI.COMM_NULL
+    id = MPI.Comm_rank(comm)+1
+  else
+    id = -1
+  end
+  id
+end
+@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm)
+@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm)
+
+"""
+    i_am_in(comm::MPIArray)
+    i_am_in(comm::DebugArray)
+  
+  Returns `true` if the processor is part of the subcommunicator `comm`.
+"""
+function i_am_in(comm::MPI.Comm)
+  get_part_id(comm) >=0
+end
+@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm)
+@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm)
+@inline i_am_in(comm::DebugArray) = true
+
+function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing)
+  x_new = map(new_parts) do p
+    if isa(x,MPIArray)
+      PartitionedArrays.getany(x)
+    elseif isa(x,DebugArray) && (p <= length(x.items))
+      x.items[p]
+    else
+      default
+    end
+  end
+  return x_new
+end
+
+function generate_subparts(parts::MPIArray,new_comm_size)
+  root_comm = parts.comm
+  root_size = MPI.Comm_size(root_comm)
+  rank = MPI.Comm_rank(root_comm)
+
+  @static if isdefined(MPI,:MPI_UNDEFINED)
+    mpi_undefined = MPI.MPI_UNDEFINED[]
+  else
+    mpi_undefined = MPI.API.MPI_UNDEFINED[]
+  end
+  
+  if root_size == new_comm_size
+    return parts
+  else
+    if rank < new_comm_size
+      comm = MPI.Comm_split(root_comm,0,0)
+      return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false)
+    else
+      comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined)
+      return MPIVoidVector(eltype(parts))
+    end
+  end
+end
+
+function generate_subparts(parts::DebugArray,new_comm_size)
+  DebugArray(LinearIndices((new_comm_size,)))
+end
+
diff --git a/src/_Algebra_old.jl b/src/_Algebra_old.jl
new file mode 100644
index 00000000..e5aefd6a
--- /dev/null
+++ b/src/_Algebra_old.jl
@@ -0,0 +1,1238 @@
+
+# Vector allocation 
+
+function Algebra.allocate_vector(::Type{<:PVector{V}},ids::PRange) where {V}
+  PVector{V}(undef,partition(ids))
+end
+
+function Algebra.allocate_vector(::Type{<:BlockPVector{V}},ids::BlockPRange) where {V}
+  BlockPVector{V}(undef,ids)
+end
+
+function Algebra.allocate_in_range(matrix::PSparseMatrix)
+  V = Vector{eltype(matrix)}
+  allocate_in_range(PVector{V},matrix)
+end
+
+function Algebra.allocate_in_domain(matrix::PSparseMatrix)
+  V = Vector{eltype(matrix)}
+  allocate_in_domain(PVector{V},matrix)
+end
+
+function Algebra.allocate_in_range(matrix::BlockPMatrix)
+  V = Vector{eltype(matrix)}
+  allocate_in_range(BlockPVector{V},matrix)
+end
+
+function Algebra.allocate_in_domain(matrix::BlockPMatrix)
+  V = Vector{eltype(matrix)}
+  allocate_in_domain(BlockPVector{V},matrix)
+end
+
+# PartitionedArrays extras
+
+function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
+  @check partition(axes(x,1)) === partition(axes(y,1))
+  map(partition(x),partition(y)) do x,y
+    LinearAlgebra.axpy!(α,x,y)
+  end
+  consistent!(y) |> wait
+  return y
+end
+
+function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
+  map(blocks(x),blocks(y)) do x,y
+    LinearAlgebra.axpy!(α,x,y)
+  end
+  return y
+end
+
+function Algebra.axpy_entries!(
+  α::Number, A::PSparseMatrix, B::PSparseMatrix;
+  check::Bool=true
+)
+# We should definitely check here that the index partitions are the same. 
+# However: Because the different matrices are assembled separately, the objects are not the 
+# same (i.e can't use ===). Checking the index partitions would then be costly...
+  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
+  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
+  map(partition(A),partition(B)) do A, B
+    Algebra.axpy_entries!(α,A,B;check)
+  end
+  return B
+end
+
+function Algebra.axpy_entries!(
+  α::Number, A::BlockPMatrix, B::BlockPMatrix;
+  check::Bool=true
+)
+  map(blocks(A),blocks(B)) do A, B
+    Algebra.axpy_entries!(α,A,B;check)
+  end
+  return B
+end
+
+# TODO: Can we remove this?
+# This might go to Gridap in the future. We keep it here for the moment.
+function change_axes(a::Algebra.ArrayCounter,axes)
+  @notimplemented
+end
+
+# This might go to Gridap in the future. We keep it here for the moment.
+function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A}
+  b = Algebra.CounterCOO{T}(axes)
+  b.nnz = a.nnz
+  b
+end
+
+# This might go to Gridap in the future. We keep it here for the moment.
+function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A}
+  counter = change_axes(a.counter,axes)
+  Algebra.AllocationCOO(counter,a.I,a.J,a.V)
+end
+
+# Array of PArrays -> PArray of Arrays 
+function to_parray_of_arrays(a::AbstractArray{<:MPIArray})
+  indices = linear_indices(first(a))
+  map(indices) do i
+    map(a) do aj
+      PartitionedArrays.getany(aj)
+    end
+  end
+end
+
+function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
+  indices = linear_indices(first(a))
+  map(indices) do i
+    map(a) do aj
+      aj.items[i]
+    end
+  end
+end
+
+# This type is required because MPIArray from PArrays 
+# cannot be instantiated with a NULL communicator
+struct MPIVoidVector{T} <: AbstractVector{T}
+  comm::MPI.Comm
+  function MPIVoidVector(::Type{T}) where {T}
+    new{T}(MPI.COMM_NULL)
+  end
+end
+
+Base.size(a::MPIVoidVector) = (0,)
+Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear()
+function Base.getindex(a::MPIVoidVector,i::Int)
+  error("Indexing of MPIVoidVector not possible.")
+end
+function Base.setindex!(a::MPIVoidVector,v,i::Int)
+  error("Indexing of MPIVoidVector not possible.")
+end
+function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector)
+  println(io,"MPIVoidVector")
+end
+
+function num_parts(comm::MPI.Comm)
+  if comm != MPI.COMM_NULL
+    nparts = MPI.Comm_size(comm)
+  else
+    nparts = -1
+  end
+  nparts
+end
+@inline num_parts(comm::MPIArray) = num_parts(comm.comm)
+@inline num_parts(comm::DebugArray) = length(comm.items)
+@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm)
+
+function get_part_id(comm::MPI.Comm)
+  if comm != MPI.COMM_NULL
+    id = MPI.Comm_rank(comm)+1
+  else
+    id = -1
+  end
+  id
+end
+@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm)
+@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm)
+
+"""
+    i_am_in(comm::MPIArray)
+    i_am_in(comm::DebugArray)
+  
+  Returns `true` if the processor is part of the subcommunicator `comm`.
+"""
+function i_am_in(comm::MPI.Comm)
+  get_part_id(comm) >=0
+end
+@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm)
+@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm)
+@inline i_am_in(comm::DebugArray) = true
+
+function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing)
+  x_new = map(new_parts) do p
+    if isa(x,MPIArray)
+      PartitionedArrays.getany(x)
+    elseif isa(x,DebugArray) && (p <= length(x.items))
+      x.items[p]
+    else
+      default
+    end
+  end
+  return x_new
+end
+
+function generate_subparts(parts::MPIArray,new_comm_size)
+  root_comm = parts.comm
+  root_size = MPI.Comm_size(root_comm)
+  rank = MPI.Comm_rank(root_comm)
+
+  @static if isdefined(MPI,:MPI_UNDEFINED)
+    mpi_undefined = MPI.MPI_UNDEFINED[]
+  else
+    mpi_undefined = MPI.API.MPI_UNDEFINED[]
+  end
+  
+  if root_size == new_comm_size
+    return parts
+  else
+    if rank < new_comm_size
+      comm = MPI.Comm_split(root_comm,0,0)
+      return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false)
+    else
+      comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined)
+      return MPIVoidVector(eltype(parts))
+    end
+  end
+end
+
+function generate_subparts(parts::DebugArray,new_comm_size)
+  DebugArray(LinearIndices((new_comm_size,)))
+end
+
+# local_views
+
+function local_views(a)
+  @abstractmethod
+end
+
+function get_parts(a)
+  return linear_indices(local_views(a))
+end
+
+function local_views(a::AbstractVector,rows)
+  @notimplemented
+end
+
+function local_views(a::AbstractMatrix,rows,cols)
+  @notimplemented
+end
+
+local_views(a::AbstractArray) = a
+local_views(a::PRange) = partition(a)
+local_views(a::PVector) = partition(a)
+local_views(a::PSparseMatrix) = partition(a)
+
+function local_views(a::BlockPRange)
+  map(blocks(a)) do a
+    local_views(a)
+  end |> to_parray_of_arrays
+end
+
+function local_views(a::BlockPArray)
+  vals = map(blocks(a)) do a
+    local_views(a)
+  end |> to_parray_of_arrays
+  return map(mortar,vals)
+end
+
+# change_ghost
+
+function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T
+  same_partition = (a.index_partition === partition(ids))
+  a_new = same_partition ? a : change_ghost(T,a,ids)
+  if make_consistent && (!same_partition || !is_consistent)
+    consistent!(a_new) |> wait
+  end
+  return a_new
+end
+
+function change_ghost(::Type{<:AbstractVector},a::PVector,ids::PRange)
+  a_new = similar(a,eltype(a),(ids,))
+  # Equivalent to copy!(a_new,a) but does not check that owned indices match
+  map(copy!,own_values(a_new),own_values(a))
+  return a_new
+end
+
+function change_ghost(::Type{<:OwnAndGhostVectors},a::PVector,ids::PRange)
+  values = map(own_values(a),partition(ids)) do own_vals,ids
+    ghost_vals = fill(zero(eltype(a)),ghost_length(ids))
+    perm = PartitionedArrays.local_permutation(ids)
+    OwnAndGhostVectors(own_vals,ghost_vals,perm)
+  end
+  return PVector(values,partition(ids))
+end
+
+function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_consistent=false)
+  vals = map(blocks(a),blocks(ids)) do a, ids
+    change_ghost(a,ids;is_consistent=is_consistent,make_consistent=make_consistent)
+  end
+  return BlockPVector(vals,ids)
+end
+
+# TODO: Can we remove this?
+# This function computes a mapping among the local identifiers of a and b
+# for which the corresponding global identifiers are both in a and b. 
+# Note that the haskey check is necessary because in the general case 
+# there might be gids in b which are not present in a
+function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices)
+  a_local_to_b_local = fill(Int32(-1),local_length(a))
+  b_local_to_global  = local_to_global(b)
+  a_global_to_local  = global_to_local(a)
+  for blid in 1:local_length(b)
+    gid = b_local_to_global[blid]
+    if a_global_to_local[gid] != zero(eltype(a_global_to_local))
+      alid = a_global_to_local[gid]
+      a_local_to_b_local[alid] = blid
+    end  
+  end
+  a_local_to_b_local
+end
+
+# TODO: Can we remove this?
+# This type is required in order to be able to access the local portion 
+# of distributed sparse matrices and vectors using local indices from the 
+# distributed test and trial spaces
+struct LocalView{T,N,A,B} <:AbstractArray{T,N}
+  plids_to_value::A
+  d_to_lid_to_plid::B
+  local_size::NTuple{N,Int}
+  function LocalView(
+    plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N}
+    A = typeof(plids_to_value)
+    B = typeof(d_to_lid_to_plid)
+    local_size = map(length,d_to_lid_to_plid)
+    new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size)
+  end
+end
+
+Base.size(a::LocalView) = a.local_size
+Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian()
+function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N}
+  plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid)
+  if all(i->i>0,plids)
+    a.plids_to_value[plids...]
+  else
+    zero(T)
+  end
+end
+function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N}
+  plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid)
+  @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion"
+  a.plids_to_value[plids...] = v
+end
+
+function _lid_to_plid(lid,lid_to_plid)
+  plid = lid_to_plid[lid]
+  plid
+end
+
+function local_views(a::PVector,new_rows::PRange)
+  old_rows = axes(a,1)
+  if partition(old_rows) === partition(new_rows)
+    partition(a)
+  else
+    map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows
+      LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),))
+    end
+  end
+end
+
+function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange)
+  old_rows, old_cols = axes(a)
+  if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) )
+    partition(a)
+  else
+    map(partition(a),
+        partition(old_rows),partition(old_cols),
+        partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols
+      rl2lmap = find_local_to_local_map(new_rows,old_rows)
+      cl2lmap = find_local_to_local_map(new_cols,old_cols)
+      LocalView(matrix_partition,(rl2lmap,cl2lmap))
+    end
+  end
+end
+
+function local_views(a::BlockPVector,new_rows::BlockPRange)
+  vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays
+  return map(mortar,vals)
+end
+
+function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange)
+  vals = map(CartesianIndices(blocksize(a))) do I
+    local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I])
+  end |> to_parray_of_arrays
+  return map(mortar,vals)
+end
+
+# PSparseMatrix assembly
+
+struct Assembled <: FESpaces.AssemblyStrategy end
+struct SubAssembled <: FESpaces.AssemblyStrategy end
+struct LocallyAssembled <: FESpaces.AssemblyStrategy end
+
+struct PSparseMatrixBuilderCOO{T,B}
+  local_matrix_type::Type{T}
+  par_strategy::B
+end
+
+function Algebra.nz_counter(
+  builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}
+) where A
+  rows, cols = axs # test ids, trial ids
+  counters = map(partition(rows),partition(cols)) do r,c
+    axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c)))
+    Algebra.CounterCOO{A}(axs)
+  end
+  DistributedCounterCOO(builder.par_strategy,counters,rows,cols)
+end
+
+function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv
+  T = eltype(Tv)
+  return PSparseMatrix{T}
+end
+
+
+"""
+"""
+struct DistributedCounterCOO{A,B,C,D} <: GridapType
+  par_strategy::A
+  counters    ::B
+  test_gids   ::C
+  trial_gids  ::D
+  function DistributedCounterCOO(
+    par_strategy,
+    counters::AbstractArray{<:Algebra.CounterCOO},
+    test_gids::PRange,
+    trial_gids::PRange
+  )
+    A = typeof(par_strategy)
+    B = typeof(counters)
+    C = typeof(test_gids)
+    D = typeof(trial_gids)
+    new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids)
+  end
+end
+
+function local_views(a::DistributedCounterCOO)
+  return a.counters
+end
+
+function local_views(a::DistributedCounterCOO,test_gids,trial_gids)
+  @check PArrays.matching_local_indices(test_gids,a.test_gids)
+  @check PArrays.matching_local_indices(trial_gids,a.trial_gids)
+  return a.counters
+end
+
+function Algebra.nz_allocation(a::DistributedCounterCOO)
+  allocs = map(nz_allocation,a.counters) # Array{AllocationCOO}
+  DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids)
+end
+
+struct DistributedAllocationCOO{A,B,C,D} <: GridapType
+  par_strategy::A
+  allocs::B
+  test_gids::C
+  trial_gids::D
+  function DistributedAllocationCOO(
+    par_strategy,
+    allocs::AbstractArray{<:Algebra.AllocationCOO},
+    test_gids::PRange,
+    trial_gids::PRange)
+    A = typeof(par_strategy)
+    B = typeof(allocs)
+    C = typeof(test_gids)
+    D = typeof(trial_gids)
+    new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids)
+  end
+end
+
+function change_axes(
+  a::DistributedAllocationCOO{A,B,<:PRange,<:PRange},
+  axes::Tuple{<:PRange,<:PRange}
+) where {A,B}
+  local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols
+    (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols)))
+  end
+  allocs = map(change_axes,a.allocs,local_axes)
+  DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2])
+end
+
+function change_axes(
+  a::MatrixBlock{<:DistributedAllocationCOO},
+  axes::Tuple{<:Vector,<:Vector}
+)
+  block_ids  = CartesianIndices(a.array)
+  rows, cols = axes
+  array = map(block_ids) do I
+    change_axes(a[I],(rows[I[1]],cols[I[2]]))
+  end
+  return ArrayBlock(array,a.touched)
+end
+
+function local_views(a::DistributedAllocationCOO)
+  a.allocs
+end
+
+function local_views(a::DistributedAllocationCOO,test_gids,trial_gids)
+  @check test_gids === a.test_gids
+  @check trial_gids === a.trial_gids
+  a.allocs
+end
+
+function local_views(a::MatrixBlock{<:DistributedAllocationCOO})
+  array = map(local_views,a.array) |> to_parray_of_arrays
+  return map(ai -> ArrayBlock(ai,a.touched),array)
+end
+
+function get_allocations(a::DistributedAllocationCOO)
+  I,J,V = map(local_views(a)) do alloc
+    alloc.I, alloc.J, alloc.V
+  end |> tuple_of_arrays
+  return I,J,V
+end
+
+function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO})
+  tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays
+  return tuple_of_array_of_parrays
+end
+
+get_test_gids(a::DistributedAllocationCOO)  = a.test_gids
+get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids
+get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO})  = map(get_test_gids,diag(a.array))
+get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array))
+
+function Algebra.create_from_nz(a::PSparseMatrix)
+  # For LocallyAssembled the underlying Exchanger should
+  # not have ghost layer making assemble! do nothing (TODO check)
+  assemble!(a) |> wait
+  a
+end
+
+function Algebra.create_from_nz(a::DistributedAllocationCOO{<:LocallyAssembled})
+  f(x) = nothing
+  A, = _fa_create_from_nz_with_callback(f,a)
+  return A
+end
+
+function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:LocallyAssembled}})
+  f(x) = nothing
+  A, = _fa_create_from_nz_with_callback(f,a)
+  return A
+end
+
+function _fa_create_from_nz_with_callback(callback,a)
+
+  # Recover some data
+  I,J,V = get_allocations(a)
+  test_gids  = get_test_gids(a)
+  trial_gids = get_trial_gids(a)
+
+  rows = _setup_prange(test_gids,I;ghost=false,ax=:rows)
+  b = callback(rows)
+
+  # convert I and J to global dof ids
+  to_global_indices!(I,test_gids;ax=:rows)
+  to_global_indices!(J,trial_gids;ax=:cols)
+
+  # Create the range for cols
+  cols = _setup_prange(trial_gids,J;ax=:cols)
+
+  # Convert again I,J to local numeration
+  to_local_indices!(I,rows;ax=:rows)
+  to_local_indices!(J,cols;ax=:cols)
+
+  A = _setup_matrix(I,J,V,rows,cols)
+  return A, b
+end
+
+function Algebra.create_from_nz(a::DistributedAllocationCOO{<:Assembled})
+  f(x) = nothing
+  A, = _sa_create_from_nz_with_callback(f,f,a,nothing)
+  return A
+end
+
+function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:Assembled}})
+  f(x) = nothing
+  A, = _sa_create_from_nz_with_callback(f,f,a,nothing)
+  return A
+end
+
+function _sa_create_from_nz_with_callback(callback,async_callback,a,b)
+  # Recover some data
+  I,J,V = get_allocations(a)
+  test_gids = get_test_gids(a)
+  trial_gids = get_trial_gids(a)
+
+  # convert I and J to global dof ids
+  to_global_indices!(I,test_gids;ax=:rows)
+  to_global_indices!(J,trial_gids;ax=:cols)
+
+  # Create the Prange for the rows
+  rows = _setup_prange(test_gids,I;ax=:rows)
+  
+  # Move (I,J,V) triplets to the owner process of each row I.
+  # The triplets are accompanyed which Jo which is the process column owner
+  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  t  = _assemble_coo!(I,J,V,rows;owners=Jo)
+
+  # Here we can overlap computations
+  # This is a good place to overlap since
+  # sending the matrix rows is a lot of data
+  if !isa(b,Nothing)
+    bprange=_setup_prange_from_pvector_allocation(b)
+    b = callback(bprange)
+  end
+
+  # Wait the transfer to finish
+  wait(t)
+
+  # Create the Prange for the cols
+  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
+
+  # Overlap rhs communications with CSC compression
+  t2 = async_callback(b)
+
+  # Convert again I,J to local numeration
+  to_local_indices!(I,rows;ax=:rows)
+  to_local_indices!(J,cols;ax=:cols)
+
+  A = _setup_matrix(a,I,J,V,rows,cols)
+
+  # Wait the transfer to finish
+  if !isa(t2,Nothing)
+    wait(t2)
+  end
+  
+  return A, b
+end
+
+
+# PVector assembly 
+
+struct PVectorBuilder{T,B}
+  local_vector_type::Type{T}
+  par_strategy::B
+end
+
+function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange})
+  T = builder.local_vector_type
+  rows, = axs
+  counters = map(partition(rows)) do rows
+    axs = (Base.OneTo(local_length(rows)),)
+    nz_counter(ArrayBuilder(T),axs)
+  end
+  PVectorCounter(builder.par_strategy,counters,rows)
+end
+
+function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
+  T = eltype(Tv)
+  return PVector{T}
+end
+
+struct PVectorCounter{A,B,C}
+  par_strategy::A
+  counters::B
+  test_gids::C
+end
+
+Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
+
+function local_views(a::PVectorCounter)
+  a.counters
+end
+
+function local_views(a::PVectorCounter,rows)
+  @check rows === a.test_gids
+  a.counters
+end
+
+function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled})
+  dofs = a.test_gids
+  values = map(nz_allocation,a.counters)
+  PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs)
+end
+
+struct PVectorAllocationTrackOnlyValues{A,B,C}
+  par_strategy::A
+  values::B
+  test_gids::C
+end
+
+function local_views(a::PVectorAllocationTrackOnlyValues)
+  a.values
+end
+
+function local_views(a::PVectorAllocationTrackOnlyValues,rows)
+  @check rows === a.test_gids
+  a.values
+end
+
+function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:LocallyAssembled})
+  rows = _setup_prange_without_ghosts(a.test_gids)
+  _rhs_callback(a,rows)
+end
+
+function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:Assembled})
+  # This point MUST NEVER be reached. If reached there is an inconsistency
+  # in the parallel code in charge of vector assembly
+  @assert false
+end
+
+function _rhs_callback(row_partitioned_vector_partition,rows)
+  # The ghost values in row_partitioned_vector_partition are 
+  # aligned with the FESpace but not with the ghost values in the rows of A
+  b_fespace = PVector(row_partitioned_vector_partition.values,
+                      partition(row_partitioned_vector_partition.test_gids))
+
+  # This one is aligned with the rows of A
+  b = similar(b_fespace,eltype(b_fespace),(rows,))
+
+  # First transfer owned values
+  b .= b_fespace
+
+  # Now transfer ghost
+  function transfer_ghost(b,b_fespace,ids,ids_fespace)
+    num_ghosts_vec = ghost_length(ids)
+    gho_to_loc_vec = ghost_to_local(ids)
+    loc_to_glo_vec = local_to_global(ids)
+    gid_to_lid_fe  = global_to_local(ids_fespace)
+    for ghost_lid_vec in 1:num_ghosts_vec
+      lid_vec     = gho_to_loc_vec[ghost_lid_vec]
+      gid         = loc_to_glo_vec[lid_vec]
+      lid_fespace = gid_to_lid_fe[gid]
+      b[lid_vec] = b_fespace[lid_fespace]
+    end
+  end
+  map(
+    transfer_ghost,
+    partition(b),
+    partition(b_fespace),
+    b.index_partition,
+    b_fespace.index_partition
+  )
+
+  return b
+end
+
+function Algebra.create_from_nz(a::PVector)
+  assemble!(a) |> wait
+  return a
+end
+
+function Algebra.create_from_nz(
+  a::DistributedAllocationCOO{<:LocallyAssembled},
+  c_fespace::PVectorAllocationTrackOnlyValues{<:LocallyAssembled}
+)
+
+  function callback(rows)
+    _rhs_callback(c_fespace,rows)
+  end
+
+  A,b = _fa_create_from_nz_with_callback(callback,a)
+  return A,b
+end
+
+struct PVectorAllocationTrackTouchedAndValues{A,B,C}
+  allocations::A
+  values::B
+  test_gids::C
+end
+
+function Algebra.create_from_nz(
+  a::DistributedAllocationCOO{<:Assembled},
+  b::PVectorAllocationTrackTouchedAndValues
+)
+
+  function callback(rows)
+    _rhs_callback(b,rows)
+  end
+
+  function async_callback(b)
+    # now we can assemble contributions
+    assemble!(b)
+  end
+
+  A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b)
+  return A,b
+end
+
+struct ArrayAllocationTrackTouchedAndValues{A}
+  touched::Vector{Bool}
+  values::A
+end
+
+Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop()
+
+function local_views(a::PVectorAllocationTrackTouchedAndValues,rows)
+  @check PArrays.matching_local_indices(rows, a.test_gids)
+  a.allocations
+end
+
+@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j)
+  @notimplemented
+end
+@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i)
+  if i > 0
+    a.touched[i]=true
+    if !isnothing(v)
+      a.values[i]=c(v,a.values[i])
+    end
+  end
+  nothing
+end
+@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j)
+  @notimplemented
+end
+@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i)
+  if !isa(v,Nothing)
+    for (ve,ie) in zip(v,i)
+      Arrays.add_entry!(c,a,ve,ie)
+    end
+  else
+    for ie in i
+      Arrays.add_entry!(c,a,nothing,ie)
+    end
+  end
+  nothing
+end
+
+
+function _setup_touched_and_allocations_arrays(values)
+  touched = map(values) do values
+    fill!(Vector{Bool}(undef,length(values)),false)
+  end
+  allocations = map(values,touched) do values,touched
+   ArrayAllocationTrackTouchedAndValues(touched,values)
+  end
+  touched, allocations
+end
+
+function Arrays.nz_allocation(
+  a::DistributedCounterCOO{<:Assembled},
+  b::PVectorCounter{<:Assembled}
+)
+  A      = nz_allocation(a)
+  dofs   = b.test_gids
+  values = map(nz_allocation,b.counters)
+  touched, allocations = _setup_touched_and_allocations_arrays(values)
+  B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
+  return A, B
+end
+
+function Arrays.nz_allocation(a::PVectorCounter{<:Assembled})
+  dofs = a.test_gids
+  values = map(nz_allocation,a.counters)
+  touched,allocations=_setup_touched_and_allocations_arrays(values)
+  return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
+end
+
+function local_views(a::PVectorAllocationTrackTouchedAndValues)
+  a.allocations
+end
+
+function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues)
+
+  # Find the ghost rows
+  allocations = local_views(a.allocations)
+  indices = partition(a.test_gids)
+  I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices
+    dofs_lids_touched = findall(allocation.touched)
+    loc_to_gho = local_to_ghost(indices)
+    n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched)
+    I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids)
+    cur = 1
+    for lid in dofs_lids_touched
+      dof_lid = loc_to_gho[lid]
+      if dof_lid != 0
+        I_ghost_lids[cur] = dof_lid
+        cur = cur+1
+      end
+    end
+    I_ghost_lids
+  end
+
+  ghost_to_global, ghost_to_owner = map(
+    find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays
+
+  ngids = length(a.test_gids)
+  _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner)
+end
+
+function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues)
+  rows = _setup_prange_from_pvector_allocation(a)
+  b    = _rhs_callback(a,rows)
+  t2   = assemble!(b)
+  # Wait the transfer to finish
+  if t2 !== nothing
+    wait(t2)
+  end
+  return b
+end
+
+# Common Assembly Utilities
+function first_gdof_from_ids(ids)
+  if own_length(ids) == 0
+    return 1
+  end
+  lid_to_gid   = local_to_global(ids) 
+  owned_to_lid = own_to_local(ids)
+  return Int(lid_to_gid[first(owned_to_lid)])
+end
+
+function find_gid_and_owner(ighost_to_jghost,jindices)
+  jghost_to_local  = ghost_to_local(jindices)
+  jlocal_to_global = local_to_global(jindices)
+  jlocal_to_owner  = local_to_owner(jindices)
+  ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost))
+
+  ighost_to_global = jlocal_to_global[ighost_to_jlocal]
+  ighost_to_owner  = jlocal_to_owner[ighost_to_jlocal]
+  return ighost_to_global, ighost_to_owner
+end
+
+# The given ids are assumed to be a sub-set of the lids
+function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer})
+  glo_to_loc = global_to_local(a)
+  loc_to_gho = local_to_ghost(a)
+  
+  # First pass: Allocate
+  i = 0
+  ghost_lids_touched = fill(false,ghost_length(a))
+  for gid in gids
+    lid = glo_to_loc[gid]
+    ghost_lid = loc_to_gho[lid]
+    if ghost_lid > 0 && !ghost_lids_touched[ghost_lid]
+      ghost_lids_touched[ghost_lid] = true
+      i += 1
+    end
+  end
+  gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i)
+
+  # Second pass: fill 
+  i = 1
+  fill!(ghost_lids_touched,false)
+  for gid in gids
+    lid = glo_to_loc[gid]
+    ghost_lid = loc_to_gho[lid]
+    if ghost_lid > 0 && !ghost_lids_touched[ghost_lid]
+      ghost_lids_touched[ghost_lid] = true
+      gids_ghost_lid_to_ghost_lid[i] = ghost_lid
+      i += 1
+    end
+  end
+
+  return gids_ghost_lid_to_ghost_lid
+end
+
+# Find the neighbours of partition1 trying 
+# to use those in partition2 as a hint 
+function _find_neighbours!(partition1, partition2)
+  partition2_snd, partition2_rcv = assembly_neighbors(partition2)
+  partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv)
+  return assembly_neighbors(partition1; neighbors=partition2_graph)
+end
+
+# to_global! & to_local! analogs, needed for dispatching in block assembly
+
+function to_local_indices!(I,ids::PRange;kwargs...)
+  map(to_local!,I,partition(ids))
+end
+
+function to_global_indices!(I,ids::PRange;kwargs...)
+  map(to_global!,I,partition(ids))
+end
+
+function get_gid_owners(I,ids::PRange;kwargs...)
+  map(I,partition(ids)) do I, indices 
+    glo_to_loc = global_to_local(indices) 
+    loc_to_own = local_to_owner(indices)
+    map(x->loc_to_own[glo_to_loc[x]], I)
+  end 
+end
+
+for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners]
+  @eval begin
+    function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...)
+      map($f,I,ids)
+    end
+
+    function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows)
+      @check ax ∈ [:rows,:cols]
+      block_ids = CartesianIndices(I)
+      map(block_ids) do id
+        i = id[1]; j = id[2];
+        if ax == :rows
+          $f(I[i,j],ids[i])
+        else
+          $f(I[i,j],ids[j])
+        end
+      end
+    end
+  end
+end
+
+
+function _setup_matrix(
+  a,I,J,V,rows::PRange,cols::PRange
+)
+  _rows = PRange(map(remove_ghost,partition(rows)))
+  asys = change_axes(a,(_rows,cols))
+  values = map(create_from_nz,local_views(asys))
+  PSparseMatrix(values,partition(_rows),partition(cols),true)
+end
+
+function _setup_matrix(a,I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange})
+  block_ids  = CartesianIndices(I)
+  block_mats = map(block_ids) do id
+    i = id[1]; j = id[2];
+    _setup_matrix(a[i,j],I[i,j],J[i,j],V[i,j],rows[i],cols[j])
+  end
+  return mortar(block_mats)
+end
+
+# _assemble_coo! : local coo triplets + global PRange -> Global coo values
+
+function _assemble_coo!(I,J,V,rows::PRange;owners=nothing)
+  if isa(owners,Nothing)
+    PArrays.assemble_coo!(I,J,V,partition(rows))
+  else
+    assemble_coo_with_column_owner!(I,J,V,partition(rows),owners)
+  end
+end
+
+function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing)
+  block_ids = CartesianIndices(I)
+  map(block_ids) do id
+    i = id[1]; j = id[2];
+    if isa(owners,Nothing)
+      _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i])
+    else
+      _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j])
+    end
+  end
+end
+
+function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown)
+  """
+    Returns three JaggedArrays with the coo triplets
+    to be sent to the corresponding owner parts in parts_snd
+  """
+  function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner)
+    global_to_local_row = global_to_local(row_lids)
+    local_row_to_owner = local_to_owner(row_lids)
+    owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) ))
+    ptrs = zeros(Int32,length(parts_snd)+1)
+    k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner
+    for k in 1:length(k_gi)
+      gi = k_gi[k]
+      li = global_to_local_row[gi]
+      owner = local_row_to_owner[li]
+      if owner != part
+        ptrs[owner_to_i[owner]+1] +=1
+      end
+    end
+    PArrays.length_to_ptrs!(ptrs)
+    gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1)
+    gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1)
+    jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1)
+    v_snd_data = zeros(eltype(k_v),ptrs[end]-1)
+    for k in 1:length(k_gi)
+      gi = k_gi[k]
+      li = global_to_local_row[gi]
+      owner = local_row_to_owner[li]
+      if owner != part
+        gj = k_gj[k]
+        v = k_v[k]
+        p = ptrs[owner_to_i[owner]]
+        gi_snd_data[p] = gi
+        gj_snd_data[p] = gj
+        jo_snd_data[p] = k_jo[k]
+        v_snd_data[p]  = v
+        k_v[k] = zero(v)
+        ptrs[owner_to_i[owner]] += 1
+      end
+    end
+    PArrays.rewind_ptrs!(ptrs)
+    gi_snd = JaggedArray(gi_snd_data,ptrs)
+    gj_snd = JaggedArray(gj_snd_data,ptrs)
+    jo_snd = JaggedArray(jo_snd_data,ptrs)
+    v_snd = JaggedArray(v_snd_data,ptrs)
+    gi_snd, gj_snd, jo_snd, v_snd
+  end
+  """
+    Pushes to coo_entries_with_column_owner the tuples 
+    gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes
+  """
+  function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv)
+    k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner
+    current_n = length(k_gi)
+    new_n = current_n + length(gi_rcv.data)
+    resize!(k_gi,new_n)
+    resize!(k_gj,new_n)
+    resize!(k_jo,new_n)
+    resize!(k_v,new_n)
+    for p in 1:length(gi_rcv.data)
+        k_gi[current_n+p] = gi_rcv.data[p]
+        k_gj[current_n+p] = gj_rcv.data[p]
+        k_jo[current_n+p] = jo_rcv.data[p]
+        k_v[current_n+p] = v_rcv.data[p]
+    end
+  end
+  part = linear_indices(row_partition)
+  parts_snd, parts_rcv = assembly_neighbors(row_partition)
+  coo_entries_with_column_owner = map(tuple,I,J,Jown,V)
+  gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays
+  graph = ExchangeGraph(parts_snd,parts_rcv)
+  t1 = exchange(gi_snd,graph)
+  t2 = exchange(gj_snd,graph)
+  t3 = exchange(jo_snd,graph)
+  t4 = exchange(v_snd,graph)
+  @async begin
+      gi_rcv = fetch(t1)
+      gj_rcv = fetch(t2)
+      jo_rcv = fetch(t3)
+      v_rcv = fetch(t4)
+      map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv)
+      I,J,Jown,V
+  end
+end
+
+# dofs_gids_prange can be either test_gids or trial_gids
+# In the former case, gids is a vector of global test dof identifiers, while in the 
+# latter, a vector of global trial dof identifiers
+function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...)
+  if !ghost
+    _setup_prange_without_ghosts(dofs_gids_prange)
+  elseif isa(owners,Nothing)
+    _setup_prange_with_ghosts(dofs_gids_prange,gids)
+  else
+    _setup_prange_with_ghosts(dofs_gids_prange,gids,owners)
+  end
+end
+
+function _setup_prange(
+  dofs_gids_prange::AbstractVector{<:PRange},
+  gids::AbstractMatrix;
+  ax=:rows,ghost=true,owners=nothing
+)
+  @check ax ∈ (:rows,:cols)
+  block_ids = LinearIndices(dofs_gids_prange)
+  pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x))
+
+  gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange
+    gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id]
+    gids_union = pvcat(gids_slice)
+
+    owners_union = nothing
+    if !isnothing(owners)
+      owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id]
+      owners_union = pvcat(owners_slice)
+    end
+
+    return gids_union, owners_union
+  end |> tuple_of_arrays
+  
+  return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union)
+end
+
+# Create PRange for the rows of the linear system
+# without local ghost dofs as per required in the 
+# LocallyAssembled() parallel assembly strategy 
+function _setup_prange_without_ghosts(dofs_gids_prange::PRange)
+  ngdofs = length(dofs_gids_prange)
+  indices = map(partition(dofs_gids_prange)) do dofs_indices 
+    owner = part_id(dofs_indices)
+    own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
+    ghost_indices = GhostIndices(ngdofs,Int64[],Int32[])
+    OwnAndGhostIndices(own_indices,ghost_indices)
+  end
+  return PRange(indices)
+end
+
+# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition
+# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system.
+# The precondition required for the consistency of any parallel assembly process in GridapDistributed 
+# is that each processor can determine locally with a single layer of ghost cells the global indices and associated 
+# processor owners of the rows that it touches after assembly of integration terms posed on locally-owned entities 
+# (i.e., either cells or faces). 
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> dof PRange ghost ids
+  gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition
+  ) |> tuple_of_arrays
+
+  return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner)
+end
+
+# Here we cannot assume that the sparse communication graph underlying 
+# trial_dofs_gids_partition is a superset of the one underlying indices.
+# Here we chould check whether it is included and call _find_neighbours!()
+# if this is the case. At present, we are not taking advantage of this, 
+# but let the parallel scalable algorithm to compute the reciprocal to do the job. 
+function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners)
+  ngdofs = length(dofs_gids_prange)
+  dofs_gids_partition = partition(dofs_gids_prange)
+
+  # Selected ghost ids -> [global dof ids, owner processor ids]
+  gids_ghost_to_global, gids_ghost_to_owner = map(
+    gids,owners,dofs_gids_partition) do gids, owners, indices
+    ghost_touched   = Dict{Int,Bool}()
+    ghost_to_global = Int64[] 
+    ghost_to_owner  = Int64[]
+    me = part_id(indices)
+    for (j,jo) in zip(gids,owners)
+      if jo != me
+        if !haskey(ghost_touched,j)
+          push!(ghost_to_global,j)
+          push!(ghost_to_owner,jo)
+          ghost_touched[j] = true
+        end
+      end
+    end
+    ghost_to_global, ghost_to_owner
+  end |> tuple_of_arrays
+
+  return _setup_prange_impl_(
+    ngdofs,
+    dofs_gids_partition,
+    gids_ghost_to_global,
+    gids_ghost_to_owner;
+    discover_neighbours=false
+  )
+end 
+
+function _setup_prange_impl_(
+  ngdofs,
+  dofs_gids_partition,
+  gids_ghost_to_global,
+  gids_ghost_to_owner;
+  discover_neighbours=true
+)
+  indices = map(dofs_gids_partition, 
+                gids_ghost_to_global, 
+                gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner 
+    owner = part_id(dofs_indices)
+    own_indices   = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
+    ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner)
+    OwnAndGhostIndices(own_indices,ghost_indices)
+  end
+  if discover_neighbours
+    _find_neighbours!(indices,dofs_gids_partition)
+  end
+  return PRange(indices)
+end
diff --git a/test/BlockSparseMatrixAssemblersTests.jl b/test/BlockSparseMatrixAssemblersTests.jl
index 82bafc97..645d7dc2 100644
--- a/test/BlockSparseMatrixAssemblersTests.jl
+++ b/test/BlockSparseMatrixAssemblersTests.jl
@@ -52,7 +52,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   matdata = collect_cell_matrix(X,Y,biform(u,v))
   vecdata = collect_cell_vector(Y,liform(v))  
 
-  assem = SparseMatrixAssembler(X,Y,FullyAssembledRows())
+  assem = SparseMatrixAssembler(X,Y,LocallyAssembled())
   A1 = assemble_matrix(assem,matdata)
   b1 = assemble_vector(assem,vecdata)
   A2,b2 = assemble_matrix_and_vector(assem,data);
@@ -68,7 +68,7 @@ function _main(n_spaces,mfs,weakform,U,V)
   bmatdata = collect_cell_matrix(Xb,Yb,biform(ub,vb))
   bvecdata = collect_cell_vector(Yb,liform(vb))
 
-  assem_blocks = SparseMatrixAssembler(Xb,Yb,FullyAssembledRows())
+  assem_blocks = SparseMatrixAssembler(Xb,Yb,LocallyAssembled())
   A1_blocks = assemble_matrix(assem_blocks,bmatdata);
   b1_blocks = assemble_vector(assem_blocks,bvecdata);
   @test is_same_vector(b1_blocks,b1,Yb,Y)
diff --git a/test/DivConformingTests.jl b/test/DivConformingTests.jl
index 2a1886b3..38044564 100644
--- a/test/DivConformingTests.jl
+++ b/test/DivConformingTests.jl
@@ -63,7 +63,7 @@ function f(model,reffe)
     V = FESpace(model,reffe,conformity=:Hdiv)
     U = TrialFESpace(V)
 
-    das = FullyAssembledRows()
+    das = LocallyAssembled()
     trian = Triangulation(das,model)
     degree = 2
     dΩ = Measure(trian,degree)
diff --git a/test/FESpacesTests.jl b/test/FESpacesTests.jl
index bf1e78e6..22e35453 100644
--- a/test/FESpacesTests.jl
+++ b/test/FESpacesTests.jl
@@ -71,8 +71,8 @@ function assemble_tests(das,dΩ,dΩass,U,V)
 end
 
 function main(distribute,parts)
-  main(distribute,parts,SubAssembledRows())
-  main(distribute,parts,FullyAssembledRows())
+  main(distribute,parts,Assembled())
+  main(distribute,parts,LocallyAssembled())
 end
 
 function main(distribute,parts,das)
diff --git a/test/HeatEquationTests.jl b/test/HeatEquationTests.jl
index fa96dc11..89077ecb 100644
--- a/test/HeatEquationTests.jl
+++ b/test/HeatEquationTests.jl
@@ -37,7 +37,7 @@ function main(distribute,parts)
 
   op = TransientFEOperator(res,jac,jac_t,U,V0)
 
-  assembler = SparseMatrixAssembler(U,V0,SubAssembledRows())
+  assembler = SparseMatrixAssembler(U,V0,Assembled())
   op_constant = TransientLinearFEOperator(
     (a,m),b,U,V0,constant_forms=(true,true),assembler=assembler
   )
diff --git a/test/PLaplacianTests.jl b/test/PLaplacianTests.jl
index 2347dfc4..c64ed23f 100644
--- a/test/PLaplacianTests.jl
+++ b/test/PLaplacianTests.jl
@@ -8,8 +8,8 @@ using Test
 using SparseArrays
 
 function main(distribute,parts)
-  main(distribute,parts,FullyAssembledRows(),SparseMatrixCSR{0,Float64,Int})
-  main(distribute,parts,SubAssembledRows(),SparseMatrixCSC{Float64,Int})
+  main(distribute,parts,LocallyAssembled(),SparseMatrixCSR{0,Float64,Int})
+  main(distribute,parts,Assembled(),SparseMatrixCSC{Float64,Int})
 end
 
 function main(distribute,parts,strategy,local_matrix_type)
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index 951a68d7..ae00925e 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -21,6 +21,8 @@ dist_V = TestFESpace(dist_model,reffe)
 serial_ids = get_free_dof_ids(serial_V)
 dist_ids = get_free_dof_ids(dist_V)
 
+dof_ids = get_free_dof_ids(dist_V)
+
 serial_Ω = Triangulation(serial_model)
 serial_dΩ = Measure(serial_Ω,2)
 
@@ -31,6 +33,55 @@ serial_a1(u,v) = ∫(u⋅v)*serial_dΩ
 serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V)
 
 dist_a1(u,v) = ∫(u⋅v)*dist_dΩ
+assem = SparseMatrixAssembler(dist_V,dist_V)
 dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V)
 all(centralize(dist_A1) - serial_A1 .< 1e-10)
 
+function PartitionedArrays.precompute_nzindex(A,I,J;skip=false)
+  K = zeros(Int32,length(I))
+  for (p,(i,j)) in enumerate(zip(I,J))
+      if !skip && (i < 1 || j < 1)
+          continue
+      end
+      K[p] = nzindex(A,i,j)
+  end
+  K
+end
+
+
+
+############################################################################################
+
+ids = cids
+indices = partition(ids)
+
+n_own = own_length(cids)
+ghost2global = ghost_to_global(cids)
+ghost2owner = ghost_to_owner(cids)
+
+first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
+n_global = reduce(+,n_own)
+n_parts = length(ranks)
+indices2 = map(ranks,n_own,first_gid,ghost2global,ghost2owner) do rank,n_own,start,ghost2global,ghost2owner
+  p = CartesianIndex((rank,))
+  np = (n_parts,)
+  n = (n_global,)
+  ranges = ((1:n_own).+(start-1),)
+  ghost = GhostIndices(n_global,ghost2global,ghost2owner)
+
+  PartitionedArrays.LocalIndicesWithVariableBlockSize(p,np,n,ranges,ghost)
+end
+
+map(indices,indices2) do indices, indices2
+  ghost2local = ghost_to_local(indices)
+  own2local = own_to_local(indices)
+
+  n_own = own_length(indices)
+  n_ghost = ghost_length(indices)
+
+  #perm = vcat(own2local,ghost2local)
+  perm = fill(0,local_length(indices))
+  perm[own2local] .= 1:n_own
+  perm[ghost2local] .= (n_own+1):(n_ghost+n_own)
+  permute_indices(indices2,perm)
+end

From da06a4b4a276897863a82275b7c431a7f2beeaef Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Sun, 8 Sep 2024 00:31:31 +1000
Subject: [PATCH 05/48] Saving progress

---
 src/Assembly.jl      | 137 ++++++++++++++++++++++++++++++++++++++++---
 src/PArraysExtras.jl |  55 ++++++++++++++++-
 2 files changed, 182 insertions(+), 10 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 25e2411f..10e6d976 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -84,6 +84,20 @@ local_views(a::DistributedAllocation) = a.allocs
 const PVectorAllocation{S,T} = DistributedAllocation{S,T,1}
 const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2}
 
+function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation})
+  map(local_views(a),partition(axes(a,1))) do a, ids
+    rows = remove_ghost(unpermute(ids))
+
+    ghost_lids = ghost_to_local(ids)
+    ghost_owners = ghost_to_owner(ids)
+    touched_ghost_lids = filter(lid -> a.touched[lid],ghost_lids)
+    touched_ghost_owners = collect(ghost_owners[touched_ghost_lids])
+    touched_ghost_gids = to_global!(touched_ghost_lids,ids)
+    ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners)
+    return replace_ghost(rows,ghost)
+  end
+end
+
 # PSparseMatrix assembly chain
 #
 #   1 - nz_counter(PSparseMatrixBuilder) -> PSparseMatrixCounter
@@ -104,6 +118,21 @@ function Algebra.nz_allocation(a::PSparseMatrixCounter)
   DistributedAllocation(allocs,a.axes,a.strategy)
 end
 
+function Algebra.create_from_nz(a::PSparseMatrix)
+  assemble!(a) |> wait
+  return a
+end
+
+function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled})
+  A, = create_from_nz_locally_assembled(a)
+  return A
+end
+
+function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled})
+  A, = create_from_nz_assembled(a,nothing)
+  return A
+end
+
 # PVector assembly chain:
 #
 #   1 - nz_counter(PVectorBuilder) -> PVectorCounter
@@ -142,13 +171,16 @@ function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVecto
 end
 
 function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector})
-  rows = _setup_prange_without_ghosts(axes(a,1))
-  _rhs_callback(a,rows)
+  rows = remove_ghost(unpermute(axes(a,1)))
+  values = local_views(a)
+  b = rhs_callback(values,rows)
+  return b
 end
 
 function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S
-  rows = _setup_prange_from_pvector_allocation(a)
-  b    = _rhs_callback(a,rows)
+  rows = collect_touched_ids(a)
+  values = map(ai -> ai.values, local_views(a))
+  b    = rhs_callback(values,rows)
   t2   = assemble!(b)
   if t2 !== nothing
     wait(t2)
@@ -162,7 +194,7 @@ end
 # overlap communications the assembly of the matrix and the vector.
 # Not only it is faster, but also necessary to ensure identical ghost indices 
 # in both the matrix and vector rows.
-# This is done by using the following specializations chain:
+# This is done by using the following specializations:
 
 function Arrays.nz_allocation(
   a::PSparseMatrixCounter{<:Assembled},
@@ -178,9 +210,12 @@ function Algebra.create_from_nz(
   b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}
 )
   function callback(rows)
-    _rhs_callback(b,rows)
+    new_indices = partition(rows)
+    values = local_views(b)
+    b_fespace = PVector(values,partition(axes(b,1)))
+    locally_repartition(b_fespace,new_indices)
   end
-  A, B = _fa_create_from_nz_with_callback(callback,a)
+  A, B = create_from_nz_locally_assembled(a,callback)
   return A, B
 end
 
@@ -189,11 +224,95 @@ function Algebra.create_from_nz(
   b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation}
 )
   function callback(rows)
-    _rhs_callback(b,rows)
+    new_indices = collect_touched_ids(b)
+    values = map(ai -> ai.values, local_views(b))
+    b_fespace = PVector(values,partition(axes(b,1)))
+    locally_repartition(b_fespace,new_indices)
   end
   function async_callback(b)
     assemble!(b)
   end
-  A, B = _sa_create_from_nz_with_callback(callback,async_callback,a,b)
+  A, B = create_from_nz_assembled(a,callback,async_callback)
   return A, B
 end
+
+# Assembly methods
+
+function create_from_nz_locally_assembled(
+  a,
+  callback::Function = rows -> nothing
+)
+  # Recover some data
+  I,J,V = get_allocations(a)
+  test_gids, trial_gids = axes(a)
+
+  rows = remove_ghost(unpermute(test_gids))
+  b = callback(rows)
+
+  # convert I and J to global dof ids
+  to_global_indices!(I,test_gids;ax=:rows)
+  to_global_indices!(J,trial_gids;ax=:cols)
+
+  # Create the range for cols
+  cols = _setup_prange(trial_gids,J;ax=:cols)
+
+  # Convert again I,J to local numeration
+  to_local_indices!(I,rows;ax=:rows)
+  to_local_indices!(J,cols;ax=:cols)
+
+  A = _setup_matrix(I,J,V,rows,cols)
+  return A, b
+end
+
+function create_from_nz_assembled(
+  a, 
+  callback::Function = rows -> nothing,
+  async_callback::Function = b -> nothing
+)
+  # Recover some data
+  I,J,V = get_allocations(a)
+  test_gids = get_test_gids(a)
+  trial_gids = get_trial_gids(a)
+
+  # convert I and J to global dof ids
+  to_global_indices!(I,test_gids;ax=:rows)
+  to_global_indices!(J,trial_gids;ax=:cols)
+
+  # Create the Prange for the rows
+  rows = _setup_prange(test_gids,I;ax=:rows)
+
+  # Move (I,J,V) triplets to the owner process of each row I.
+  # The triplets are accompanyed which Jo which is the process column owner
+  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  t  = _assemble_coo!(I,J,V,rows;owners=Jo)
+
+  # Here we can overlap computations
+  # This is a good place to overlap since
+  # sending the matrix rows is a lot of data
+  if !isa(b,Nothing)
+    bprange=_setup_prange_from_pvector_allocation(b)
+    b = callback(bprange)
+  end
+
+  # Wait the transfer to finish
+  wait(t)
+
+  # Create the Prange for the cols
+  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
+
+  # Overlap rhs communications with CSC compression
+  t2 = async_callback(b)
+
+  # Convert again I,J to local numeration
+  to_local_indices!(I,rows;ax=:rows)
+  to_local_indices!(J,cols;ax=:cols)
+
+  A = _setup_matrix(a,I,J,V,rows,cols)
+
+  # Wait the transfer to finish
+  if !isa(t2,Nothing)
+    wait(t2)
+  end
+
+  return A, b
+end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index f4a5ae2e..1a1d84de 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -45,10 +45,63 @@ function permuted_variable_partition(
   end
 end
 
+"""
+    unpermute(indices::AbstractLocalIndices)
+
+Given local indices, reorders them to (locally) have own indices first, 
+followed by ghost indices.
+"""
+function unpermute(indices::AbstractLocalIndices) 
+  @notimplemented
+end
+
+unpermute(indices::PermutedLocalIndices) = unpermute(indices.indices)
+unpermute(indices::PartitionedArrays.LocalIndicesInBlockPartition) = indices
+unpermute(indices::OwnAndGhostIndices) = indices
+
+function unpermute(indices::LocalIndices)
+  nglobal = global_length(indices)
+  rank = part_id(indices)
+  own = OwnIndices(nglobal,rank,own_to_global(indices))
+  ghost = GhostIndices(nglobal,ghost_to_global(indices),ghost_to_owner(indices))
+  OwnAndGhostIndices(own,ghost,global_to_owner(indices))
+end
+
+"""
+    locally_repartition(v::PVector,new_indices)
+
+Map the values of a PVector to a new partitioning of the indices.
+
+Similar to `PartitionedArrays.repartition`, but without any communications. Instead, 
+it is assumed that the local-to-local mapping can be done locally.
+"""
+function locally_repartition(v::PVector,new_indices)
+  w = similar(v,PRange(new_indices))
+  locally_repartition!(w,v)
+end
+
+function locally_repartition!(w::PVector,v::PVector)
+  # Fill own values
+  map(copy!,own_values(w),own_values(v))
+
+  # Fill ghost values
+  new_indices = partition(axes(w,1))
+  old_indices = partition(axes(v,1))
+  map(partition(w),partition(v),new_indices,old_indices) do w,v,new_ids,old_ids
+    old_gid_to_lid = global_to_local(old_ids)
+    for (lid,gid) in zip(ghost_to_local(new_ids),ghost_to_global(new_ids))
+      old_lid = old_gid_to_lid[gid]
+      w[lid] = v[old_lid]
+    end
+  end
+
+  return w
+end
+
 # Linear algebra
 
 function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
-  @check partition(axes(x,1)) === partition(axes(y,1))
+  @check matching_local_indices(partition(axes(x,1)),partition(axes(y,1)))
   map(partition(x),partition(y)) do x,y
     LinearAlgebra.axpy!(α,x,y)
   end

From fe547432c393ef7bbff1da8359efd72759e41376 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Tue, 17 Sep 2024 13:50:20 +1000
Subject: [PATCH 06/48] Minor

---
 src/Assembly.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 10e6d976..f440fd49 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -236,7 +236,7 @@ function Algebra.create_from_nz(
   return A, B
 end
 
-# Assembly methods
+# Low-level assembly methods
 
 function create_from_nz_locally_assembled(
   a,

From 8bacdf25f109c408861c92060774dc594ef80033 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Tue, 22 Oct 2024 01:02:47 +1100
Subject: [PATCH 07/48] More fixes

---
 src/Assembly.jl          | 43 ++++++++++++++++++++++++----------------
 src/FESpaces.jl          | 12 ++++-------
 src/GridapDistributed.jl |  4 ++--
 src/PArraysExtras.jl     | 30 ++++++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 27 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index f440fd49..1797a996 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -42,7 +42,7 @@ struct DistributedCounter{S,T,N,A,B} <: GridapType
   function DistributedCounter(
     counters :: AbstractArray{T},
     axes     :: NTuple{N,<:PRange},
-    strategy :: Algebra.AssemblyStrategy
+    strategy :: AssemblyStrategy
   ) where {T,N}
     A, B, S = typeof(counters), typeof(axes), typeof(strategy)
     new{S,T,N,A,B}(counters,axes,strategy)
@@ -52,10 +52,10 @@ end
 Base.axes(a::DistributedCounter) = a.axes
 local_views(a::DistributedCounter) = a.counters
 
-const PVectorCounter{S,T} = DistributedCounter{S,T<:Algebra.ArrayCounter,1}
+const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B}
 Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
 
-const PSparseMatrixCounter{S,T} = DistributedCounter{S,T<:Algebra.CounterCOO,2}
+const PSparseMatrixCounter{S,T,A,B} = DistributedCounter{S,T,2,A,B}
 Algebra.LoopStyle(::Type{<:PSparseMatrixCounter}) = Loop()
 
 """
@@ -71,7 +71,7 @@ struct DistributedAllocation{S,T,N,A,B} <: GridapType
   function DistributedAllocation(
     allocs   :: AbstractArray{T},
     axes     :: NTuple{N,<:PRange},
-    strategy :: Algebra.AssemblyStrategy
+    strategy :: AssemblyStrategy
   ) where {T,N}
     A, B, S = typeof(allocs), typeof(axes), typeof(strategy)
     new{S,T,N,A,B}(allocs,axes,strategy)
@@ -84,6 +84,13 @@ local_views(a::DistributedAllocation) = a.allocs
 const PVectorAllocation{S,T} = DistributedAllocation{S,T,1}
 const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2}
 
+function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO}) where S
+  I,J,V = map(local_views(a)) do alloc
+    alloc.I, alloc.J, alloc.V
+  end |> tuple_of_arrays
+  return I,J,V
+end
+
 function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation})
   map(local_views(a),partition(axes(a,1))) do a, ids
     rows = remove_ghost(unpermute(ids))
@@ -104,13 +111,13 @@ end
 #   2 - nz_allocation(PSparseMatrixCounter) -> PSparseMatrixAllocation
 #   3 - create_from_nz(PSparseMatrixAllocation) -> PSparseMatrix
 
-function Algebra.nz_counter(builder::PSparseMatrixBuilder,axs::NTuple{2,<:PRange})
+function Algebra.nz_counter(builder::PSparseMatrixBuilder{MT},axs::NTuple{2,<:PRange}) where MT
   rows, cols = axs # test ids, trial ids
-  counters = map(partition(rows),partition(cols)) do rows,cols
-    axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols)))
-    Algebra.CounterCOO{A}(axs)
+  counters = map(partition(rows),partition(cols)) do rows, cols
+    local_axs = (Base.OneTo(local_length(rows)),Base.OneTo(local_length(cols)))
+    Algebra.CounterCOO{MT}(local_axs)
   end
-  DistributedCounter(builder.par_strategy,counters,rows,cols)
+  DistributedCounter(counters,axs,builder.strategy)
 end
 
 function Algebra.nz_allocation(a::PSparseMatrixCounter)
@@ -129,7 +136,9 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled})
 end
 
 function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled})
-  A, = create_from_nz_assembled(a,nothing)
+  callback(rows) = nothing
+  async_callback(b) = nothing
+  A, = create_from_nz_assembled(a,callback,async_callback)
   return A
 end
 
@@ -180,8 +189,8 @@ end
 function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S
   rows = collect_touched_ids(a)
   values = map(ai -> ai.values, local_views(a))
-  b    = rhs_callback(values,rows)
-  t2   = assemble!(b)
+  b  = rhs_callback(values,rows)
+  t2 = assemble!(b)
   if t2 !== nothing
     wait(t2)
   end
@@ -271,19 +280,19 @@ function create_from_nz_assembled(
 )
   # Recover some data
   I,J,V = get_allocations(a)
-  test_gids = get_test_gids(a)
-  trial_gids = get_trial_gids(a)
+  test_gids, trial_gids = axes(a)
 
   # convert I and J to global dof ids
   to_global_indices!(I,test_gids;ax=:rows)
   to_global_indices!(J,trial_gids;ax=:cols)
 
   # Create the Prange for the rows
-  rows = _setup_prange(test_gids,I;ax=:rows)
+  rows = unpermute(test_gids)
+  # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I))
 
   # Move (I,J,V) triplets to the owner process of each row I.
-  # The triplets are accompanyed which Jo which is the process column owner
-  Jo = get_gid_owners(J,trial_gids;ax=:cols)
+  J_owners = find_owner(trial_gids,J)
+  cols = union_ghost(unpermute(trial_gids),J,J_owners)
   t  = _assemble_coo!(I,J,V,rows;owners=Jo)
 
   # Here we can overlap computations
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 02530cfa..e7aabe3d 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -603,15 +603,11 @@ FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_buil
 FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
 
 function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+  map(symbolic_loop_matrix!,local_views(A),local_views(a),matdata)
 end
 
 function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+  map(numeric_loop_matrix!,local_views(A),local_views(a),matdata)
 end
 
 function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
@@ -656,7 +652,7 @@ function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrix
   create_from_nz!(mat,allocs,cache)
 end
 
-function create_from_nz!(A,a::DistributedAllocationCOO{<:Assembled},cache)
+function create_from_nz!(A,a::DistributedAllocation{<:Assembled},cache)
   _,_,V = get_allocations(a)
   psparse!(A,V,cache) |> wait
   return A
@@ -699,7 +695,7 @@ function FESpaces.SparseMatrixAssembler(
       local_strategy
     )
   end
-  mat_builder = PSparseMatrixBuilderCOO(local_mat_type,par_strategy)
+  mat_builder = PSparseMatrixBuilder(local_mat_type,par_strategy)
   vec_builder = PVectorBuilder(local_vec_type,par_strategy)
   if reuse_caches
     caches = Dict{UInt64,Any}()
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index b55ddfe8..4e31dcd3 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -42,11 +42,11 @@ export with_ghost, no_ghost
 
 export redistribute
 
+include("BlockPartitionedArrays.jl")
+
 include("GridapExtras.jl")
 include("PArraysExtras.jl")
 
-include("BlockPartitionedArrays.jl")
-
 include("Algebra.jl")
 
 include("Assembly.jl")
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 1a1d84de..19c97149 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -161,6 +161,36 @@ function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
   end
 end
 
+# To local/to global for blocks 
+
+function to_local_indices!(I,ids::PRange;kwargs...)
+  map(to_local!,I,partition(ids))
+end
+
+function to_global_indices!(I,ids::PRange;kwargs...)
+  map(to_global!,I,partition(ids))
+end
+for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners]
+  @eval begin
+    function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...)
+      map($f,I,ids)
+    end
+
+    function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows)
+      @check ax ∈ [:rows,:cols]
+      block_ids = CartesianIndices(I)
+      map(block_ids) do id
+        i = id[1]; j = id[2];
+        if ax == :rows
+          $f(I[i,j],ids[i])
+        else
+          $f(I[i,j],ids[j])
+        end
+      end
+    end
+  end
+end
+
 # This type is required because MPIArray from PArrays 
 # cannot be instantiated with a NULL communicator
 struct MPIVoidVector{T} <: AbstractVector{T}

From a21abd7a6438dc60fc8eed3bd4ed853636a851e5 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Tue, 5 Nov 2024 19:02:35 +1100
Subject: [PATCH 08/48] We can assemble again!

---
 src/Assembly.jl        | 52 +++++++++++++++++++++++-------------------
 src/GridapExtras.jl    | 13 +++++++++++
 src/PArraysExtras.jl   | 13 +++++++++++
 test/parrays_update.jl | 14 ++++++++++++
 4 files changed, 69 insertions(+), 23 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 1797a996..e0b33745 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -50,6 +50,7 @@ struct DistributedCounter{S,T,N,A,B} <: GridapType
 end
 
 Base.axes(a::DistributedCounter) = a.axes
+Base.axes(a::DistributedCounter,d::Integer) = a.axes[d]
 local_views(a::DistributedCounter) = a.counters
 
 const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B}
@@ -79,8 +80,18 @@ struct DistributedAllocation{S,T,N,A,B} <: GridapType
 end
 
 Base.axes(a::DistributedAllocation) = a.axes
+Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d]
 local_views(a::DistributedAllocation) = a.allocs
 
+function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N}
+  indices = map(partition,axes)
+  local_axes = map(indices...) do indices...
+    map(ids -> Base.OneTo(local_length(ids)), indices)
+  end
+  allocs = map(change_axes,a.allocs,local_axes)
+  DistributedAllocation(allocs,axes,a.strategy)
+end
+
 const PVectorAllocation{S,T} = DistributedAllocation{S,T,1}
 const PSparseMatrixAllocation{S,T} = DistributedAllocation{S,T,2}
 
@@ -97,9 +108,9 @@ function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation})
 
     ghost_lids = ghost_to_local(ids)
     ghost_owners = ghost_to_owner(ids)
-    touched_ghost_lids = filter(lid -> a.touched[lid],ghost_lids)
+    touched_ghost_lids = filter(lid -> a.touched[lid], ghost_lids)
     touched_ghost_owners = collect(ghost_owners[touched_ghost_lids])
-    touched_ghost_gids = to_global!(touched_ghost_lids,ids)
+    touched_ghost_gids = to_global!(touched_ghost_lids, ids)
     ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners)
     return replace_ghost(rows,ghost)
   end
@@ -280,43 +291,38 @@ function create_from_nz_assembled(
 )
   # Recover some data
   I,J,V = get_allocations(a)
-  test_gids, trial_gids = axes(a)
+  test_ids = partition(axes(a,1))
+  trial_ids = partition(axes(a,2))
 
   # convert I and J to global dof ids
-  to_global_indices!(I,test_gids;ax=:rows)
-  to_global_indices!(J,trial_gids;ax=:cols)
+  map(map_local_to_global!,I,test_ids)
+  map(map_local_to_global!,J,trial_ids)
 
-  # Create the Prange for the rows
-  rows = unpermute(test_gids)
+  # Move (I,J,V) triplets to the owner process of each row I.
+  rows = map(unpermute,test_ids)
   # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I))
+  t = PartitionedArrays.assemble_coo!(I,J,V,rows)
 
-  # Move (I,J,V) triplets to the owner process of each row I.
-  J_owners = find_owner(trial_gids,J)
-  cols = union_ghost(unpermute(trial_gids),J,J_owners)
-  t  = _assemble_coo!(I,J,V,rows;owners=Jo)
-
-  # Here we can overlap computations
-  # This is a good place to overlap since
-  # sending the matrix rows is a lot of data
-  if !isa(b,Nothing)
-    bprange=_setup_prange_from_pvector_allocation(b)
-    b = callback(bprange)
-  end
+  # Overlap CSC communication with rhs assembly
+  b = callback(rows)
 
   # Wait the transfer to finish
   wait(t)
 
   # Create the Prange for the cols
-  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
+  J_owners = find_owner(trial_ids,J)
+  cols = map(union_ghost,map(unpermute,trial_ids),J,J_owners)
 
   # Overlap rhs communications with CSC compression
   t2 = async_callback(b)
 
   # Convert again I,J to local numeration
-  to_local_indices!(I,rows;ax=:rows)
-  to_local_indices!(J,cols;ax=:cols)
+  map(map_global_to_local!,I,rows)
+  map(map_global_to_local!,J,cols)
 
-  A = _setup_matrix(a,I,J,V,rows,cols)
+  a_sys = change_axes(a,(PRange(rows),PRange(cols)))
+  values = map(create_from_nz,local_views(a_sys))
+  A = PSparseMatrix(values,rows,cols,true)
 
   # Wait the transfer to finish
   if !isa(t2,Nothing)
diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl
index df5b1186..e0723189 100644
--- a/src/GridapExtras.jl
+++ b/src/GridapExtras.jl
@@ -53,3 +53,16 @@ end
   end
   nothing
 end
+
+# change_axes
+
+function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A}
+  b = Algebra.CounterCOO{T}(axes)
+  b.nnz = a.nnz
+  b
+end
+
+function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A}
+  counter = change_axes(a.counter,axes)
+  Algebra.AllocationCOO(counter,a.I,a.J,a.V)
+end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 19c97149..be907629 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -98,6 +98,19 @@ function locally_repartition!(w::PVector,v::PVector)
   return w
 end
 
+# SubSparseMatrix extensions
+
+function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
+  I,J,V = findnz(A.parent)
+  rowmap, colmap = A.inv_indices
+  for k in eachindex(I)
+    I[k] = rowmap[I[k]]
+    J[k] = colmap[J[k]]
+  end
+  mask = map((i,j) -> (i > 0 && j > 0), I, J)
+  return I[mask], J[mask], V[mask]
+end
+
 # Linear algebra
 
 function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index ae00925e..9e396812 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -37,6 +37,8 @@ assem = SparseMatrixAssembler(dist_V,dist_V)
 dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V)
 all(centralize(dist_A1) - serial_A1 .< 1e-10)
 
+centralize(dist_A1)
+
 function PartitionedArrays.precompute_nzindex(A,I,J;skip=false)
   K = zeros(Int32,length(I))
   for (p,(i,j)) in enumerate(zip(I,J))
@@ -48,7 +50,19 @@ function PartitionedArrays.precompute_nzindex(A,I,J;skip=false)
   K
 end
 
+Aoo = own_own_values(dist_A1).items[1]
 
+using SparseArrays
+function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
+  I,J,V = findnz(A.parent)
+  rowmap, colmap = A.inv_indices
+  for k in eachindex(I)
+    I[k] = rowmap[I[k]]
+    J[k] = colmap[J[k]]
+  end
+  mask = map((i,j) -> (i > 0 && j > 0), I, J)
+  return I[mask], J[mask], V[mask]
+end
 
 ############################################################################################
 

From 07cb9a4a307527682374cdbcb8080680b4f570e1 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Tue, 5 Nov 2024 19:45:40 +1100
Subject: [PATCH 09/48] Added the rest of assembly startegies

---
 src/Assembly.jl      | 73 +++++++++++++++++++++++++-------------------
 src/PArraysExtras.jl |  4 +++
 2 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index e0b33745..a5f10c9c 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -262,34 +262,37 @@ function create_from_nz_locally_assembled(
   a,
   callback::Function = rows -> nothing
 )
-  # Recover some data
   I,J,V = get_allocations(a)
-  test_gids, trial_gids = axes(a)
+  test_ids = partition(axes(a,1))
+  trial_ids = partition(axes(a,2))
 
-  rows = remove_ghost(unpermute(test_gids))
+  rows = map(remove_ghost,map(unpermute,test_ids))
   b = callback(rows)
 
-  # convert I and J to global dof ids
-  to_global_indices!(I,test_gids;ax=:rows)
-  to_global_indices!(J,trial_gids;ax=:cols)
+  map(map_local_to_global!,I,test_ids)
+  map(map_local_to_global!,J,trial_ids)
 
-  # Create the range for cols
-  cols = _setup_prange(trial_gids,J;ax=:cols)
+  # TODO: replace_ghost or union_ghost?
+  # Actually, do we want to change the ghosts at all? We could return the unpermuted trial_ids
+  J_owners = find_owner(trial_ids,J)
+  cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners)
 
-  # Convert again I,J to local numeration
-  to_local_indices!(I,rows;ax=:rows)
-  to_local_indices!(J,cols;ax=:cols)
+  map(map_global_to_local!,I,rows)
+  map(map_global_to_local!,J,cols)
+
+  assembled = true
+  a_sys = change_axes(a,(PRange(rows),PRange(cols)))
+  values = map(create_from_nz,local_views(a_sys))
+  A = PSparseMatrix(values,rows,cols,assembled)
 
-  A = _setup_matrix(I,J,V,rows,cols)
   return A, b
 end
 
 function create_from_nz_assembled(
-  a, 
+  a,
   callback::Function = rows -> nothing,
-  async_callback::Function = b -> nothing
+  async_callback::Function = b -> empty_async_task
 )
-  # Recover some data
   I,J,V = get_allocations(a)
   test_ids = partition(axes(a,1))
   trial_ids = partition(axes(a,2))
@@ -298,36 +301,44 @@ function create_from_nz_assembled(
   map(map_local_to_global!,I,test_ids)
   map(map_local_to_global!,J,trial_ids)
 
-  # Move (I,J,V) triplets to the owner process of each row I.
+  # Overlapped COO communication and vector assembly
   rows = map(unpermute,test_ids)
-  # rows = replace_ghost(unpermute(test_gids),I,find_owner(test_gids,I))
   t = PartitionedArrays.assemble_coo!(I,J,V,rows)
-
-  # Overlap CSC communication with rhs assembly
   b = callback(rows)
-
-  # Wait the transfer to finish
   wait(t)
 
-  # Create the Prange for the cols
-  J_owners = find_owner(trial_ids,J)
-  cols = map(union_ghost,map(unpermute,trial_ids),J,J_owners)
-
   # Overlap rhs communications with CSC compression
   t2 = async_callback(b)
+  J_owners = find_owner(trial_ids,J)
+  cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) # TODO: replace_ghost or union_ghost?
 
-  # Convert again I,J to local numeration
   map(map_global_to_local!,I,rows)
   map(map_global_to_local!,J,cols)
 
+  assembled = true
   a_sys = change_axes(a,(PRange(rows),PRange(cols)))
   values = map(create_from_nz,local_views(a_sys))
-  A = PSparseMatrix(values,rows,cols,true)
+  A = PSparseMatrix(values,rows,cols,assembled)
 
-  # Wait the transfer to finish
-  if !isa(t2,Nothing)
-    wait(t2)
-  end
+  wait(t2)
+  return A, b
+end
+
+function create_from_nz_subassembled(
+  a,
+  callback::Function = rows -> nothing,
+  async_callback::Function = b -> empty_async_task
+)
+  rows = partition(axes(a,1))
+  cols = partition(axes(a,2))
+
+  b = callback(rows)
+  t2 = async_callback(b)
+
+  assembled = false
+  values = map(create_from_nz,local_views(a))
+  A = PSparseMatrix(values,rows,cols,assembled)
 
+  wait(t2)
   return A, b
 end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index be907629..4d87e2b0 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -111,6 +111,10 @@ function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
   return I[mask], J[mask], V[mask]
 end
 
+# Async tasks
+
+const empty_async_task = PartitionedArrays.FakeTask(x -> nothing)
+
 # Linear algebra
 
 function LinearAlgebra.axpy!(α,x::PVector,y::PVector)

From a820d9a6979b68f5808e784e0816530c430b1c45 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Tue, 5 Nov 2024 23:31:50 +1100
Subject: [PATCH 10/48] Bugfixes

---
 src/Assembly.jl        |  9 +++----
 src/PArraysExtras.jl   | 53 +++++++++++++++++++++++++++++++++++++++++-
 test/parrays_update.jl | 17 ++------------
 3 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index a5f10c9c..021e8151 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -147,9 +147,7 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:LocallyAssembled})
 end
 
 function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled})
-  callback(rows) = nothing
-  async_callback(b) = nothing
-  A, = create_from_nz_assembled(a,callback,async_callback)
+  A, = create_from_nz_assembled(a)
   return A
 end
 
@@ -302,15 +300,14 @@ function create_from_nz_assembled(
   map(map_local_to_global!,J,trial_ids)
 
   # Overlapped COO communication and vector assembly
-  rows = map(unpermute,test_ids)
+  rows = filter_and_replace_ghost(map(unpermute,test_ids),I)
   t = PartitionedArrays.assemble_coo!(I,J,V,rows)
   b = callback(rows)
   wait(t)
 
   # Overlap rhs communications with CSC compression
   t2 = async_callback(b)
-  J_owners = find_owner(trial_ids,J)
-  cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners) # TODO: replace_ghost or union_ghost?
+  cols = filter_and_replace_ghost(map(unpermute,trial_ids),J)
 
   map(map_global_to_local!,I,rows)
   map(map_global_to_local!,J,cols)
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 4d87e2b0..05c82163 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -98,6 +98,57 @@ function locally_repartition!(w::PVector,v::PVector)
   return w
 end
 
+"""
+    filter_and_replace_ghost(indices,gids)
+
+Replace ghost ids in `indices` with the ghost ids within `gids`.
+"""
+function filter_and_replace_ghost(indices,gids)
+  owners = find_owner(indices,gids)
+  new_indices = map(indices,gids,owners) do indices, gids, owners
+    ghost_gids, ghost_owners = _filter_ghost(indices,gids,owners)
+    replace_ghost(indices, ghost_gids, ghost_owners)
+  end
+  return new_indices
+end
+
+# Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that 
+# belong to `indices`.
+function _filter_ghost(indices,gids,owners)
+  ghosts = Set{Int}()
+  part_owner = part_id(indices)
+
+  n_ghost = 0
+  for (gid,owner) in zip(gids,owners)
+    if gid < 1
+      continue
+    end
+    if (owner != part_owner) && !(gid in ghosts)
+      n_ghost += 1
+      push!(ghosts,gid)
+    end
+  end
+
+  new_ghost_to_global = zeros(Int,n_ghost)
+  new_ghost_to_owner = zeros(Int32,n_ghost)
+
+  empty!(ghosts)
+  n_ghost = 0
+  for (gid,owner) in zip(gids,owners)
+    if gid < 1
+      continue
+    end
+    if (owner != part_owner) && !(gid in ghosts)
+      n_ghost += 1
+      new_ghost_to_global[n_ghost] = gid
+      new_ghost_to_owner[n_ghost] = owner
+      push!(ghosts,gid)
+    end
+  end
+
+  return new_ghost_to_global, new_ghost_to_owner
+end
+
 # SubSparseMatrix extensions
 
 function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
@@ -113,7 +164,7 @@ end
 
 # Async tasks
 
-const empty_async_task = PartitionedArrays.FakeTask(x -> nothing)
+const empty_async_task = PartitionedArrays.FakeTask(() -> nothing)
 
 # Linear algebra
 
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index 9e396812..fd62332b 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -37,7 +37,8 @@ assem = SparseMatrixAssembler(dist_V,dist_V)
 dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V)
 all(centralize(dist_A1) - serial_A1 .< 1e-10)
 
-centralize(dist_A1)
+
+############################################################################################
 
 function PartitionedArrays.precompute_nzindex(A,I,J;skip=false)
   K = zeros(Int32,length(I))
@@ -50,20 +51,6 @@ function PartitionedArrays.precompute_nzindex(A,I,J;skip=false)
   K
 end
 
-Aoo = own_own_values(dist_A1).items[1]
-
-using SparseArrays
-function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
-  I,J,V = findnz(A.parent)
-  rowmap, colmap = A.inv_indices
-  for k in eachindex(I)
-    I[k] = rowmap[I[k]]
-    J[k] = colmap[J[k]]
-  end
-  mask = map((i,j) -> (i > 0 && j > 0), I, J)
-  return I[mask], J[mask], V[mask]
-end
-
 ############################################################################################
 
 ids = cids

From 7b3a94fb03b27034a803055721979860871ebc5d Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Tue, 5 Nov 2024 23:39:15 +1100
Subject: [PATCH 11/48] MInor

---
 src/Assembly.jl      | 5 +----
 src/PArraysExtras.jl | 6 +++++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 021e8151..a538eaa1 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -270,10 +270,7 @@ function create_from_nz_locally_assembled(
   map(map_local_to_global!,I,test_ids)
   map(map_local_to_global!,J,trial_ids)
 
-  # TODO: replace_ghost or union_ghost?
-  # Actually, do we want to change the ghosts at all? We could return the unpermuted trial_ids
-  J_owners = find_owner(trial_ids,J)
-  cols = map(replace_ghost,map(unpermute,trial_ids),J,J_owners)
+  cols = filter_and_replace_ghost(map(unpermute,trial_ids),J)
 
   map(map_global_to_local!,I,rows)
   map(map_global_to_local!,J,cols)
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 05c82163..b139a547 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -102,6 +102,10 @@ end
     filter_and_replace_ghost(indices,gids)
 
 Replace ghost ids in `indices` with the ghost ids within `gids`.
+
+NOTE: The issue is that `replace_ghost` does not check if all gids are ghosts or whether 
+they are repeated. It also shifts ownership of the ghosts. Its all quite messy and not what we 
+would need. TODO: Make me better. 
 """
 function filter_and_replace_ghost(indices,gids)
   owners = find_owner(indices,gids)
@@ -113,7 +117,7 @@ function filter_and_replace_ghost(indices,gids)
 end
 
 # Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that 
-# belong to `indices`.
+# belong to `indices`. This could eventually be a flag in the original function.
 function _filter_ghost(indices,gids,owners)
   ghosts = Set{Int}()
   part_owner = part_id(indices)

From 9c5b9682e8ac0d006815b2b995f41c7cfca9f4b1 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Wed, 26 Mar 2025 12:48:56 +1100
Subject: [PATCH 12/48] Saving changes

---
 src/Algebra.jl         |  78 +++++++++++++++++-
 src/Assembly.jl        | 174 +++++++++++++++++++++++++++++++++++------
 src/FESpaces.jl        | 148 -----------------------------------
 src/PArraysExtras.jl   |  20 ++++-
 test/parrays_update.jl |  46 +++++++++--
 5 files changed, 286 insertions(+), 180 deletions(-)

diff --git a/src/Algebra.jl b/src/Algebra.jl
index 3fa0b502..64410c95 100644
--- a/src/Algebra.jl
+++ b/src/Algebra.jl
@@ -68,9 +68,11 @@ end
 # change_ghost
 
 function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T
-  same_partition = (a.index_partition === partition(ids))
-  a_new = same_partition ? a : change_ghost(T,a,ids)
-  if make_consistent && (!same_partition || !is_consistent)
+  if (a.index_partition === partition(ids))
+    return a
+  end
+  a_new = change_ghost(T,a,ids)
+  if make_consistent && !is_consistent
     consistent!(a_new) |> wait
   end
   return a_new
@@ -98,3 +100,73 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_
   end
   return BlockPVector(vals,ids)
 end
+
+# This type is required in order to be able to access the local portion 
+# of distributed sparse matrices and vectors using local indices from the 
+# distributed test and trial spaces
+struct LocalView{T,N,A} <:AbstractArray{T,N}
+  plids_to_value::A
+  d_to_lid_to_plid::NTuple{N,Vector{Int32}}
+  local_size::NTuple{N,Int}
+  function LocalView(
+    plids_to_value::AbstractArray{T,N}, d_to_lid_to_plid::NTuple{N,Vector{Int32}}
+  ) where {T,N}
+    A = typeof(plids_to_value)
+    local_size = map(length,d_to_lid_to_plid)
+    new{T,N,A}(plids_to_value,d_to_lid_to_plid,local_size)
+  end
+end
+
+Base.size(a::LocalView) = a.local_size
+Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian()
+function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N}
+  plids = map(getindex,a.d_to_lid_to_plid,lids)
+  if all(i->i>0,plids)
+    a.plids_to_value[plids...]
+  else
+    zero(T)
+  end
+end
+function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N}
+  plids = map(getindex,a.d_to_lid_to_plid,lids)
+  @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion"
+  a.plids_to_value[plids...] = v
+end
+
+function local_views(a::PVector,new_rows::PRange)
+  old_rows = axes(a,1)
+  if partition(old_rows) === partition(new_rows)
+    partition(a)
+  else
+    map(partition(a),partition(old_rows),partition(new_rows)) do a,old_rows,new_rows
+      LocalView(a,(local_to_local_map(new_rows,old_rows),))
+    end
+  end
+end
+
+function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange)
+  old_rows, old_cols = axes(a)
+  if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) )
+    partition(a)
+  else
+    map(
+      partition(a),partition(old_rows),partition(old_cols),partition(new_rows),partition(new_cols)
+    ) do a,old_rows,old_cols,new_rows,new_cols
+      rl2lmap = local_to_local_map(new_rows,old_rows)
+      cl2lmap = local_to_local_map(new_cols,old_cols)
+      LocalView(a,(rl2lmap,cl2lmap))
+    end
+  end
+end
+
+function local_views(a::BlockPVector,new_rows::BlockPRange)
+  vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays
+  return map(mortar,vals)
+end
+
+function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange)
+  vals = map(CartesianIndices(blocksize(a))) do I
+    local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I])
+  end |> to_parray_of_arrays
+  return map(mortar,vals)
+end
diff --git a/src/Assembly.jl b/src/Assembly.jl
index a538eaa1..165d41c4 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -5,23 +5,36 @@ struct Assembled <: FESpaces.AssemblyStrategy end
 struct SubAssembled <: FESpaces.AssemblyStrategy end
 struct LocallyAssembled <: FESpaces.AssemblyStrategy end
 
+function local_assembly_strategy(::Union{Assembled,SubAssembled},rows,cols)
+  DefaultAssemblyStrategy()
+end
+
+# When LocallyAssembling, make sure that you also loop over ghost cells.
+function local_assembly_strategy(::LocallyAssembled,rows,cols)
+  rows_local_to_ghost = local_to_ghost(rows)
+  GenericAssemblyStrategy(
+    identity,
+    identity,
+    row->iszero(rows_local_to_ghost[row]),
+    col->true
+  )
+end
+
 # PSparseMatrix and PVector builders
 
-struct PSparseMatrixBuilder{T,B}
-  local_matrix_type::Type{T}
+struct DistributedArrayBuilder{T,B}
+  local_array_type::Type{T}
   strategy::B
 end
 
+const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T<:AbstractMatrix,B}
+const PVectorBuilder{T,B} = DistributedArrayBuilder{T<:AbstractVector,B}
+
 function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv
   T = eltype(Tv)
   return PSparseMatrix{T}
 end
 
-struct PVectorBuilder{T,B}
-  local_vector_type::Type{T}
-  strategy::B
-end
-
 function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
   T = eltype(Tv)
   return PVector{T}
@@ -53,6 +66,11 @@ Base.axes(a::DistributedCounter) = a.axes
 Base.axes(a::DistributedCounter,d::Integer) = a.axes[d]
 local_views(a::DistributedCounter) = a.counters
 
+function local_views(a::DistributedCounter,axes::PRange...)
+  @check all(map(PArrays.matching_local_indices,axes,a.axes))
+  return a.counters
+end
+
 const PVectorCounter{S,T,A,B} = DistributedCounter{S,T,1,A,B}
 Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
 
@@ -83,6 +101,11 @@ Base.axes(a::DistributedAllocation) = a.axes
 Base.axes(a::DistributedAllocation,d::Integer) = a.axes[d]
 local_views(a::DistributedAllocation) = a.allocs
 
+function local_views(a::DistributedAllocation,axes::PRange...)
+  @check all(map(PArrays.matching_local_indices,axes,a.axes))
+  return a.allocs
+end
+
 function change_axes(a::DistributedAllocation{S,T,N},axes::NTuple{N,<:PRange}) where {S,T,N}
   indices = map(partition,axes)
   local_axes = map(indices...) do indices...
@@ -151,6 +174,11 @@ function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:Assembled})
   return A
 end
 
+function Algebra.create_from_nz(a::PSparseMatrixAllocation{<:SubAssembled})
+  A, = create_from_nz_subassembled(a)
+  return A
+end
+
 # PVector assembly chain:
 #
 #   1 - nz_counter(PVectorBuilder) -> PVectorCounter
@@ -166,7 +194,7 @@ function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) wh
   DistributedCounter(counters,axs,builder.strategy)
 end
 
-function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled})
+function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}})
   allocs = map(nz_allocation,local_views(a))
   DistributedAllocation(allocs,a.axes,a.strategy)
 end
@@ -188,14 +216,14 @@ function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVecto
   @assert false
 end
 
-function Algebra.create_from_nz(a::PVectorAllocation{<:LocallyAssembled,<:AbstractVector})
+function Algebra.create_from_nz(a::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}})
   rows = remove_ghost(unpermute(axes(a,1)))
   values = local_views(a)
   b = rhs_callback(values,rows)
   return b
 end
 
-function Algebra.create_from_nz(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S
+function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled})
   rows = collect_touched_ids(a)
   values = map(ai -> ai.values, local_views(a))
   b  = rhs_callback(values,rows)
@@ -215,23 +243,19 @@ end
 # This is done by using the following specializations:
 
 function Arrays.nz_allocation(
-  a::PSparseMatrixCounter{<:Assembled},
-  b::PVectorCounter{<:Assembled}
+  a::PSparseMatrixCounter, b::PVectorCounter
 )
-  A = nz_allocation(a) # PSparseMatrixAllocation
-  B = nz_allocation(b) # PVectorAllocation{<:Assembled,<:TrackedArrayAllocation}
-  return A, B
+  return nz_allocation(a), nz_allocation(b)
 end
 
 function Algebra.create_from_nz(
   a::PSparseMatrixAllocation{<:LocallyAssembled},
-  b::PVectorAllocation{<:LocallyAssembled,<:AbstractVector}
+  b::PVectorAllocation{<:LocallyAssembled}
 )
   function callback(rows)
-    new_indices = partition(rows)
     values = local_views(b)
     b_fespace = PVector(values,partition(axes(b,1)))
-    locally_repartition(b_fespace,new_indices)
+    locally_repartition(b_fespace,rows)
   end
   A, B = create_from_nz_locally_assembled(a,callback)
   return A, B
@@ -239,7 +263,7 @@ end
 
 function Algebra.create_from_nz(
   a::PSparseMatrixAllocation{<:Assembled},
-  b::PVectorAllocation{<:Assembled,<:TrackedArrayAllocation}
+  b::PVectorAllocation{<:Assembled}
 )
   function callback(rows)
     new_indices = collect_touched_ids(b)
@@ -254,6 +278,19 @@ function Algebra.create_from_nz(
   return A, B
 end
 
+function Algebra.create_from_nz(
+  a::PSparseMatrixAllocation{<:SubAssembled},
+  b::PVectorAllocation{<:SubAssembled}
+)
+  function callback(rows)
+    @check matching_local_indices(rows,axes(b,1))
+    values = local_views(b)
+    PVector(values,partition(axes(b,1)))
+  end
+  A, B = create_from_nz_subassembled(a,callback)
+  return A, B
+end
+
 # Low-level assembly methods
 
 function create_from_nz_locally_assembled(
@@ -321,18 +358,111 @@ end
 function create_from_nz_subassembled(
   a,
   callback::Function = rows -> nothing,
-  async_callback::Function = b -> empty_async_task
 )
   rows = partition(axes(a,1))
   cols = partition(axes(a,2))
 
   b = callback(rows)
-  t2 = async_callback(b)
 
   assembled = false
   values = map(create_from_nz,local_views(a))
   A = PSparseMatrix(values,rows,cols,assembled)
 
-  wait(t2)
   return A, b
 end
+
+# DistributedSparseMatrixAssemblers
+
+"""
+"""
+struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+  strategy::A
+  assems::B
+  matrix_builder::C
+  vector_builder::D
+  test_gids::E
+  trial_gids::F
+end
+
+local_views(a::DistributedSparseMatrixAssembler) = a.assems
+
+FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
+FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
+FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
+FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
+FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  rows::PRange,
+  cols::PRange,
+  par_strategy=Assembled()
+)
+  assems = map(partition(rows),partition(cols)) do rows,cols
+    FESpaces.GenericSparseMatrixAssembler(
+      SparseMatrixBuilder(local_mat_type), 
+      ArrayBuilder(local_vec_type),
+      Base.OneTo(length(rows)),
+      Base.OneTo(length(cols)),
+      local_assembly_strategy(par_strategy,rows,cols)
+    )
+  end
+  mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy)
+  vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy)
+  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  rows = get_free_dof_ids(test)
+  cols = get_free_dof_ids(trial)
+  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  Tv = getany(map(get_vector_type,local_views(trial)))
+  Tm = SparseMatrixCSC{eltype(Tv),Int}
+  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...)
+end
+
+function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
+  rows, cols = get_rows(a), get_cols(a)
+  map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+end
+
+function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
+  rows, cols = get_rows(a), get_cols(a)
+  map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+end
+
+function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
+  rows = get_rows(a)
+  map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata)
+end
+
+function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
+  rows = get_rows(a)
+  map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata)
+end
+
+function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
+  rows, cols = get_rows(a), get_cols(a)
+  map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)  
+end
+
+function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
+  rows, cols = get_rows(a), get_cols(a)
+  map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)
+end
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 1fb12343..4724789b 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -582,154 +582,6 @@ function FESpaces.collect_cell_matrix_and_vector(
   end
 end
 
-"""
-"""
-struct DistributedSparseMatrixAssembler{A,B,C,D,E,F,G} <: SparseMatrixAssembler
-  strategy::A
-  assems::B
-  matrix_builder::C
-  vector_builder::D
-  test_gids::E
-  trial_gids::F
-  caches::G
-end
-
-local_views(a::DistributedSparseMatrixAssembler) = a.assems
-
-FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
-FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
-FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
-FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
-FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
-
-function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  map(symbolic_loop_matrix!,local_views(A),local_views(a),matdata)
-end
-
-function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  map(numeric_loop_matrix!,local_views(A),local_views(a),matdata)
-end
-
-function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
-  rows = get_rows(a)
-  map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata)
-end
-
-function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
-  rows = get_rows(a)
-  map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata)
-end
-
-function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  Aviews=local_views(A,rows,cols)
-  bviews=local_views(b,rows)
-  aviews=local_views(a)
-  map(symbolic_loop_matrix_and_vector!,Aviews,bviews,aviews,data)  
-end
-
-function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
-  rows = get_rows(a)
-  cols = get_cols(a)
-  map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)
-end
-
-function FESpaces.assemble_matrix!(mat::PSparseMatrix,a::DistributedSparseMatrixAssembler,matdata)
-  @check !isnothing(a.caches) "Assembler is not reusable!"
-  @check haskey(a.caches,objectid(mat)) "Cache not found!"
-  cache = a.caches[objectid(mat)]
-
-  LinearAlgebra.fillstored!(mat,zero(eltype(mat)))
-
-  counter = nz_counter(get_matrix_builder(a),(get_rows(a),get_cols(a)))
-  map(local_views(counter),partition(mat)) do counter, mat
-    counter.nnz = nnz(mat)
-  end
-  allocs = nz_allocation(counter)
-  numeric_loop_matrix!(allocs,a,matdata)
-
-  create_from_nz!(mat,allocs,cache)
-end
-
-function create_from_nz!(A,a::DistributedAllocation{<:Assembled},cache)
-  _,_,V = get_allocations(a)
-  psparse!(A,V,cache) |> wait
-  return A
-end
-
-# Parallel Assembly strategies
-
-function local_assembly_strategy(::Assembled,rows,cols)
-  DefaultAssemblyStrategy()
-end
-
-# When using this one, make sure that you also loop over ghost cells.
-# This is at your own risk.
-function local_assembly_strategy(::LocallyAssembled,rows,cols)
-  rows_local_to_ghost = local_to_ghost(rows)
-  GenericAssemblyStrategy(
-    identity,
-    identity,
-    row->iszero(rows_local_to_ghost[row]),
-    col->true
-  )
-end
-
-# Assembler high level constructors
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  rows::PRange,
-  cols::PRange,
-  par_strategy=Assembled();
-  reuse_caches=false
-)
-  assems = map(partition(rows),partition(cols)) do rows,cols
-    local_strategy = local_assembly_strategy(par_strategy,rows,cols)
-    FESpaces.GenericSparseMatrixAssembler(
-      SparseMatrixBuilder(local_mat_type),
-      ArrayBuilder(local_vec_type),
-      Base.OneTo(length(rows)),
-      Base.OneTo(length(cols)),
-      local_strategy
-    )
-  end
-  mat_builder = PSparseMatrixBuilder(local_mat_type,par_strategy)
-  vec_builder = PVectorBuilder(local_vec_type,par_strategy)
-  if reuse_caches
-    caches = Dict{UInt64,Any}()
-  else
-    caches = nothing
-  end
-  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols,caches)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy::FESpaces.AssemblyStrategy=Assembled();
-  kwargs...
-)
-  rows = get_free_dof_ids(test)
-  cols = get_free_dof_ids(trial)
-  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy::FESpaces.AssemblyStrategy=Assembled();
-  kwargs...
-)
-  Tv = PartitionedArrays.getany(map(get_vector_type,local_views(trial)))
-  T  = eltype(Tv)
-  Tm = SparseMatrixCSC{T,Int}
-  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...)
-end
-
 # ZeroMean FESpace
 struct DistributedZeroMeanCache{A,B}
   dvol::A
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index b139a547..df09bdb0 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -153,6 +153,25 @@ function _filter_ghost(indices,gids,owners)
   return new_ghost_to_global, new_ghost_to_owner
 end
 
+# This function computes a mapping among the local identifiers of a and b
+# for which the corresponding global identifiers are both in a and b. 
+# Note that the haskey check is necessary because in the general case 
+# there might be gids in b which are not present in a
+function local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices)
+  a_lid_to_b_lid = fill(zero(Int32),local_length(a))
+
+  b_local_to_global  = local_to_global(b)
+  a_global_to_local  = global_to_local(a)
+  for b_lid in 1:local_length(b)
+    gid = b_local_to_global[b_lid]
+    a_lid = a_global_to_local[gid]
+    if !iszero(a_lid)
+      a_lid_to_b_lid[a_lid] = b_lid
+    end  
+  end
+  a_lid_to_b_lid
+end
+
 # SubSparseMatrix extensions
 
 function SparseArrays.findnz(A::PartitionedArrays.SubSparseMatrix)
@@ -362,4 +381,3 @@ end
 function generate_subparts(parts::DebugArray,new_comm_size)
   DebugArray(LinearIndices((new_comm_size,)))
 end
-
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index fd62332b..8516150e 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -3,6 +3,8 @@ using Gridap
 using PartitionedArrays
 using GridapDistributed
 
+using Gridap.FESpaces, Gridap.Algebra
+
 np = (1,2)
 ranks = with_debug() do distribute
   distribute(LinearIndices((prod(np),)))
@@ -29,14 +31,46 @@ serial_dΩ = Measure(serial_Ω,2)
 dist_Ω = Triangulation(dist_model)
 dist_dΩ = Measure(dist_Ω,2)
 
-serial_a1(u,v) = ∫(u⋅v)*serial_dΩ
-serial_A1 = assemble_matrix(serial_a1,serial_V,serial_V)
+dist_Ωg = Triangulation(GridapDistributed.with_ghost,dist_model)
+dist_dΩg = Measure(dist_Ωg,2)
+
+serial_a(u,v) = ∫(u⋅v)*serial_dΩ
+dist_a(u,v) = ∫(u⋅v)*dist_dΩ
+dist_ag(u,v) = ∫(u⋅v)*dist_dΩg
+
+serial_A = assemble_matrix(serial_a,serial_V,serial_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.Assembled())
+dist_A_AS = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.LocallyAssembled())
+dist_A_LA = assemble_matrix(dist_ag,assem,dist_V,dist_V)
+
+assem = SparseMatrixAssembler(dist_V,dist_V,GridapDistributed.SubAssembled())
+dist_A_SA = assemble_matrix(dist_a,assem,dist_V,dist_V)
+
+all(centralize(dist_A_AS) - serial_A .< 1e-10)
+
+x_AS = prand(partition(axes(dist_A_AS,2)))
+x_LA = GridapDistributed.change_ghost(x_AS,axes(dist_A_LA,2))
+x_SA = GridapDistributed.change_ghost(x_AS,axes(dist_A_SA,2))
+
+norm(dist_A_AS*x_AS - dist_A_LA*x_LA)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
+
+assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
+
+############################################################################################
+
+A = deepcopy(dist_A_AS)
+
+t = assemble!(A;reuse=true)
+_, cache = fetch(t)
 
-dist_a1(u,v) = ∫(u⋅v)*dist_dΩ
-assem = SparseMatrixAssembler(dist_V,dist_V)
-dist_A1 = assemble_matrix(dist_a1,dist_V,dist_V)
-all(centralize(dist_A1) - serial_A1 .< 1e-10)
+t2 = assemble!(A,cache)
+wait(t2)
 
+map(==, partition(A), partition(dist_A_AS))
 
 ############################################################################################
 

From eef271e37b795fba23f11f4a9b64849f7c880319 Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Mon, 31 Mar 2025 07:53:26 +1100
Subject: [PATCH 13/48] saving changes

---
 src/Assembly.jl        | 114 +++----------------
 src/FESpaces.jl        | 248 ++++++++++++++++++++++++++++-------------
 test/parrays_update.jl |  59 +---------
 3 files changed, 185 insertions(+), 236 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 165d41c4..1e0127bd 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -22,16 +22,24 @@ end
 
 # PSparseMatrix and PVector builders
 
-struct DistributedArrayBuilder{T,B}
+struct DistributedArrayBuilder{T,N,B}
   local_array_type::Type{T}
   strategy::B
+  function DistributedArrayBuilder(
+    local_array_type::Type{T},
+    strategy::AssemblyStrategy
+  ) where T
+    N = ndims(T)
+    B = typeof(strategy)
+    new{T,N,B}(local_array_type,strategy)
+  end
 end
 
-const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T<:AbstractMatrix,B}
-const PVectorBuilder{T,B} = DistributedArrayBuilder{T<:AbstractVector,B}
+const PVectorBuilder{T,B} = DistributedArrayBuilder{T,1,B}
+const PSparseMatrixBuilder{T,B} = DistributedArrayBuilder{T,2,B}
 
-function Algebra.get_array_type(::PSparseMatrixBuilder{Tv}) where Tv
-  T = eltype(Tv)
+function Algebra.get_array_type(::PSparseMatrixBuilder{Tm}) where Tm
+  T = eltype(Tm)
   return PSparseMatrix{T}
 end
 
@@ -370,99 +378,3 @@ function create_from_nz_subassembled(
 
   return A, b
 end
-
-# DistributedSparseMatrixAssemblers
-
-"""
-"""
-struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
-  strategy::A
-  assems::B
-  matrix_builder::C
-  vector_builder::D
-  test_gids::E
-  trial_gids::F
-end
-
-local_views(a::DistributedSparseMatrixAssembler) = a.assems
-
-FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
-FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
-FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
-FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
-FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
-
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  rows::PRange,
-  cols::PRange,
-  par_strategy=Assembled()
-)
-  assems = map(partition(rows),partition(cols)) do rows,cols
-    FESpaces.GenericSparseMatrixAssembler(
-      SparseMatrixBuilder(local_mat_type), 
-      ArrayBuilder(local_vec_type),
-      Base.OneTo(length(rows)),
-      Base.OneTo(length(cols)),
-      local_assembly_strategy(par_strategy,rows,cols)
-    )
-  end
-  mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy)
-  vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy)
-  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  local_mat_type,
-  local_vec_type,
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy::FESpaces.AssemblyStrategy=Assembled();
-  kwargs...
-)
-  rows = get_free_dof_ids(test)
-  cols = get_free_dof_ids(trial)
-  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...)
-end
-
-function FESpaces.SparseMatrixAssembler(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  par_strategy::FESpaces.AssemblyStrategy=Assembled();
-  kwargs...
-)
-  Tv = getany(map(get_vector_type,local_views(trial)))
-  Tm = SparseMatrixCSC{eltype(Tv),Int}
-  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...)
-end
-
-function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows, cols = get_rows(a), get_cols(a)
-  map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
-end
-
-function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
-  rows, cols = get_rows(a), get_cols(a)
-  map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
-end
-
-function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
-  rows = get_rows(a)
-  map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata)
-end
-
-function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
-  rows = get_rows(a)
-  map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata)
-end
-
-function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
-  rows, cols = get_rows(a), get_cols(a)
-  map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)  
-end
-
-function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
-  rows, cols = get_rows(a), get_cols(a)
-  map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)
-end
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 4724789b..3121e118 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -505,83 +505,6 @@ function _add_distributed_constraint(F::DistributedFESpace,order::Integer,constr
   V
 end
 
-# Assembly
-
-function FESpaces.collect_cell_matrix(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  a::DistributedDomainContribution
-)
-  map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a))
-end
-
-function FESpaces.collect_cell_vector(
-  test::DistributedFESpace, a::DistributedDomainContribution
-)
-  map(collect_cell_vector,local_views(test),local_views(a))
-end
-
-function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  biform::DistributedDomainContribution,
-  liform::DistributedDomainContribution
-)
-  map(collect_cell_matrix_and_vector,
-    local_views(trial),
-    local_views(test),
-    local_views(biform),
-    local_views(liform)
-  )
-end
-
-function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  biform::DistributedDomainContribution,
-  liform::DistributedDomainContribution,
-  uhd
-)
-  map(collect_cell_matrix_and_vector,
-    local_views(trial),
-    local_views(test),
-    local_views(biform),
-    local_views(liform),
-    local_views(uhd)
-  )
-end
-
-function FESpaces.collect_cell_vector(
-  test::DistributedFESpace,l::Number
-)
-  map(local_views(test)) do s
-    collect_cell_vector(s,l)
-  end
-end
-
-function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  mat::DistributedDomainContribution,
-  l::Number
-)
-  map(local_views(trial),local_views(test),local_views(mat)) do u,v,m
-    collect_cell_matrix_and_vector(u,v,m,l)
-  end
-end
-
-function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  mat::DistributedDomainContribution,
-  l::Number,
-  uhd
-)
-  map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f
-    collect_cell_matrix_and_vector(u,v,m,l,f)
-  end
-end
-
 # ZeroMean FESpace
 struct DistributedZeroMeanCache{A,B}
   dvol::A
@@ -741,3 +664,174 @@ function FESpaces.ConstantFESpace(
   vector_type = _find_vector_type(spaces,gids)
   return DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
 end
+
+# Assembly
+
+"""
+"""
+struct DistributedSparseMatrixAssembler{A,B,C,D,E,F} <: SparseMatrixAssembler
+  strategy::A
+  assems::B
+  matrix_builder::C
+  vector_builder::D
+  test_gids::E
+  trial_gids::F
+end
+
+local_views(a::DistributedSparseMatrixAssembler) = a.assems
+
+FESpaces.get_rows(a::DistributedSparseMatrixAssembler) = a.test_gids
+FESpaces.get_cols(a::DistributedSparseMatrixAssembler) = a.trial_gids
+FESpaces.get_matrix_builder(a::DistributedSparseMatrixAssembler) = a.matrix_builder
+FESpaces.get_vector_builder(a::DistributedSparseMatrixAssembler) = a.vector_builder
+FESpaces.get_assembly_strategy(a::DistributedSparseMatrixAssembler) = a.strategy
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  rows::PRange,
+  cols::PRange,
+  par_strategy=Assembled()
+)
+  assems = map(partition(rows),partition(cols)) do rows,cols
+    FESpaces.GenericSparseMatrixAssembler(
+      SparseMatrixBuilder(local_mat_type), 
+      ArrayBuilder(local_vec_type),
+      Base.OneTo(length(rows)),
+      Base.OneTo(length(cols)),
+      local_assembly_strategy(par_strategy,rows,cols)
+    )
+  end
+  mat_builder = DistributedArrayBuilder(local_mat_type,par_strategy)
+  vec_builder = DistributedArrayBuilder(local_vec_type,par_strategy)
+  return DistributedSparseMatrixAssembler(par_strategy,assems,mat_builder,vec_builder,rows,cols)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  local_mat_type,
+  local_vec_type,
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  rows = get_free_dof_ids(test)
+  cols = get_free_dof_ids(trial)
+  SparseMatrixAssembler(local_mat_type,local_vec_type,rows,cols,par_strategy;kwargs...)
+end
+
+function FESpaces.SparseMatrixAssembler(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  par_strategy::FESpaces.AssemblyStrategy=Assembled();
+  kwargs...
+)
+  Tv = getany(map(get_vector_type,local_views(trial)))
+  Tm = SparseMatrixCSC{eltype(Tv),Int}
+  SparseMatrixAssembler(Tm,Tv,trial,test,par_strategy;kwargs...)
+end
+
+function FESpaces.symbolic_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
+  rows, cols = get_rows(a), get_cols(a)
+  map(symbolic_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+end
+
+function FESpaces.numeric_loop_matrix!(A,a::DistributedSparseMatrixAssembler,matdata)
+  rows, cols = get_rows(a), get_cols(a)
+  map(numeric_loop_matrix!,local_views(A,rows,cols),local_views(a),matdata)
+end
+
+function FESpaces.symbolic_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
+  rows = get_rows(a)
+  map(symbolic_loop_vector!,local_views(b,rows),local_views(a),vecdata)
+end
+
+function FESpaces.numeric_loop_vector!(b,a::DistributedSparseMatrixAssembler,vecdata)
+  rows = get_rows(a)
+  map(numeric_loop_vector!,local_views(b,rows),local_views(a),vecdata)
+end
+
+function FESpaces.symbolic_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
+  rows, cols = get_rows(a), get_cols(a)
+  map(symbolic_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)  
+end
+
+function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrixAssembler,data)
+  rows, cols = get_rows(a), get_cols(a)
+  map(numeric_loop_matrix_and_vector!,local_views(A,rows,cols),local_views(b,rows),local_views(a),data)
+end
+
+function FESpaces.collect_cell_matrix(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  a::DistributedDomainContribution
+)
+  map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a))
+end
+
+function FESpaces.collect_cell_vector(
+  test::DistributedFESpace, a::DistributedDomainContribution
+)
+  map(collect_cell_vector,local_views(test),local_views(a))
+end
+
+function FESpaces.collect_cell_matrix_and_vector(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  biform::DistributedDomainContribution,
+  liform::DistributedDomainContribution
+)
+  map(collect_cell_matrix_and_vector,
+    local_views(trial),
+    local_views(test),
+    local_views(biform),
+    local_views(liform)
+  )
+end
+
+function FESpaces.collect_cell_matrix_and_vector(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  biform::DistributedDomainContribution,
+  liform::DistributedDomainContribution,
+  uhd
+)
+  map(collect_cell_matrix_and_vector,
+    local_views(trial),
+    local_views(test),
+    local_views(biform),
+    local_views(liform),
+    local_views(uhd)
+  )
+end
+
+function FESpaces.collect_cell_vector(
+  test::DistributedFESpace,l::Number
+)
+  map(local_views(test)) do s
+    collect_cell_vector(s,l)
+  end
+end
+
+function FESpaces.collect_cell_matrix_and_vector(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  mat::DistributedDomainContribution,
+  l::Number
+)
+  map(local_views(trial),local_views(test),local_views(mat)) do u,v,m
+    collect_cell_matrix_and_vector(u,v,m,l)
+  end
+end
+
+function FESpaces.collect_cell_matrix_and_vector(
+  trial::DistributedFESpace,
+  test::DistributedFESpace,
+  mat::DistributedDomainContribution,
+  l::Number,
+  uhd
+)
+  map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f
+    collect_cell_matrix_and_vector(u,v,m,l,f)
+  end
+end
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index 8516150e..961cbfcd 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -59,64 +59,7 @@ norm(dist_A_AS*x_AS - dist_A_LA*x_LA)
 norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
 
 assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
+norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
 
 ############################################################################################
 
-A = deepcopy(dist_A_AS)
-
-t = assemble!(A;reuse=true)
-_, cache = fetch(t)
-
-t2 = assemble!(A,cache)
-wait(t2)
-
-map(==, partition(A), partition(dist_A_AS))
-
-############################################################################################
-
-function PartitionedArrays.precompute_nzindex(A,I,J;skip=false)
-  K = zeros(Int32,length(I))
-  for (p,(i,j)) in enumerate(zip(I,J))
-      if !skip && (i < 1 || j < 1)
-          continue
-      end
-      K[p] = nzindex(A,i,j)
-  end
-  K
-end
-
-############################################################################################
-
-ids = cids
-indices = partition(ids)
-
-n_own = own_length(cids)
-ghost2global = ghost_to_global(cids)
-ghost2owner = ghost_to_owner(cids)
-
-first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
-n_global = reduce(+,n_own)
-n_parts = length(ranks)
-indices2 = map(ranks,n_own,first_gid,ghost2global,ghost2owner) do rank,n_own,start,ghost2global,ghost2owner
-  p = CartesianIndex((rank,))
-  np = (n_parts,)
-  n = (n_global,)
-  ranges = ((1:n_own).+(start-1),)
-  ghost = GhostIndices(n_global,ghost2global,ghost2owner)
-
-  PartitionedArrays.LocalIndicesWithVariableBlockSize(p,np,n,ranges,ghost)
-end
-
-map(indices,indices2) do indices, indices2
-  ghost2local = ghost_to_local(indices)
-  own2local = own_to_local(indices)
-
-  n_own = own_length(indices)
-  n_ghost = ghost_length(indices)
-
-  #perm = vcat(own2local,ghost2local)
-  perm = fill(0,local_length(indices))
-  perm[own2local] .= 1:n_own
-  perm[ghost2local] .= (n_own+1):(n_ghost+n_own)
-  permute_indices(indices2,perm)
-end

From cb9de8ec348a09d14670b606e5dda3918177c71e Mon Sep 17 00:00:00 2001
From: JordiManyer <jordi.manyer@monash.edu>
Date: Mon, 31 Mar 2025 10:13:45 +1100
Subject: [PATCH 14/48] Bugfixes

---
 src/Assembly.jl          | 19 ++++++++++---------
 src/FESpaces.jl          |  3 +--
 src/PArraysExtras.jl     | 19 ++++++++++++++++++-
 test/PeriodicBCsTests.jl |  2 ++
 test/parrays_update.jl   |  1 -
 5 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 1e0127bd..41673c5e 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -133,18 +133,19 @@ function get_allocations(a::PSparseMatrixAllocation{S,<:Algebra.AllocationCOO})
   return I,J,V
 end
 
-function collect_touched_ids(a::PVectorAllocation{<:TrackedArrayAllocation})
-  map(local_views(a),partition(axes(a,1))) do a, ids
+function collect_touched_ids(a::PVectorAllocation{S,<:TrackedArrayAllocation}) where S
+  touched_ids = map(local_views(a),partition(axes(a,1))) do a, ids
+    n_global = global_length(ids)
     rows = remove_ghost(unpermute(ids))
 
     ghost_lids = ghost_to_local(ids)
-    ghost_owners = ghost_to_owner(ids)
-    touched_ghost_lids = filter(lid -> a.touched[lid], ghost_lids)
-    touched_ghost_owners = collect(ghost_owners[touched_ghost_lids])
+    touched_ghost_lids = collect(Int,filter(lid -> a.touched[lid], ghost_lids))
+    touched_ghost_owners = local_to_owner(ids)[touched_ghost_lids]
     touched_ghost_gids = to_global!(touched_ghost_lids, ids)
     ghost = GhostIndices(n_global,touched_ghost_gids,touched_ghost_owners)
-    return replace_ghost(rows,ghost)
+    replace_ghost(rows,ghost)
   end
+  return touched_ids
 end
 
 # PSparseMatrix assembly chain
@@ -193,13 +194,13 @@ end
 #   2 - nz_allocation(PVectorCounter) -> PVectorAllocation
 #   3 - create_from_nz(PVectorAllocation) -> PVector
 
-function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::Tuple{<:PRange}) where VT
+function Algebra.nz_counter(builder::PVectorBuilder{VT},axs::NTuple{1,<:PRange}) where VT
   rows, = axs
   counters = map(partition(rows)) do rows
     axs = (Base.OneTo(local_length(rows)),)
     nz_counter(ArrayBuilder(VT),axs)
   end
-  DistributedCounter(counters,axs,builder.strategy)
+  DistributedCounter(counters,(rows,),builder.strategy)
 end
 
 function Arrays.nz_allocation(a::PVectorCounter{<:Union{LocallyAssembled,SubAssembled}})
@@ -291,7 +292,7 @@ function Algebra.create_from_nz(
   b::PVectorAllocation{<:SubAssembled}
 )
   function callback(rows)
-    @check matching_local_indices(rows,axes(b,1))
+    @check PArrays.matching_local_indices(PRange(rows),axes(b,1))
     values = local_views(b)
     PVector(values,partition(axes(b,1)))
   end
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 3121e118..f68b7470 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -168,8 +168,7 @@ function generate_gids(
       cell_owner = cell_local_to_owner[cell]
       p = cell_to_gdofs.ptrs[cell]-1
       for (i,ldof) in enumerate(ldofs)
-        dof_owner = ldof_to_owner[ldof]
-        if ldof > 0 && (dof_owner == cell_owner)
+        if ldof > 0 && isequal(ldof_to_owner[ldof],cell_owner)
           ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
         end
       end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index df09bdb0..302944a6 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -3,7 +3,7 @@
     permuted_variable_partition(n_own,gids,owners; n_global, start)
 
 Create indices which are a permuted version of a variable_partition.
-The advantage of this wrt the `LocalIndices` type, is that we can compute 
+The advantage of this w.r.t. the `LocalIndices` type, is that we can compute 
 dof owners with minimal communications, since we only need the size of the blocks.
 
 NOTE: This is the type for our FESpace dof_ids.
@@ -153,6 +153,23 @@ function _filter_ghost(indices,gids,owners)
   return new_ghost_to_global, new_ghost_to_owner
 end
 
+# function PArrays.remove_ghost(indices::PermutedLocalIndices)
+#   n_global = global_length(indices) 
+#   own = OwnIndices(n_global,part_id(indices),own_to_global(indices))
+#   ghost = GhostIndices(n_global,Int[],Int32[])
+#   OwnAndGhostIndices(own,ghost)
+# end
+
+function PArrays.remove_ghost(indices::PermutedLocalIndices)
+  perm = indices.perm
+  own_to_local = indices.own_to_local
+  display(own_to_local)
+  println(issorted(own_to_local))
+  display(perm)
+  println("-------------------------------")
+  remove_ghost(indices.indices)
+end
+
 # This function computes a mapping among the local identifiers of a and b
 # for which the corresponding global identifiers are both in a and b. 
 # Note that the haskey check is necessary because in the general case 
diff --git a/test/PeriodicBCsTests.jl b/test/PeriodicBCsTests.jl
index 6177752d..247cb6fc 100644
--- a/test/PeriodicBCsTests.jl
+++ b/test/PeriodicBCsTests.jl
@@ -35,4 +35,6 @@ function main(distribute,parts)
 
 end
 
+main(DebugArray,(2,2))
+
 end # module
diff --git a/test/parrays_update.jl b/test/parrays_update.jl
index 961cbfcd..ec106730 100644
--- a/test/parrays_update.jl
+++ b/test/parrays_update.jl
@@ -62,4 +62,3 @@ assemble_matrix!(dist_a,dist_A_AS,assem,dist_V,dist_V)
 norm(dist_A_AS*x_AS - dist_A_SA*x_SA)
 
 ############################################################################################
-

From 185a03f01c82f5eca07a7e1b354140f763311d73 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Thu, 18 Sep 2025 09:35:53 +1000
Subject: [PATCH 15/48] Readability and efficiency changes to generate_gids

---
 src/FESpaces.jl | 177 ++++++++++++++++++------------------------------
 1 file changed, 65 insertions(+), 112 deletions(-)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 0a471b69..22046bfa 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -67,36 +67,26 @@ function FESpaces.gather_free_and_dirichlet_values(f::DistributedFESpace,cell_va
   return free_values, dirichlet_values
 end
 
-function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_prange)
-  map(cell_wise_vector,
-          dof_wise_vector,
-          cell_to_ldofs,
-          partition(cell_prange)) do cwv,dwv,cell_to_ldofs,indices
+function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids)
+  map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids
     cache  = array_cache(cell_to_ldofs)
-    ncells = length(cell_to_ldofs)
-    ptrs = cwv.ptrs
-    data = cwv.data
-    cell_own_to_local = own_to_local(indices)
-    for cell in cell_own_to_local
+    for cell in cell_ids
       ldofs = getindex!(cache,cell_to_ldofs,cell)
-      p = ptrs[cell]-1
+      p = cwv.ptrs[cell]-1
       for (i,ldof) in enumerate(ldofs)
         if ldof > 0
-          data[i+p] = dwv[ldof]
+          cwv.data[i+p] = dwv[ldof]
         end
       end
     end
   end
+  return cell_wise_vector
 end
 
-function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_range)
-  map(dof_wise_vector,
-      cell_wise_vector,
-      cell_to_ldofs,
-      partition(cell_range)) do dwv,cwv,cell_to_ldofs,indices
+function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids)
+  map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids
     cache = array_cache(cell_to_ldofs)
-    cell_ghost_to_local = ghost_to_local(indices)
-    for cell in cell_ghost_to_local
+    for cell in cell_ids
       ldofs = getindex!(cache,cell_to_ldofs,cell)
       p = cwv.ptrs[cell]-1
       for (i,ldof) in enumerate(ldofs)
@@ -106,25 +96,24 @@ function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,c
       end
     end
   end
+  return dof_wise_vector
 end
 
-function dof_wise_to_cell_wise(dof_wise_vector,cell_to_ldofs,cell_prange)
-  cwv = map(cell_to_ldofs) do cell_to_ldofs
-    cache = array_cache(cell_to_ldofs)
-    ncells = length(cell_to_ldofs)
-    ptrs = Vector{Int32}(undef,ncells+1)
-    for cell in 1:ncells
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      ptrs[cell+1] = length(ldofs)
-    end
-    PArrays.length_to_ptrs!(ptrs)
-    ndata = ptrs[end]-1
-    data = Vector{Int}(undef,ndata)
-    data .= -1
+function allocate_cell_wise_vector(T, cell_to_lids)
+  map(cell_to_lids) do cell_to_lids
+    ptrs = Arrays.generate_ptrs(cell_to_lids)
+    data = zeros(T,ptrs[end]-1)
     JaggedArray(data,ptrs)
   end
-  dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_prange)
-  cwv
+end
+
+function dof_wise_to_cell_wise(
+  dof_wise_vector, cell_to_ldofs, cell_ids; 
+  T = eltype(eltype(dof_wise_vector))
+)
+  cwv = allocate_cell_wise_vector(T,cell_to_ldofs)
+  dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids)
+  return cwv
 end
 
 function fetch_vector_ghost_values_cache(vector_partition,partition)
@@ -139,128 +128,92 @@ end
 function generate_gids(
   cell_range::PRange,
   cell_to_ldofs::AbstractArray{<:AbstractArray},
-  nldofs::AbstractArray{<:Integer})
+  nldofs::AbstractArray{<:Integer}
+)
+  ranks = linear_indices(partition(cell_range))
+  cell_ldofs_to_data = allocate_cell_wise_vector(Int, cell_to_ldofs)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_data,partition(cell_range))
 
   # Find and count number owned dofs
   ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs
     ldof_to_owner = fill(Int32(0),nldofs)
     cache = array_cache(cell_to_ldofs)
-    lcell_to_owner = local_to_owner(indices)
-    for cell in 1:length(cell_to_ldofs)
-      owner = lcell_to_owner[cell]
+    for (cell, owner) in enumerate(local_to_owner(indices))
       ldofs = getindex!(cache,cell_to_ldofs,cell)
       for ldof in ldofs
-        if ldof>0
-          # TODO this simple approach concentrates dofs
-          # in the last part and creates imbalances
+        if ldof > 0
+          # NOTE: this approach concentrates dofs in the last processor
           ldof_to_owner[ldof] = max(owner,ldof_to_owner[ldof])
         end
       end
     end
-    me = part_id(indices) 
-    nodofs = count(p->p==me,ldof_to_owner)
+    nodofs = count(isequal(part_id(indices)),ldof_to_owner)
     ldof_to_owner, nodofs
   end |> tuple_of_arrays
 
-  cell_ldofs_to_part = dof_wise_to_cell_wise(ldof_to_owner,
-                                              cell_to_ldofs,
-                                              cell_range)
-
-  # Note1 : this call potentially updates cell_prange with the 
-  #         info required to exchange info among nearest neighbours
-  #         so that it can be re-used in the future for other exchanges
-
-  # Note2 : we need to call reverse() as the senders and receivers are 
-  #         swapped in the AssemblyCache of partition(cell_range)
-
-  # Exchange the dof owners
-  cache_fetch=fetch_vector_ghost_values_cache(cell_ldofs_to_part,partition(cell_range))
-  fetch_vector_ghost_values!(cell_ldofs_to_part,cache_fetch) |> wait
-  
-  cell_wise_to_dof_wise!(ldof_to_owner,
-                         cell_ldofs_to_part,
-                         cell_to_ldofs,
-                         cell_range)
-
   # Find the global range of owned dofs
   first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs)))
-  
-  # Distribute gdofs to owned ones
-  ldof_to_gdof = map(first_gdof,ldof_to_owner,partition(cell_range)) do first_gdof,ldof_to_owner,indices
-    me = part_id(indices)
+
+  # Exchange the dof owners. Cell owner always has correct dof owner.
+  cell_ldofs_to_owner = dof_wise_to_cell_wise!(
+    cell_ldofs_to_data,ldof_to_owner,cell_to_ldofs,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_ldofs_to_owner,cache_fetch) |> wait
+  cell_wise_to_dof_wise!(
+    ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range)
+  )
+
+  # Fill owned gids
+  ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner
     offset = first_gdof-1
-    ldof_to_gdof = Vector{Int}(undef,length(ldof_to_owner))
+    ldof_to_gdof = zeros(Int,length(ldof_to_owner))
     odof = 0
     for (ldof,owner) in enumerate(ldof_to_owner)
-      if owner == me
+      if owner == rank
         odof += 1
-        ldof_to_gdof[ldof] = odof
-      else
-        ldof_to_gdof[ldof] = 0
-      end
-    end
-    for (ldof,owner) in enumerate(ldof_to_owner)
-      if owner == me
-        ldof_to_gdof[ldof] += offset
+        ldof_to_gdof[ldof] = odof + offset
       end
     end
     ldof_to_gdof
   end
 
-  # Create cell-wise global dofs
-  cell_to_gdofs = dof_wise_to_cell_wise(ldof_to_gdof,
-                                        cell_to_ldofs,
-                                        cell_range)
-
-  # Exchange the global dofs
+  # Exchange gids
+  cell_to_gdofs = dof_wise_to_cell_wise!(
+    cell_ldofs_to_data,ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range)
+  )
   fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
 
-  # Distribute global dof ids also to ghost
-  map(cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices
-    gdof = 0
+  # Fill ghost gids with exchanged information
+  map(
+    cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)
+  ) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices
     cache = array_cache(cell_to_ldofs)
-    cell_ghost_to_local = ghost_to_local(indices)
-    cell_local_to_owner = local_to_owner(indices)
-    for cell in cell_ghost_to_local
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
       p = cell_to_gdofs.ptrs[cell]-1
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      cell_owner = lcell_to_owner[cell]
       for (i,ldof) in enumerate(ldofs)
-        if ldof > 0 && ldof_to_owner[ldof] == cell_local_to_owner[cell]
+        if (ldof > 0) && isequal(ldof_to_owner[ldof],cell_owner)
           ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
         end
       end
     end
   end
 
-  dof_wise_to_cell_wise!(cell_to_gdofs,
-                         ldof_to_gdof,
-                         cell_to_ldofs,
-                         cell_range)
-
+  dof_wise_to_cell_wise!(cell_to_gdofs,ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range))
   fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
-
-  cell_wise_to_dof_wise!(ldof_to_gdof,
-                         cell_to_gdofs,
-                         cell_to_ldofs,
-                         cell_range)
+  cell_wise_to_dof_wise!(ldof_to_gdof,cell_to_gdofs,cell_to_ldofs,ghost_to_local(cell_range))
 
   # Setup DoFs LocalIndices
   ngdofs = reduction(+,nodofs,destination=:all,init=zero(eltype(nodofs)))
-  local_indices = map(ngdofs,partition(cell_range),ldof_to_gdof,ldof_to_owner) do ngdofs,
-                                                                                  indices,
-                                                                                  ldof_to_gdof,
-                                                                                  ldof_to_owner
-     me = part_id(indices)
-     LocalIndices(ngdofs,me,ldof_to_gdof,ldof_to_owner)
-  end
+  local_indices = map(LocalIndices,ngdofs,ranks,ldof_to_gdof,ldof_to_owner)
 
-  # Setup dof range
-  dofs_range = PRange(local_indices)
-
-  return dofs_range
+  return PRange(local_indices)
 end
 
 # FEFunction related
+
 """
 """
 struct DistributedFEFunctionData{T<:AbstractVector} <: GridapType

From 1d3cf84a4fd71ca5523b6a10d4242d3c39b5b319 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Thu, 18 Sep 2025 11:18:19 +1000
Subject: [PATCH 16/48] Added generate_posneg_gids

---
 src/FESpaces.jl | 148 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 22046bfa..4d61bd62 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -83,6 +83,24 @@ function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,c
   return cell_wise_vector
 end
 
+function posneg_wise_to_cell_wise!(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids)
+  map(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids) do cwv,pwv,nwv,cell_to_posneg,cell_ids
+    cache  = array_cache(cell_to_posneg)
+    for cell in cell_ids
+      lids = getindex!(cache,cell_to_posneg,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,lid) in enumerate(lids)
+        if lid > 0
+          cwv.data[i+p] = pwv[lid]
+        elseif lid < 0
+          cwv.data[i+p] = nwv[-lid]
+        end
+      end
+    end
+  end
+  return cell_wise_vector
+end
+
 function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids)
   map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids
     cache = array_cache(cell_to_ldofs)
@@ -99,6 +117,24 @@ function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,c
   return dof_wise_vector
 end
 
+function cell_wise_to_posneg_wise!(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids)
+  map(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids) do pwv,nwv,cwv,cell_to_posneg,cell_ids
+    cache = array_cache(cell_to_posneg)
+    for cell in cell_ids
+      lids = getindex!(cache,cell_to_posneg,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,lid) in enumerate(lids)
+        if lid > 0
+          pwv[lid] = cwv.data[i+p]
+        elseif lid < 0
+          nwv[-lid] = cwv.data[i+p]
+        end
+      end
+    end
+  end
+  return pos_wise_vector, neg_wise_vector
+end
+
 function allocate_cell_wise_vector(T, cell_to_lids)
   map(cell_to_lids) do cell_to_lids
     ptrs = Arrays.generate_ptrs(cell_to_lids)
@@ -212,6 +248,118 @@ function generate_gids(
   return PRange(local_indices)
 end
 
+function generate_posneg_gids(
+  cell_range::PRange,
+  cell_to_lposneg::AbstractArray{<:AbstractArray},
+  nlpos::AbstractArray{<:Integer},
+  nlneg::AbstractArray{<:Integer}
+)
+  ranks = linear_indices(partition(cell_range))
+  cell_lids_to_data = allocate_cell_wise_vector(Int, cell_to_lposneg)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_lids_to_data,partition(cell_range))
+
+  # Find and count number owned dofs
+  lpos_to_owner, lneg_to_owner, nopos, noneg = map(
+    partition(cell_range),cell_to_lposneg,nlpos,nlneg
+  ) do indices,cell_to_lposneg,nlpos,nlneg
+    lpos_to_owner = fill(zero(Int32),nlpos)
+    lneg_to_owner = fill(zero(Int32),nlneg)
+    cache = array_cache(cell_to_lposneg)
+    for (cell, owner) in enumerate(local_to_owner(indices))
+      lids = getindex!(cache,cell_to_lposneg,cell)
+      for lid in lids
+        if lid > 0
+          lpos_to_owner[lid] = max(owner,lpos_to_owner[lid])
+        elseif lid < 0
+          lneg_to_owner[-lid] = max(owner,lneg_to_owner[-lid])
+        end
+      end
+    end
+    rank = part_id(indices)
+    nopos = count(isequal(rank),lpos_to_owner)
+    noneg = count(isequal(rank),lneg_to_owner)
+    return lpos_to_owner, lneg_to_owner, nopos, noneg
+  end |> tuple_of_arrays
+
+  # Find the global range of owned dofs
+  first_gpos = scan(+,nopos,type=:exclusive,init=1)
+  first_gneg = scan(+,noneg,type=:exclusive,init=1)
+
+  # Exchange the dof owners. Cell owner always has correct dof owner.
+  cell_ldofs_to_owner = posneg_wise_to_cell_wise!(
+    cell_lids_to_data,lpos_to_owner,lneg_to_owner,cell_to_lposneg,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_ldofs_to_owner,cache_fetch) |> wait
+  cell_wise_to_posneg_wise!(
+    lpos_to_owner,lneg_to_owner,cell_ldofs_to_owner,cell_to_lposneg,ghost_to_local(cell_range)
+  )
+
+  # Fill owned gids
+  lpos_to_gpos = map(ranks,first_gpos,lpos_to_owner) do rank,first_gpos,lpos_to_owner
+    offset = first_gpos-1
+    lpos_to_gpos = zeros(Int,length(lpos_to_owner))
+    opos = 0
+    for (lpos,owner) in enumerate(lpos_to_owner)
+      if owner == rank
+        opos += 1
+        lpos_to_gpos[lpos] = opos + offset
+      end
+    end
+    lpos_to_gpos
+  end
+
+  lneg_to_gneg = map(ranks,first_gneg,lneg_to_owner) do rank,first_gneg,lneg_to_owner
+    offset = first_gneg-1
+    lneg_to_gneg = zeros(Int,length(lneg_to_owner))
+    oneg = 0
+    for (lneg,owner) in enumerate(lneg_to_owner)
+      if owner == rank
+        oneg += 1
+        lneg_to_gneg[lneg] = oneg + offset
+      end
+    end
+    lneg_to_gneg
+  end
+
+  # Exchange gids
+  cell_to_gposneg = posneg_wise_to_cell_wise!(
+    cell_lids_to_data,lpos_to_gpos,lneg_to_gneg,cell_to_lposneg,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
+
+  # Fill ghost gids with exchanged information
+  map(
+    cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,partition(cell_range)
+  ) do cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,indices
+    cache = array_cache(cell_to_lposneg)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
+      p = cell_to_gposneg.ptrs[cell]-1
+      lids = getindex!(cache,cell_to_lposneg,cell)
+      cell_owner = lcell_to_owner[cell]
+      for (i,lid) in enumerate(lids)
+        if (lid > 0) && isequal(lpos_to_owner[lid],cell_owner)
+          lpos_to_gpos[lid] = cell_to_gposneg.data[i+p]
+        elseif (lid < 0) && isequal(lneg_to_owner[-lid],cell_owner)
+          lneg_to_gneg[-lid] = cell_to_gposneg.data[i+p]
+        end
+      end
+    end
+  end
+
+  posneg_wise_to_cell_wise!(cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,cell_to_lposneg,own_to_local(cell_range))
+  fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
+  cell_wise_to_posneg_wise!(lpos_to_gpos,lneg_to_gneg,cell_to_gposneg,cell_to_lposneg,ghost_to_local(cell_range))
+
+  # Setup DoFs LocalIndices
+  ngpos = reduction(+,nopos,destination=:all,init=0)
+  ngneg = reduction(+,noneg,destination=:all,init=0)
+  local_indices_pos = map(LocalIndices,ngpos,ranks,lpos_to_gpos,lpos_to_owner)
+  local_indices_neg = map(LocalIndices,ngneg,ranks,lneg_to_gneg,lneg_to_owner)
+
+  return PRange(local_indices_pos), PRange(local_indices_neg)
+end
+
 # FEFunction related
 
 """

From 1dba39f4b542e297c59c13e808c152be31ff828f Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Sat, 13 Dec 2025 14:38:34 +1100
Subject: [PATCH 17/48] Added prange splitting by color

---
 src/FESpaces.jl | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 4d61bd62..639b6c25 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -360,6 +360,71 @@ function generate_posneg_gids(
   return PRange(local_indices_pos), PRange(local_indices_neg)
 end
 
+function split_gids_by_color(
+  gids::PRange, lid_to_color::AbstractArray, 
+  ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
+)
+  color_to_nlids, color_to_noids = map(partition(gids), lid_to_color) do ids, lid_to_color
+    rank = part_id(ids)
+    lid_to_owner = local_to_owner(ids)
+    color_to_nlids = zeros(Int,ncolors)
+    color_to_noids = zeros(Int,ncolors)
+    for (owner,color) in zip(lid_to_owner,lid_to_color)
+      color_to_nlids[color] += 1
+      color_to_noids[color] += isequal(owner,rank)
+    end
+    return color_to_nlids, color_to_noids
+  end |> tuple_of_arrays
+
+  # I wish we could reduce arrays directly...
+  color_to_fgid = map(1:ncolors) do c
+    scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
+  end |> to_parray_of_arrays
+  color_to_ngids = map(1:ncolors) do c
+    reduction(+,map(Base.Fix2(getindex,c), color_to_noids); destination=:all, init=0)
+  end |> to_parray_of_arrays
+
+  lid_to_cgid = map(partition(gids), lid_to_color, color_to_fgid) do ids, lid_to_color, color_to_fgid
+    rank = part_id(ids)
+    lid_to_cgid = zeros(Int,length(lid_to_color))
+    color_to_offset = zeros(Int,ncolors)
+    for (lid,(owner,color)) in enumerate(zip(local_to_owner(ids),lid_to_color))
+      if isequal(owner,rank)
+        lid_to_cgid[lid] = color_to_fgid[color] + color_to_offset[color]
+        color_to_offset[color] += 1
+      end
+    end
+    return lid_to_cgid
+  end
+  consistent!(PVector(lid_to_cgid,partition(gids))) |> wait
+
+  color_to_gids = map(
+    partition(gids), lid_to_color, lid_to_cgid, color_to_nlids, color_to_ngids
+  ) do ids, lid_to_color, lid_to_cgid, color_to_nlids, color_to_ngids
+    color_to_lid_to_gid = [zeros(Int, n) for n in color_to_nlids]
+    color_to_lid_to_owner = [zeros(Int32, n) for n in color_to_nlids]
+    offsets = zeros(Int, ncolors)
+    for (color, cgid, owner) in zip(lid_to_color, lid_to_cgid, local_to_owner(ids))
+      offsets[color] += 1
+      lid = offsets[color]
+      color_to_lid_to_gid[color][lid] = cgid
+      color_to_lid_to_owner[color][lid] = owner
+    end
+
+    color_to_ids = ()
+    for color in 1:ncolors
+      cids = LocalIndices(
+        color_to_ngids[color], part_id(ids), 
+        color_to_lid_to_gid[color], color_to_lid_to_owner[color]
+      )
+      color_to_ids = (color_to_ids..., cids)
+    end
+    return color_to_ids
+  end |> tuple_of_arrays
+
+  return map(PRange,color_to_gids)
+end
+
 # FEFunction related
 
 """

From b0f61c2d3aab141fd73e4a846e9bb0ef7699b420 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Sun, 14 Dec 2025 11:39:37 +1100
Subject: [PATCH 18/48] Generate gids from colored lids

---
 src/FESpaces.jl | 130 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 639b6c25..d4fe80d4 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -360,6 +360,136 @@ function generate_posneg_gids(
   return PRange(local_indices_pos), PRange(local_indices_neg)
 end
 
+function generate_gids_by_color(
+  cell_range::PRange, 
+  cell_to_lids::AbstractArray{<:AbstractArray},
+  lid_to_color::AbstractArray,
+  ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
+)
+  cell_lids_to_data = allocate_cell_wise_vector(Int, cell_to_lids)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_lids_to_data,partition(cell_range))
+
+  lid_to_owner, color_to_noids = map(
+    partition(cell_range),cell_to_lids,lid_to_color
+  ) do indices, cell_to_lids, lid_to_color
+    lid_to_owner = fill(zero(Int32),length(lid_to_color))
+    cache = array_cache(cell_to_lids)
+    for (cell, owner) in enumerate(local_to_owner(indices))
+      lids = getindex!(cache,cell_to_lids,cell)
+      for lid in lids
+        (lid < 0) && continue
+        lid_to_owner[lid] = max(owner,lid_to_owner[lid])
+      end
+    end
+
+    rank = part_id(indices)
+    color_to_noids = zeros(Int,ncolors)
+    for (owner, color) in zip(lid_to_owner,lid_to_color)
+      color_to_noids[color] += isequal(owner,rank)
+    end
+
+    return lid_to_owner, color_to_noids
+  end |> tuple_of_arrays
+
+  # Find first gid per color
+  color_to_fgid = map(1:ncolors) do c
+    scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
+  end |> to_parray_of_arrays
+
+  # Start exchanging dof owners
+  cell_lids_to_owner = dof_wise_to_cell_wise!(
+    cell_lids_to_data,lid_to_owner,cell_to_lids,own_to_local(cell_range)
+  )
+  t1 = fetch_vector_ghost_values!(cell_lids_to_owner,cache_fetch)
+
+  # Note: lid_to_owner is still not consistent, but owned data is correct
+  lid_to_gid, color_to_clid_to_lid = map(
+    partition(cell_range), lid_to_color, lid_to_owner, color_to_fgid
+  ) do indices, lid_to_color, lid_to_owner, color_to_fgid
+    
+    # Count number of dofs per color
+    rank = part_id(indices)
+    color_to_nldofs = zeros(Int,ncolors)
+    lid_to_clid = zeros(Int32,length(lid_to_color))
+    lid_to_gid = zeros(Int,length(lid_to_color))
+    for (lid, (color, owner)) in enumerate(zip(lid_to_color,lid_to_owner))
+      color_to_nldofs[color] += 1
+      lid_to_clid[lid] = color_to_nldofs[color]
+      if isequal(owner,rank)
+        lid_to_gid[lid] = color_to_fgid[color]
+        color_to_fgid[color] += 1
+      end
+    end
+
+    # Find clid to lid mapping
+    color_to_clid_to_lid = [fill(zero(Int32),n) for n in color_to_nldofs]
+    for (lid, (color, clid)) in enumerate(zip(lid_to_color,lid_to_clid))
+      color_to_clid_to_lid[color][clid] = lid
+    end
+    return lid_to_gid, color_to_clid_to_lid
+  end |> tuple_of_arrays
+
+  # Finish exchanging the dof owners.
+  wait(t1)
+  cell_wise_to_dof_wise!(
+    lid_to_owner,cell_lids_to_owner,cell_to_lids,ghost_to_local(cell_range)
+  )
+
+  # Start exchanging gids
+  cell_to_gids = dof_wise_to_cell_wise!(
+    cell_lids_to_data,lid_to_gid,cell_to_lids,own_to_local(cell_range)
+  )
+  t2 = fetch_vector_ghost_values!(cell_to_gids,cache_fetch)
+
+  # Find the global range of owned dofs
+  color_to_ngids = map(1:ncolors) do c
+    reduction(+,map(Base.Fix2(getindex,c), color_to_noids); destination=:all, init=0)
+  end |> to_parray_of_arrays
+
+  # Finish exchanging the gids.
+  wait(t2)
+  map(
+    cell_to_lids,cell_to_gids,lid_to_gid,lid_to_owner,partition(cell_range)
+  ) do cell_to_lids,cell_to_gids,lid_to_gid,lid_to_owner,indices
+    cache = array_cache(cell_to_lids)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
+      p = cell_to_gids.ptrs[cell]-1
+      lids = getindex!(cache,cell_to_lids,cell)
+      cell_owner = lcell_to_owner[cell]
+      for (i,lid) in enumerate(lids)
+        if (lid > 0) && isequal(lid_to_owner[lid],cell_owner)
+          lid_to_gid[lid] = cell_to_gids.data[i+p]
+        end
+      end
+    end
+  end
+
+  dof_wise_to_cell_wise!(cell_to_gids,lid_to_gid,cell_to_lids,own_to_local(cell_range))
+  fetch_vector_ghost_values!(cell_to_gids,cache_fetch) |> wait
+  cell_wise_to_dof_wise!(lid_to_gid,cell_to_gids,cell_to_lids,ghost_to_local(cell_range))
+
+  # Setup DoFs LocalIndices per color
+  color_to_indices = map(
+    partition(cell_range), lid_to_color, lid_to_gid, lid_to_owner, color_to_clid_to_lid, color_to_ngids
+  ) do indices, lid_to_color, lid_to_gid, lid_to_owner, color_to_clid_to_lid, color_to_ngids
+    rank = part_id(indices)
+
+    color_to_indices = ()
+    for c in 1:ncolors
+      cids = LocalIndices(
+        color_to_ngids[c], rank, 
+        lid_to_gid[color_to_clid_to_lid[c]], 
+        lid_to_owner[color_to_clid_to_lid[c]]
+      )
+      color_to_indices = (color_to_indices..., cids)
+    end
+    return color_to_indices
+  end |> tuple_of_arrays
+
+  return map(PRange,color_to_indices)
+end
+
 function split_gids_by_color(
   gids::PRange, lid_to_color::AbstractArray, 
   ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))

From ef98aecfdc1c09ce6d7435c1f42bd53ad59d2742 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Sun, 14 Dec 2025 21:17:29 +1100
Subject: [PATCH 19/48] Minor

---
 src/FESpaces.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index d4fe80d4..b951b44c 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -403,7 +403,7 @@ function generate_gids_by_color(
   t1 = fetch_vector_ghost_values!(cell_lids_to_owner,cache_fetch)
 
   # Note: lid_to_owner is still not consistent, but owned data is correct
-  lid_to_gid, color_to_clid_to_lid = map(
+  lid_to_gid, lid_to_clid, color_to_clid_to_lid = map(
     partition(cell_range), lid_to_color, lid_to_owner, color_to_fgid
   ) do indices, lid_to_color, lid_to_owner, color_to_fgid
     
@@ -426,7 +426,7 @@ function generate_gids_by_color(
     for (lid, (color, clid)) in enumerate(zip(lid_to_color,lid_to_clid))
       color_to_clid_to_lid[color][clid] = lid
     end
-    return lid_to_gid, color_to_clid_to_lid
+    return lid_to_gid, lid_to_clid, color_to_clid_to_lid
   end |> tuple_of_arrays
 
   # Finish exchanging the dof owners.
@@ -487,7 +487,7 @@ function generate_gids_by_color(
     return color_to_indices
   end |> tuple_of_arrays
 
-  return map(PRange,color_to_indices)
+  return map(PRange,color_to_indices), lid_to_clid, color_to_clid_to_lid
 end
 
 function split_gids_by_color(

From eb63cb9138421c77e994c9f831e4ae07a9248bc2 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Mon, 15 Dec 2025 08:42:27 +1100
Subject: [PATCH 20/48] Added documentation

---
 src/FESpaces.jl       | 33 +++++++++++++++++++++++++++++++++
 test/FESpacesTests.jl | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index b951b44c..73ede927 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -161,6 +161,15 @@ function fetch_vector_ghost_values!(vector_partition,cache)
   assemble!((a,b)->b, vector_partition, cache) 
 end
 
+"""
+    generate_gids(cell_gids::PRange, cell_to_lids, nlids) -> PRange
+
+Given a set of cell global ids, a distributed array of local tables mapping cells to local dof ids,
+and a distributed array with the number of local dofs in each partition, this function 
+generates the global dof ids and returns them as a `PRange` of `LocalIndices`.
+
+Ignores negative local dof ids (usually used for Dirichlet dofs).
+"""
 function generate_gids(
   cell_range::PRange,
   cell_to_ldofs::AbstractArray{<:AbstractArray},
@@ -248,6 +257,13 @@ function generate_gids(
   return PRange(local_indices)
 end
 
+"""
+    generate_posneg_gids(cell_gids::PRange, cell_to_lposneg, nlpos, nlneg) -> (PRange,PRange)
+
+Similar to `generate_gids`, but also handles negative local dof ids. Returns two sets of 
+global ids: one for positive local ids and another for negative local ids.
+This can be used to generate simultaneously free and dirichlet dof global ids.
+"""
 function generate_posneg_gids(
   cell_range::PRange,
   cell_to_lposneg::AbstractArray{<:AbstractArray},
@@ -360,6 +376,16 @@ function generate_posneg_gids(
   return PRange(local_indices_pos), PRange(local_indices_neg)
 end
 
+"""
+    generate_gids_by_color(cell_gids::PRange, cell_to_lids, lid_to_color, ncolors)
+
+Similar to `generate_gids`, but uses a global partition given by `lid_to_color` to generate
+a different set of global ids per color. Returns:
+
+- a tuple with a `PRange` of `LocalIndices` per color.
+- a mapping `lid_to_clid` that maps local ids to local color ids
+- a mapping `color_to_clid_to_lid` that maps, for each color, local color ids to local ids.
+"""
 function generate_gids_by_color(
   cell_range::PRange, 
   cell_to_lids::AbstractArray{<:AbstractArray},
@@ -490,6 +516,13 @@ function generate_gids_by_color(
   return map(PRange,color_to_indices), lid_to_clid, color_to_clid_to_lid
 end
 
+"""
+    split_gids_by_color(gids::PRange, lid_to_color::AbstractArray, ncolors) -> NTuple{ncolors,PRange}
+
+Given a set of global ids and a mapping from local ids to colors, this function splits
+the global ids into different sets of global ids per color. Returns a tuple with a `PRange`
+of `LocalIndices` per color.
+"""
 function split_gids_by_color(
   gids::PRange, lid_to_color::AbstractArray, 
   ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
diff --git a/test/FESpacesTests.jl b/test/FESpacesTests.jl
index bf1e78e6..9a221202 100644
--- a/test/FESpacesTests.jl
+++ b/test/FESpacesTests.jl
@@ -9,6 +9,44 @@ using Test
 
 u((x,y)) = x+y
 
+function gid_generation_tests(distribute,parts)
+  ranks = distribute(LinearIndices((prod(parts),)))
+
+  model = CartesianDiscreteModel(ranks,parts,(0,1,0,1),(8,4))
+  reffe = ReferenceFE(lagrangian,Float64,1)
+  cgids = GridapDistributed.get_cell_gids(model)
+
+  V0 = FESpace(model,reffe)
+  I0 = partition(get_free_dof_ids(V0))
+
+  V1 = FESpace(model,reffe;dirichlet_tags=["tag_1","tag_2","tag_3","tag_4","tag_5","tag_6"])
+  I1 = partition(get_free_dof_ids(V1))
+
+  fgids_1, dgids_1 = GridapDistributed.generate_posneg_gids(
+    cgids,get_cell_dof_ids(V1),map(num_free_dofs,local_views(V1)),map(num_dirichlet_dofs,local_views(V1))
+  )
+
+  # 1 -> free, 2 -> dirichlet
+  lid_to_color = map(local_views(V0),local_views(V1)) do V0, V1
+    lid_to_color = zeros(Int16,num_free_dofs(V0))
+    for (I,J) in zip(get_cell_dof_ids(V0),get_cell_dof_ids(V1))
+      lid_to_color[I] .= map(j -> ifelse(j > 0, 1, 2), J)
+    end
+    return lid_to_color
+  end
+
+  fgids_2, dgids_2 = GridapDistributed.split_gids_by_color(
+    get_free_dof_ids(V0), lid_to_color
+  )
+
+  (fgids_3, dgids_3), _, _ = GridapDistributed.generate_gids_by_color(
+    cgids, get_cell_dof_ids(V0), lid_to_color
+  )
+
+  @test fgids_1 == fgids_2 == fgids_3
+  @test dgids_1 == dgids_2 == dgids_3
+end
+
 function assemble_tests(das,dΩ,dΩass,U,V)
   # Assembly
   dv = get_fe_basis(V)

From 6ed24a5a54a82ea3105a426595b5717bbc93c0c5 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Thu, 18 Dec 2025 11:08:55 +1100
Subject: [PATCH 21/48] Minor

---
 src/FESpaces.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 73ede927..da59f608 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -1138,7 +1138,7 @@ function FESpaces.ZeroMeanFESpace(space::DistributedSingleFieldFESpace,dΩ::Dist
     vol  = sum(dvol)
     return vol, dvol
   end |> tuple_of_arrays
-  vol  = reduce(+,_vol,init=zero(eltype(vol)))
+  vol = reduce(+,_vol,init=zero(eltype(_vol)))
   metadata = DistributedZeroMeanCache(dvol,vol)
 
   return DistributedSingleFieldFESpace(

From 3ccf13c610c10167f3d4295fae6c7f40f19a901e Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Wed, 13 May 2026 16:20:21 +1000
Subject: [PATCH 22/48] Minor

---
 .gitignore | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4f9205f8..e42680c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,13 +2,20 @@
 *.jl.cov
 *.jl.mem
 .DS_Store
-/Manifest.toml
-/LocalPreferences.toml
+
+Manifest.toml
+LocalPreferences.toml
+
 /dev/
 /docs/build/
 /docs/site/
-/docs/Manifest.toml
+
 /tmp/
+
 *.vtu
 *.pvtu
 *.pvd
+
+CLAUDE.md
+AGENTS.md
+

From f2e583e2d3c4d4c20653aa9058e4efc3ccb63e07 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Wed, 13 May 2026 17:07:03 +1000
Subject: [PATCH 23/48] Fixes from pulling master

---
 src/Adaptivity.jl                 |  4 ++--
 src/GridapDistributed.jl          |  2 +-
 test/AdaptivityCartesianTests.jl  |  2 +-
 test/AdaptivityMultiFieldTests.jl | 10 +++++-----
 test/AdaptivityTests.jl           | 10 +++++-----
 test/PullbackTests.jl             |  2 +-
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/Adaptivity.jl b/src/Adaptivity.jl
index a9e798c4..05d285ab 100644
--- a/src/Adaptivity.jl
+++ b/src/Adaptivity.jl
@@ -297,8 +297,8 @@ function get_cartesian_redistribute_glue(
       # If I'm in the old subprocessor,
       #   - I send all owned old cells that I don't own in the new model.
       #   - I receive all owned new cells that I don't own in the old model.
-      old2new = replace(find_local_to_local_map(old_ids,new_ids), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_ids,old_ids), -1 => 0)
+      old2new = replace(local_to_local_map(old_ids,new_ids), -1 => 0)
+      new2old = replace(local_to_local_map(new_ids,old_ids), -1 => 0)
 
       new_l2o = local_to_own(new_ids)
       old_l2o = local_to_own(old_ids)
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index 76e956d1..a1daddea 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -34,6 +34,7 @@ import Gridap.Fields: grad2curl
 import Gridap.CellData: Interpolable
 
 export LocallyAssembled
+export SubAssembled
 export Assembled
 
 export get_cell_gids
@@ -49,7 +50,6 @@ include("PArraysExtras.jl")
 include("BlockPartitionedArrays.jl")
 
 include("GridapExtras.jl")
-include("PArraysExtras.jl")
 
 include("Algebra.jl")
 
diff --git a/test/AdaptivityCartesianTests.jl b/test/AdaptivityCartesianTests.jl
index b35ae70d..fde21bb1 100644
--- a/test/AdaptivityCartesianTests.jl
+++ b/test/AdaptivityCartesianTests.jl
@@ -11,7 +11,7 @@ using GridapDistributed
 using PartitionedArrays
 
 using GridapDistributed: i_am_in, generate_subparts
-using GridapDistributed: find_local_to_local_map
+using GridapDistributed: local_to_local_map
 using GridapDistributed: DistributedAdaptedDiscreteModel, redistribute, redistribute_cartesian
 using GridapDistributed: RedistributeGlue, redistribute_cell_dofs, redistribute_fe_function, redistribute_free_values
 
diff --git a/test/AdaptivityMultiFieldTests.jl b/test/AdaptivityMultiFieldTests.jl
index 5596d5d9..51c90d2e 100644
--- a/test/AdaptivityMultiFieldTests.jl
+++ b/test/AdaptivityMultiFieldTests.jl
@@ -11,7 +11,7 @@ using GridapDistributed
 using PartitionedArrays
 
 using GridapDistributed: i_am_in, generate_subparts
-using GridapDistributed: find_local_to_local_map
+using GridapDistributed: local_to_local_map
 using GridapDistributed: DistributedAdaptedDiscreteModel
 using GridapDistributed: RedistributeGlue, redistribute_cell_dofs, redistribute_fe_function, redistribute_free_values
 
@@ -58,8 +58,8 @@ function get_redistribute_glue(old_parts,new_parts::DebugArray,old_cell_to_part,
       old_partition = partition(get_cell_gids(model)).items[p]
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> lazy_map(Reindex(global_to_local(old_partition)),gids),gids_snd_by_part)
-      old2new = replace(find_local_to_local_map(old_partition,new_partition), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_partition,old_partition), -1 => 0)
+      old2new = replace(local_to_local_map(old_partition,new_partition), -1 => 0)
+      new2old = replace(local_to_local_map(new_partition,old_partition), -1 => 0)
     else
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> fill(Int32(0),length(gids)),gids_snd_by_part)
@@ -90,8 +90,8 @@ function get_redistribute_glue(old_parts,new_parts::MPIArray,old_cell_to_part,ne
       old_partition = PartitionedArrays.getany(partition(get_cell_gids(model)))
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> lazy_map(Reindex(global_to_local(old_partition)),gids),gids_snd_by_part)
-      old2new = replace(find_local_to_local_map(old_partition,new_partition), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_partition,old_partition), -1 => 0)
+      old2new = replace(local_to_local_map(old_partition,new_partition), -1 => 0)
+      new2old = replace(local_to_local_map(new_partition,old_partition), -1 => 0)
     else
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> fill(Int32(0),length(gids)),gids_snd_by_part)
diff --git a/test/AdaptivityTests.jl b/test/AdaptivityTests.jl
index 3d35263b..3bde688a 100644
--- a/test/AdaptivityTests.jl
+++ b/test/AdaptivityTests.jl
@@ -11,7 +11,7 @@ using GridapDistributed
 using PartitionedArrays
 
 using GridapDistributed: i_am_in, generate_subparts
-using GridapDistributed: find_local_to_local_map
+using GridapDistributed: local_to_local_map
 using GridapDistributed: DistributedAdaptedDiscreteModel
 using GridapDistributed: RedistributeGlue, redistribute_cell_dofs, redistribute_fe_function, redistribute_free_values
 
@@ -58,8 +58,8 @@ function get_redistribute_glue(old_parts,new_parts::DebugArray,old_cell_to_part,
       old_partition = partition(get_cell_gids(model)).items[p]
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> lazy_map(Reindex(global_to_local(old_partition)),gids),gids_snd_by_part)
-      old2new = replace(find_local_to_local_map(old_partition,new_partition), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_partition,old_partition), -1 => 0)
+      old2new = replace(local_to_local_map(old_partition,new_partition), -1 => 0)
+      new2old = replace(local_to_local_map(new_partition,old_partition), -1 => 0)
     else
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> fill(Int32(0),length(gids)),gids_snd_by_part)
@@ -90,8 +90,8 @@ function get_redistribute_glue(old_parts,new_parts::MPIArray,old_cell_to_part,ne
       old_partition = PartitionedArrays.getany(partition(get_cell_gids(model)))
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> lazy_map(Reindex(global_to_local(old_partition)),gids),gids_snd_by_part)
-      old2new = replace(find_local_to_local_map(old_partition,new_partition), -1 => 0)
-      new2old = replace(find_local_to_local_map(new_partition,old_partition), -1 => 0)
+      old2new = replace(local_to_local_map(old_partition,new_partition), -1 => 0)
+      new2old = replace(local_to_local_map(new_partition,old_partition), -1 => 0)
     else
       lids_rcv = map(gids -> lazy_map(Reindex(global_to_local(new_partition)),gids),gids_rcv_by_part)
       lids_snd = map(gids -> fill(Int32(0),length(gids)),gids_snd_by_part)
diff --git a/test/PullbackTests.jl b/test/PullbackTests.jl
index 3c12f37d..3e970819 100644
--- a/test/PullbackTests.jl
+++ b/test/PullbackTests.jl
@@ -116,7 +116,7 @@ function main(distribute,nranks)
 
   model = GridapDistributed.DistributedDiscreteModel(models,gids)
 
-  das = FullyAssembledRows()
+  das = LocallyAssembled()
   trian = Triangulation(das,model)
 
   reffe = ReferenceFE(raviart_thomas,Float64,0)

From 52e7af13320b9665dbe69e8917764534f42d0d7c Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Wed, 13 May 2026 17:10:08 +1000
Subject: [PATCH 24/48] Minor

---
 src/FESpaces.jl | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 7718b4bf..fc078032 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -474,32 +474,6 @@ end
 
 # Factories
 
-# function FESpaces.FESpace(
-#   model::DistributedDiscreteModel,reffe;split_own_and_ghost=false,constraint=nothing,kwargs...
-# )
-#   spaces = map(local_views(model)) do m
-#     FESpace(m,reffe;kwargs...)
-#   end
-#   gids =  generate_gids(model,spaces)
-#   trian = DistributedTriangulation(map(get_triangulation,spaces),model)
-#   vector_type = _find_vector_type(spaces,gids;split_own_and_ghost=split_own_and_ghost)
-#   space = DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
-#   return _add_distributed_constraint(space,reffe,constraint)
-# end
-# 
-# function FESpaces.FESpace(
-#   _trian::DistributedTriangulation,reffe;split_own_and_ghost=false,constraint=nothing,kwargs...
-# )
-#   trian = add_ghost_cells(_trian)
-#   spaces = map(local_views(trian)) do t
-#     FESpace(t,reffe;kwargs...)
-#   end
-#   gids = generate_gids(trian,spaces)
-#   vector_type = _find_vector_type(spaces,gids;split_own_and_ghost=split_own_and_ghost)
-#   space = DistributedSingleFieldFESpace(spaces,gids,trian,vector_type)
-#   return _add_distributed_constraint(space,reffe,constraint)
-# end
-
 function FESpaces.FESpace(
   model::DistributedDiscreteModel,args...;kwargs...
 )

From f010f78912b21927c03ff0fa3f53579fccded29c Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Wed, 13 May 2026 18:52:37 +1000
Subject: [PATCH 25/48] Make sure we consistently return PermutedLocalIndices

---
 src/Algebra.jl       |  8 ++---
 src/FESpaces.jl      | 86 +++++++++++++++++---------------------------
 src/PArraysExtras.jl | 18 +++-------
 3 files changed, 40 insertions(+), 72 deletions(-)

diff --git a/src/Algebra.jl b/src/Algebra.jl
index bdfb46bc..5b7b11e3 100644
--- a/src/Algebra.jl
+++ b/src/Algebra.jl
@@ -68,11 +68,9 @@ end
 # change_ghost
 
 function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T
-  if (a.index_partition === partition(ids))
-    return a
-  end
-  a_new = change_ghost(T,a,ids)
-  if make_consistent && !is_consistent
+  same_partition = (a.index_partition === partition(ids))
+  a_new = same_partition ? a : change_ghost(T,a,ids)
+  if make_consistent && (!same_partition || !is_consistent)
     consistent!(a_new) |> wait
   end
   return a_new
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 80e0ef16..4ccbb800 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -166,7 +166,7 @@ end
 
 Given a set of cell global ids, a distributed array of local tables mapping cells to local dof ids,
 and a distributed array with the number of local dofs in each partition, this function 
-generates the global dof ids and returns them as a `PRange` of `LocalIndices`.
+generates the global dof ids and returns them as a `PRange` of `PermutedLocalIndices`.
 
 Ignores negative local dof ids (usually used for Dirichlet dofs).
 """
@@ -251,8 +251,9 @@ function generate_gids(
   cell_wise_to_dof_wise!(ldof_to_gdof,cell_to_gdofs,cell_to_ldofs,ghost_to_local(cell_range))
 
   # Setup DoFs LocalIndices
-  ngdofs = reduction(+,nodofs,destination=:all,init=zero(eltype(nodofs)))
-  local_indices = map(LocalIndices,ngdofs,ranks,ldof_to_gdof,ldof_to_owner)
+  local_indices = permuted_variable_partition(
+    nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof
+  )
 
   return PRange(local_indices)
 end
@@ -368,10 +369,8 @@ function generate_posneg_gids(
   cell_wise_to_posneg_wise!(lpos_to_gpos,lneg_to_gneg,cell_to_gposneg,cell_to_lposneg,ghost_to_local(cell_range))
 
   # Setup DoFs LocalIndices
-  ngpos = reduction(+,nopos,destination=:all,init=0)
-  ngneg = reduction(+,noneg,destination=:all,init=0)
-  local_indices_pos = map(LocalIndices,ngpos,ranks,lpos_to_gpos,lpos_to_owner)
-  local_indices_neg = map(LocalIndices,ngneg,ranks,lneg_to_gneg,lneg_to_owner)
+  local_indices_pos = permuted_variable_partition(nopos,lpos_to_gpos,lpos_to_owner;start=first_gpos)
+  local_indices_neg = permuted_variable_partition(noneg,lneg_to_gneg,lneg_to_owner;start=first_gneg)
 
   return PRange(local_indices_pos), PRange(local_indices_neg)
 end
@@ -382,7 +381,7 @@ end
 Similar to `generate_gids`, but uses a global partition given by `lid_to_color` to generate
 a different set of global ids per color. Returns:
 
-- a tuple with a `PRange` of `LocalIndices` per color.
+- a tuple with a `PRange` of `PermutedLocalIndices` per color.
 - a mapping `lid_to_clid` that maps local ids to local color ids
 - a mapping `color_to_clid_to_lid` that maps, for each color, local color ids to local ids.
 """
@@ -467,11 +466,6 @@ function generate_gids_by_color(
   )
   t2 = fetch_vector_ghost_values!(cell_to_gids,cache_fetch)
 
-  # Find the global range of owned dofs
-  color_to_ngids = map(1:ncolors) do c
-    reduction(+,map(Base.Fix2(getindex,c), color_to_noids); destination=:all, init=0)
-  end |> to_parray_of_arrays
-
   # Finish exchanging the gids.
   wait(t2)
   map(
@@ -495,25 +489,16 @@ function generate_gids_by_color(
   fetch_vector_ghost_values!(cell_to_gids,cache_fetch) |> wait
   cell_wise_to_dof_wise!(lid_to_gid,cell_to_gids,cell_to_lids,ghost_to_local(cell_range))
 
-  # Setup DoFs LocalIndices per color
-  color_to_indices = map(
-    partition(cell_range), lid_to_color, lid_to_gid, lid_to_owner, color_to_clid_to_lid, color_to_ngids
-  ) do indices, lid_to_color, lid_to_gid, lid_to_owner, color_to_clid_to_lid, color_to_ngids
-    rank = part_id(indices)
-
-    color_to_indices = ()
-    for c in 1:ncolors
-      cids = LocalIndices(
-        color_to_ngids[c], rank, 
-        lid_to_gid[color_to_clid_to_lid[c]], 
-        lid_to_owner[color_to_clid_to_lid[c]]
-      )
-      color_to_indices = (color_to_indices..., cids)
-    end
-    return color_to_indices
-  end |> tuple_of_arrays
+  # Setup DoFs indices per color
+  color_to_indices = ntuple(ncolors) do c
+    c_noids  = map(Base.Fix2(getindex,c), color_to_noids)
+    c_fgid   = color_to_fgid[c]
+    c_gids   = map(getindex, lid_to_gid,   color_to_clid_to_lid[c])
+    c_owners = map(getindex, lid_to_owner, color_to_clid_to_lid[c])
+    PRange(permuted_variable_partition(c_noids, c_gids, c_owners; start=c_fgid))
+  end
 
-  return map(PRange,color_to_indices), lid_to_clid, color_to_clid_to_lid
+  return color_to_indices, lid_to_clid, color_to_clid_to_lid
 end
 
 """
@@ -521,7 +506,7 @@ end
 
 Given a set of global ids and a mapping from local ids to colors, this function splits
 the global ids into different sets of global ids per color. Returns a tuple with a `PRange`
-of `LocalIndices` per color.
+of `PermutedLocalIndices` per color.
 """
 function split_gids_by_color(
   gids::PRange, lid_to_color::AbstractArray, 
@@ -543,9 +528,6 @@ function split_gids_by_color(
   color_to_fgid = map(1:ncolors) do c
     scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
   end |> to_parray_of_arrays
-  color_to_ngids = map(1:ncolors) do c
-    reduction(+,map(Base.Fix2(getindex,c), color_to_noids); destination=:all, init=0)
-  end |> to_parray_of_arrays
 
   lid_to_cgid = map(partition(gids), lid_to_color, color_to_fgid) do ids, lid_to_color, color_to_fgid
     rank = part_id(ids)
@@ -561,31 +543,29 @@ function split_gids_by_color(
   end
   consistent!(PVector(lid_to_cgid,partition(gids))) |> wait
 
-  color_to_gids = map(
-    partition(gids), lid_to_color, lid_to_cgid, color_to_nlids, color_to_ngids
-  ) do ids, lid_to_color, lid_to_cgid, color_to_nlids, color_to_ngids
-    color_to_lid_to_gid = [zeros(Int, n) for n in color_to_nlids]
-    color_to_lid_to_owner = [zeros(Int32, n) for n in color_to_nlids]
+  # Pack per-color gid/owner arrays in a single pass over local dofs
+  all_c_gids, all_c_owners = map(
+    lid_to_color, lid_to_cgid, partition(gids), color_to_nlids
+  ) do lid_to_color, lid_to_cgid, ids, color_to_nlids
+    c_gids   = [zeros(Int,   n) for n in color_to_nlids]
+    c_owners = [zeros(Int32, n) for n in color_to_nlids]
     offsets = zeros(Int, ncolors)
     for (color, cgid, owner) in zip(lid_to_color, lid_to_cgid, local_to_owner(ids))
       offsets[color] += 1
       lid = offsets[color]
-      color_to_lid_to_gid[color][lid] = cgid
-      color_to_lid_to_owner[color][lid] = owner
+      c_gids[color][lid]   = cgid
+      c_owners[color][lid] = owner
     end
-
-    color_to_ids = ()
-    for color in 1:ncolors
-      cids = LocalIndices(
-        color_to_ngids[color], part_id(ids), 
-        color_to_lid_to_gid[color], color_to_lid_to_owner[color]
-      )
-      color_to_ids = (color_to_ids..., cids)
-    end
-    return color_to_ids
+    return c_gids, c_owners
   end |> tuple_of_arrays
 
-  return map(PRange,color_to_gids)
+  return ntuple(ncolors) do c
+    c_noids  = map(Base.Fix2(getindex,c), color_to_noids)
+    c_fgid   = color_to_fgid[c]
+    c_gids   = map(Base.Fix2(getindex,c), all_c_gids)
+    c_owners = map(Base.Fix2(getindex,c), all_c_owners)
+    PRange(permuted_variable_partition(c_noids, c_gids, c_owners; start=c_fgid))
+  end
 end
 
 # FEFunction related
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 76a5eff1..8158e1fe 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -153,21 +153,11 @@ function _filter_ghost(indices,gids,owners)
   return new_ghost_to_global, new_ghost_to_owner
 end
 
-# function PArrays.remove_ghost(indices::PermutedLocalIndices)
-#   n_global = global_length(indices) 
-#   own = OwnIndices(n_global,part_id(indices),own_to_global(indices))
-#   ghost = GhostIndices(n_global,Int[],Int32[])
-#   OwnAndGhostIndices(own,ghost)
-# end
-
 function PArrays.remove_ghost(indices::PermutedLocalIndices)
-  perm = indices.perm
-  own_to_local = indices.own_to_local
-  display(own_to_local)
-  println(issorted(own_to_local))
-  display(perm)
-  println("-------------------------------")
-  remove_ghost(indices.indices)
+  n_global = global_length(indices) 
+  own = OwnIndices(n_global,part_id(indices),own_to_global(indices))
+  ghost = GhostIndices(n_global,Int[],Int32[])
+  OwnAndGhostIndices(own,ghost)
 end
 
 # This function computes a mapping among the local identifiers of a and b

From 817a882aaa121ccb034d0885e3f93735394c1ab6 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Wed, 13 May 2026 19:43:23 +1000
Subject: [PATCH 26/48] Minor bugfix

---
 src/BlockPartitionedArrays.jl | 17 +++++++++++++++++
 src/PArraysExtras.jl          | 15 ---------------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl
index a8bebdba..d754f5ae 100644
--- a/src/BlockPartitionedArrays.jl
+++ b/src/BlockPartitionedArrays.jl
@@ -215,6 +215,13 @@ function Base.all(f::Function,x::BlockPVector)
   all(map(xi->all(f,xi),blocks(x)))
 end
 
+function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
+  map(blocks(x),blocks(y)) do x,y
+    LinearAlgebra.axpy!(α,x,y)
+  end
+  return y
+end
+
 function LinearAlgebra.rmul!(a::BlockPVector,v::Number)
   map(ai->rmul!(ai,v),blocks(a))
   return a
@@ -371,6 +378,16 @@ function LinearAlgebra.fillstored!(a::BlockPMatrix,v)
   return a
 end
 
+function Algebra.axpy_entries!(
+  α::Number, A::BlockPMatrix, B::BlockPMatrix;
+  check::Bool=true
+)
+  map(blocks(A),blocks(B)) do A, B
+    Algebra.axpy_entries!(α,A,B;check)
+  end
+  return B
+end
+
 # Broadcasting
 
 struct BlockPBroadcasted{A,B}
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 8158e1fe..c46d55d9 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -207,12 +207,6 @@ function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
   return y
 end
 
-function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
-  map(blocks(x),blocks(y)) do x,y
-    LinearAlgebra.axpy!(α,x,y)
-  end
-  return y
-end
 
 function Algebra.axpy_entries!(
   α::Number, A::PSparseMatrix, B::PSparseMatrix;
@@ -229,15 +223,6 @@ function Algebra.axpy_entries!(
   return B
 end
 
-function Algebra.axpy_entries!(
-  α::Number, A::BlockPMatrix, B::BlockPMatrix;
-  check::Bool=true
-)
-  map(blocks(A),blocks(B)) do A, B
-    Algebra.axpy_entries!(α,A,B;check)
-  end
-  return B
-end
 
 # Array of PArrays -> PArray of Arrays 
 # TODO: I think this is now implemented in PartitionedArrays.jl (check)

From 12e1bc2d7488fd288c28880ee052ee48dc8aa660 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Wed, 13 May 2026 20:02:43 +1000
Subject: [PATCH 27/48] Fixed rhs callback

---
 src/Assembly.jl | 50 +++++++++++++++++++------------------------------
 1 file changed, 19 insertions(+), 31 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 41673c5e..54103baf 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -214,6 +214,18 @@ function Arrays.nz_allocation(a::PVectorCounter{<:Assembled})
   return DistributedAllocation(allocs,a.axes,a.strategy)
 end
 
+function rhs_callback(b::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}}, rows)
+  b_fespace = PVector(local_views(b), partition(axes(b,1)))
+  locally_repartition(b_fespace, rows)
+end
+
+function rhs_callback(b::PVectorAllocation{<:Assembled}, rows)
+  new_indices = collect_touched_ids(b)
+  values = map(ai -> ai.values, local_views(b))
+  b_fespace = PVector(values, partition(axes(b,1)))
+  locally_repartition(b_fespace, new_indices)
+end
+
 function Algebra.create_from_nz(a::PVector)
   assemble!(a) |> wait
   return a
@@ -226,20 +238,15 @@ function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVecto
 end
 
 function Algebra.create_from_nz(a::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}})
-  rows = remove_ghost(unpermute(axes(a,1)))
-  values = local_views(a)
-  b = rhs_callback(values,rows)
-  return b
+  rows = map(remove_ghost, map(unpermute, partition(axes(a,1))))
+  return rhs_callback(a, rows)
 end
 
 function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled})
   rows = collect_touched_ids(a)
-  values = map(ai -> ai.values, local_views(a))
-  b  = rhs_callback(values,rows)
+  b = rhs_callback(a, rows)
   t2 = assemble!(b)
-  if t2 !== nothing
-    wait(t2)
-  end
+  t2 !== nothing && wait(t2)
   return b
 end
 
@@ -261,12 +268,7 @@ function Algebra.create_from_nz(
   a::PSparseMatrixAllocation{<:LocallyAssembled},
   b::PVectorAllocation{<:LocallyAssembled}
 )
-  function callback(rows)
-    values = local_views(b)
-    b_fespace = PVector(values,partition(axes(b,1)))
-    locally_repartition(b_fespace,rows)
-  end
-  A, B = create_from_nz_locally_assembled(a,callback)
+  A, B = create_from_nz_locally_assembled(a, Base.Fix1(rhs_callback,b))
   return A, B
 end
 
@@ -274,16 +276,7 @@ function Algebra.create_from_nz(
   a::PSparseMatrixAllocation{<:Assembled},
   b::PVectorAllocation{<:Assembled}
 )
-  function callback(rows)
-    new_indices = collect_touched_ids(b)
-    values = map(ai -> ai.values, local_views(b))
-    b_fespace = PVector(values,partition(axes(b,1)))
-    locally_repartition(b_fespace,new_indices)
-  end
-  function async_callback(b)
-    assemble!(b)
-  end
-  A, B = create_from_nz_assembled(a,callback,async_callback)
+  A, B = create_from_nz_assembled(a, Base.Fix1(rhs_callback,b), assemble!)
   return A, B
 end
 
@@ -291,12 +284,7 @@ function Algebra.create_from_nz(
   a::PSparseMatrixAllocation{<:SubAssembled},
   b::PVectorAllocation{<:SubAssembled}
 )
-  function callback(rows)
-    @check PArrays.matching_local_indices(PRange(rows),axes(b,1))
-    values = local_views(b)
-    PVector(values,partition(axes(b,1)))
-  end
-  A, B = create_from_nz_subassembled(a,callback)
+  A, B = create_from_nz_subassembled(a, Base.Fix1(rhs_callback,b))
   return A, B
 end
 

From 17bd78d7063db542fc4eae423ed20517ce8bf2dc Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Wed, 13 May 2026 23:19:30 +1000
Subject: [PATCH 28/48] More bugfixes

---
 src/MultiField.jl    | 104 +++++++++++++------------------------------
 src/PArraysExtras.jl |   9 ++--
 2 files changed, 37 insertions(+), 76 deletions(-)

diff --git a/src/MultiField.jl b/src/MultiField.jl
index cfe96795..3cb7c272 100644
--- a/src/MultiField.jl
+++ b/src/MultiField.jl
@@ -362,104 +362,64 @@ function generate_multi_field_gids(
   p_f_flid_lid = map(v,f_p_flid_lid...)
 
   # Find the first gid of the multifield space in each part
-  ngids = sum(map(length,f_frange))
   p_noids = map(f_fiset->sum(map(own_length,f_fiset)),p_f_fiset)
   p_firstgid = scan(+,p_noids,type=:exclusive,init=one(eltype(p_noids)))
 
   # Distributed gids to owned dofs
-  p_lid_gid, p_lid_part = map(
-    p_f_flid_lid, p_f_fiset, p_firstgid) do f_flid_lid, f_fiset, firstgid
-    nlids = sum(map(length,f_flid_lid))
-    lid_gid = zeros(Int,nlids)
-    lid_part = zeros(Int32,nlids)
-    nf = length(f_fiset)
+  p_lid_gid, p_lid_part = map(p_f_flid_lid, p_f_fiset, p_firstgid) do f_flid_lid, f_fiset, firstgid
+    nlids    = sum(length, f_flid_lid)
+    lid_gid  = zeros(Int,   nlids)
+    lid_part = zeros(Int32, nlids)
     gid = firstgid
-    for f in 1:nf
-      fiset = f_fiset[f]
-      fiset_owner_to_local = own_to_local(fiset)
-      flid_lid = f_flid_lid[f]
+    for (flid_lid, fiset) in zip(f_flid_lid, f_fiset)
       part = part_id(fiset)
-      for foid in 1:own_length(fiset)
-        flid = fiset_owner_to_local[foid]
-        lid = flid_lid[flid]
-        lid_part[lid] = part
-        lid_gid[lid] = gid
+      for flid in own_to_local(fiset)
+        lid_gid[flid_lid[flid]]  = gid
+        lid_part[flid_lid[flid]] = part
         gid += 1
       end
     end
-    lid_gid,lid_part
+    return lid_gid, lid_part
   end |> tuple_of_arrays
 
-  # Now we need to propagate to ghost
-  # to this end we use the already available
-  # communicators in each of the single fields
-  # We cannot use the cell wise dof like in the old version
-  # since each field can be defined on an independent mesh.
+  # Now we need to propagate to ghost. To this end we use the already available
+  # communicators in each of the single fields.
+  # We cannot use the cell wise communication since each field can be defined on an independent mesh.
   f_aux_gids = map(frange->PVector{Vector{eltype(eltype(p_lid_gid))}}(undef,partition(frange)),f_frange)
   f_aux_part = map(frange->PVector{Vector{eltype(eltype(p_lid_part))}}(undef,partition(frange)),f_frange)
   propagate_to_ghost_multifield!(p_lid_gid,f_aux_gids,f_p_flid_lid,f_p_fiset)
   propagate_to_ghost_multifield!(p_lid_part,f_aux_part,f_p_flid_lid,f_p_fiset)
 
-  p_iset = map(partition(f_frange[1]),p_lid_gid,p_lid_part) do indices,
-                                                               lid_to_gid,
-                                                               lid_to_owner
-     me = part_id(indices)
-     LocalIndices(ngids,me,lid_to_gid,lid_to_owner)
+  p_iset = map(p_lid_gid,p_lid_part,p_noids,p_firstgid) do lid_to_gid, lid_to_owner, nodofs, firstgid
+    permuted_variable_partition(nodofs,lid_to_gid,lid_to_owner;start=firstgid)
   end
 
-  # Merge neighbors
-  function merge_neigs(f_neigs)
-    dict = Dict{Int32,Int32}()
-    for f in 1:length(f_neigs)
-      for neig in f_neigs[f]
-        dict[neig] = neig
-      end
-    end
-    collect(keys(dict))
-  end
-
-  f_p_parts_snd, f_p_parts_rcv = map(x->assembly_neighbors(partition(x)),f_frange) |> tuple_of_arrays
-  p_f_parts_snd = map(v,f_p_parts_snd...)
-  p_f_parts_rcv = map(v,f_p_parts_rcv...)
-  p_neigs_snd = map(merge_neigs,p_f_parts_snd)
-  p_neigs_rcv = map(merge_neigs,p_f_parts_rcv)
+  f_p_parts_snd, f_p_parts_rcv = map(assembly_neighbors ∘ partition, f_frange) |> tuple_of_arrays
+  merge_neigs(f_neigs...) = sort(unique(vcat(f_neigs...)))
+  p_neigs_snd = map(merge_neigs,f_p_parts_snd...)
+  p_neigs_rcv = map(merge_neigs,f_p_parts_rcv...)
 
   exchange_graph = ExchangeGraph(p_neigs_snd,p_neigs_rcv)
-  assembly_neighbors(p_iset;neighbors=exchange_graph)
+  assembly_neighbors(p_iset; neighbors = exchange_graph)
 
-  PRange(p_iset)
+  return PRange(p_iset)
 end
 
 function propagate_to_ghost_multifield!(
-  p_lid_gid,f_gids,f_p_flid_lid,f_p_fiset)
-  # Loop over fields
-  nf = length(f_gids)
-  for f in 1:nf
-    # Write data into owned in single-field buffer
-    gids = f_gids[f]
-    p_flid_gid = gids.vector_partition
-    p_flid_lid = f_p_flid_lid[f]
-    p_fiset = f_p_fiset[f]
-    map(
-      p_flid_gid,p_flid_lid,p_lid_gid,p_fiset) do flid_gid,flid_lid,lid_gid,fiset
-      fiset_own_to_local = own_to_local(fiset)
-      for foid in 1:own_length(fiset)
-        flid = fiset_own_to_local[foid]
-        lid = flid_lid[flid]
-        flid_gid[flid] = lid_gid[lid]
+  p_lid_gid, f_gids, f_p_flid_lid, f_p_fiset
+)
+  for (gids, p_flid_lid, p_fiset) in zip(f_gids, f_p_flid_lid, f_p_fiset)
+    p_flid_gid = local_views(gids)
+    map(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
+      for flid in own_to_local(fiset)
+        flid_gid[flid] = lid_gid[flid_lid[flid]]
       end
     end
-    # move to ghost
-    cache=fetch_vector_ghost_values_cache(partition(gids),p_fiset)
-    fetch_vector_ghost_values!(partition(gids),cache) |> wait
-    # write again into multifield array on ghost ids
-    map(
-      p_flid_gid,p_flid_lid,p_lid_gid,p_fiset) do flid_gid,flid_lid,lid_gid,fiset
-      fiset_ghost_to_local=ghost_to_local(fiset)
-      for fhid in 1:ghost_length(fiset)
-        flid = fiset_ghost_to_local[fhid]
-        lid = flid_lid[flid]
-        lid_gid[lid] = flid_gid[flid]
+    cache = fetch_vector_ghost_values_cache(partition(gids), p_fiset)
+    fetch_vector_ghost_values!(partition(gids), cache) |> wait
+    map(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
+      for flid in ghost_to_local(fiset)
+        lid_gid[flid_lid[flid]] = flid_gid[flid]
       end
     end
   end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index c46d55d9..b30aaf2a 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -64,7 +64,7 @@ function unpermute(indices::LocalIndices)
   rank = part_id(indices)
   own = OwnIndices(nglobal,rank,own_to_global(indices))
   ghost = GhostIndices(nglobal,ghost_to_global(indices),ghost_to_owner(indices))
-  OwnAndGhostIndices(own,ghost,global_to_owner(indices))
+  OwnAndGhostIndices(own,ghost)
 end
 
 """
@@ -98,14 +98,15 @@ function locally_repartition!(w::PVector,v::PVector)
   return w
 end
 
+
 """
     filter_and_replace_ghost(indices,gids)
 
 Replace ghost ids in `indices` with the ghost ids within `gids`.
 
-NOTE: The issue is that `replace_ghost` does not check if all gids are ghosts or whether 
-they are repeated. It also shifts ownership of the ghosts. Its all quite messy and not what we 
-would need. TODO: Make me better. 
+NOTE: The issue is that `replace_ghost` does not check if all gids are ghosts or whether
+they are repeated. It also shifts ownership of the ghosts. Its all quite messy and not what we
+would need. TODO: Make me better.
 """
 function filter_and_replace_ghost(indices,gids)
   owners = find_owner(indices,gids)

From 1b9aef00f6cafaec908a1f232de048ae242e13da Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Thu, 14 May 2026 12:07:57 +1000
Subject: [PATCH 29/48] Move gid-cretion code into new file

---
 src/FESpaces.jl          | 555 ++--------------------------------
 src/Geometry.jl          |  51 +---
 src/GridapDistributed.jl |   2 +
 src/GridapExtras.jl      |   9 -
 src/MultiField.jl        |  76 +----
 src/Partitions.jl        | 634 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 661 insertions(+), 666 deletions(-)
 create mode 100644 src/Partitions.jl

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 4ccbb800..f403324c 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -67,506 +67,6 @@ function FESpaces.gather_free_and_dirichlet_values(f::DistributedFESpace,cell_va
   return free_values, dirichlet_values
 end
 
-function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids)
-  map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids
-    cache  = array_cache(cell_to_ldofs)
-    for cell in cell_ids
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      p = cwv.ptrs[cell]-1
-      for (i,ldof) in enumerate(ldofs)
-        if ldof > 0
-          cwv.data[i+p] = dwv[ldof]
-        end
-      end
-    end
-  end
-  return cell_wise_vector
-end
-
-function posneg_wise_to_cell_wise!(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids)
-  map(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids) do cwv,pwv,nwv,cell_to_posneg,cell_ids
-    cache  = array_cache(cell_to_posneg)
-    for cell in cell_ids
-      lids = getindex!(cache,cell_to_posneg,cell)
-      p = cwv.ptrs[cell]-1
-      for (i,lid) in enumerate(lids)
-        if lid > 0
-          cwv.data[i+p] = pwv[lid]
-        elseif lid < 0
-          cwv.data[i+p] = nwv[-lid]
-        end
-      end
-    end
-  end
-  return cell_wise_vector
-end
-
-function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids)
-  map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids
-    cache = array_cache(cell_to_ldofs)
-    for cell in cell_ids
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      p = cwv.ptrs[cell]-1
-      for (i,ldof) in enumerate(ldofs)
-        if ldof > 0
-          dwv[ldof] = cwv.data[i+p]
-        end
-      end
-    end
-  end
-  return dof_wise_vector
-end
-
-function cell_wise_to_posneg_wise!(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids)
-  map(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids) do pwv,nwv,cwv,cell_to_posneg,cell_ids
-    cache = array_cache(cell_to_posneg)
-    for cell in cell_ids
-      lids = getindex!(cache,cell_to_posneg,cell)
-      p = cwv.ptrs[cell]-1
-      for (i,lid) in enumerate(lids)
-        if lid > 0
-          pwv[lid] = cwv.data[i+p]
-        elseif lid < 0
-          nwv[-lid] = cwv.data[i+p]
-        end
-      end
-    end
-  end
-  return pos_wise_vector, neg_wise_vector
-end
-
-function allocate_cell_wise_vector(T, cell_to_lids)
-  map(cell_to_lids) do cell_to_lids
-    ptrs = Arrays.generate_ptrs(cell_to_lids)
-    data = zeros(T,ptrs[end]-1)
-    JaggedArray(data,ptrs)
-  end
-end
-
-function dof_wise_to_cell_wise(
-  dof_wise_vector, cell_to_ldofs, cell_ids; 
-  T = eltype(eltype(dof_wise_vector))
-)
-  cwv = allocate_cell_wise_vector(T,cell_to_ldofs)
-  dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids)
-  return cwv
-end
-
-function fetch_vector_ghost_values_cache(vector_partition,partition)
-  cache = PArrays.p_vector_cache(vector_partition,partition)
-  reverse(cache)
-end
-
-function fetch_vector_ghost_values!(vector_partition,cache)
-  assemble!((a,b)->b, vector_partition, cache) 
-end
-
-"""
-    generate_gids(cell_gids::PRange, cell_to_lids, nlids) -> PRange
-
-Given a set of cell global ids, a distributed array of local tables mapping cells to local dof ids,
-and a distributed array with the number of local dofs in each partition, this function 
-generates the global dof ids and returns them as a `PRange` of `PermutedLocalIndices`.
-
-Ignores negative local dof ids (usually used for Dirichlet dofs).
-"""
-function generate_gids(
-  cell_range::PRange,
-  cell_to_ldofs::AbstractArray{<:AbstractArray},
-  nldofs::AbstractArray{<:Integer}
-)
-  ranks = linear_indices(partition(cell_range))
-  cell_ldofs_to_data = allocate_cell_wise_vector(Int, cell_to_ldofs)
-  cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_data,partition(cell_range))
-
-  # Find and count number owned dofs
-  ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs
-    ldof_to_owner = fill(Int32(0),nldofs)
-    cache = array_cache(cell_to_ldofs)
-    for (cell, owner) in enumerate(local_to_owner(indices))
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      for ldof in ldofs
-        if ldof > 0
-          # NOTE: this approach concentrates dofs in the last processor
-          ldof_to_owner[ldof] = max(owner,ldof_to_owner[ldof])
-        end
-      end
-    end
-    nodofs = count(isequal(part_id(indices)),ldof_to_owner)
-    ldof_to_owner, nodofs
-  end |> tuple_of_arrays
-
-  # Find the global range of owned dofs
-  first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs)))
-
-  # Exchange the dof owners. Cell owner always has correct dof owner.
-  cell_ldofs_to_owner = dof_wise_to_cell_wise!(
-    cell_ldofs_to_data,ldof_to_owner,cell_to_ldofs,own_to_local(cell_range)
-  )
-  fetch_vector_ghost_values!(cell_ldofs_to_owner,cache_fetch) |> wait
-  cell_wise_to_dof_wise!(
-    ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range)
-  )
-
-  # Fill owned gids
-  ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner
-    offset = first_gdof-1
-    ldof_to_gdof = zeros(Int,length(ldof_to_owner))
-    odof = 0
-    for (ldof,owner) in enumerate(ldof_to_owner)
-      if owner == rank
-        odof += 1
-        ldof_to_gdof[ldof] = odof + offset
-      end
-    end
-    ldof_to_gdof
-  end
-
-  # Exchange gids
-  cell_to_gdofs = dof_wise_to_cell_wise!(
-    cell_ldofs_to_data,ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range)
-  )
-  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
-
-  # Fill ghost gids with exchanged information
-  map(
-    cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)
-  ) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices
-    cache = array_cache(cell_to_ldofs)
-    lcell_to_owner = local_to_owner(indices)
-    for cell in ghost_to_local(indices)
-      p = cell_to_gdofs.ptrs[cell]-1
-      ldofs = getindex!(cache,cell_to_ldofs,cell)
-      cell_owner = lcell_to_owner[cell]
-      for (i,ldof) in enumerate(ldofs)
-        if (ldof > 0) && isequal(ldof_to_owner[ldof],cell_owner)
-          ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
-        end
-      end
-    end
-  end
-
-  dof_wise_to_cell_wise!(cell_to_gdofs,ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range))
-  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
-  cell_wise_to_dof_wise!(ldof_to_gdof,cell_to_gdofs,cell_to_ldofs,ghost_to_local(cell_range))
-
-  # Setup DoFs LocalIndices
-  local_indices = permuted_variable_partition(
-    nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof
-  )
-
-  return PRange(local_indices)
-end
-
-"""
-    generate_posneg_gids(cell_gids::PRange, cell_to_lposneg, nlpos, nlneg) -> (PRange,PRange)
-
-Similar to `generate_gids`, but also handles negative local dof ids. Returns two sets of 
-global ids: one for positive local ids and another for negative local ids.
-This can be used to generate simultaneously free and dirichlet dof global ids.
-"""
-function generate_posneg_gids(
-  cell_range::PRange,
-  cell_to_lposneg::AbstractArray{<:AbstractArray},
-  nlpos::AbstractArray{<:Integer},
-  nlneg::AbstractArray{<:Integer}
-)
-  ranks = linear_indices(partition(cell_range))
-  cell_lids_to_data = allocate_cell_wise_vector(Int, cell_to_lposneg)
-  cache_fetch = fetch_vector_ghost_values_cache(cell_lids_to_data,partition(cell_range))
-
-  # Find and count number owned dofs
-  lpos_to_owner, lneg_to_owner, nopos, noneg = map(
-    partition(cell_range),cell_to_lposneg,nlpos,nlneg
-  ) do indices,cell_to_lposneg,nlpos,nlneg
-    lpos_to_owner = fill(zero(Int32),nlpos)
-    lneg_to_owner = fill(zero(Int32),nlneg)
-    cache = array_cache(cell_to_lposneg)
-    for (cell, owner) in enumerate(local_to_owner(indices))
-      lids = getindex!(cache,cell_to_lposneg,cell)
-      for lid in lids
-        if lid > 0
-          lpos_to_owner[lid] = max(owner,lpos_to_owner[lid])
-        elseif lid < 0
-          lneg_to_owner[-lid] = max(owner,lneg_to_owner[-lid])
-        end
-      end
-    end
-    rank = part_id(indices)
-    nopos = count(isequal(rank),lpos_to_owner)
-    noneg = count(isequal(rank),lneg_to_owner)
-    return lpos_to_owner, lneg_to_owner, nopos, noneg
-  end |> tuple_of_arrays
-
-  # Find the global range of owned dofs
-  first_gpos = scan(+,nopos,type=:exclusive,init=1)
-  first_gneg = scan(+,noneg,type=:exclusive,init=1)
-
-  # Exchange the dof owners. Cell owner always has correct dof owner.
-  cell_ldofs_to_owner = posneg_wise_to_cell_wise!(
-    cell_lids_to_data,lpos_to_owner,lneg_to_owner,cell_to_lposneg,own_to_local(cell_range)
-  )
-  fetch_vector_ghost_values!(cell_ldofs_to_owner,cache_fetch) |> wait
-  cell_wise_to_posneg_wise!(
-    lpos_to_owner,lneg_to_owner,cell_ldofs_to_owner,cell_to_lposneg,ghost_to_local(cell_range)
-  )
-
-  # Fill owned gids
-  lpos_to_gpos = map(ranks,first_gpos,lpos_to_owner) do rank,first_gpos,lpos_to_owner
-    offset = first_gpos-1
-    lpos_to_gpos = zeros(Int,length(lpos_to_owner))
-    opos = 0
-    for (lpos,owner) in enumerate(lpos_to_owner)
-      if owner == rank
-        opos += 1
-        lpos_to_gpos[lpos] = opos + offset
-      end
-    end
-    lpos_to_gpos
-  end
-
-  lneg_to_gneg = map(ranks,first_gneg,lneg_to_owner) do rank,first_gneg,lneg_to_owner
-    offset = first_gneg-1
-    lneg_to_gneg = zeros(Int,length(lneg_to_owner))
-    oneg = 0
-    for (lneg,owner) in enumerate(lneg_to_owner)
-      if owner == rank
-        oneg += 1
-        lneg_to_gneg[lneg] = oneg + offset
-      end
-    end
-    lneg_to_gneg
-  end
-
-  # Exchange gids
-  cell_to_gposneg = posneg_wise_to_cell_wise!(
-    cell_lids_to_data,lpos_to_gpos,lneg_to_gneg,cell_to_lposneg,own_to_local(cell_range)
-  )
-  fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
-
-  # Fill ghost gids with exchanged information
-  map(
-    cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,partition(cell_range)
-  ) do cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,indices
-    cache = array_cache(cell_to_lposneg)
-    lcell_to_owner = local_to_owner(indices)
-    for cell in ghost_to_local(indices)
-      p = cell_to_gposneg.ptrs[cell]-1
-      lids = getindex!(cache,cell_to_lposneg,cell)
-      cell_owner = lcell_to_owner[cell]
-      for (i,lid) in enumerate(lids)
-        if (lid > 0) && isequal(lpos_to_owner[lid],cell_owner)
-          lpos_to_gpos[lid] = cell_to_gposneg.data[i+p]
-        elseif (lid < 0) && isequal(lneg_to_owner[-lid],cell_owner)
-          lneg_to_gneg[-lid] = cell_to_gposneg.data[i+p]
-        end
-      end
-    end
-  end
-
-  posneg_wise_to_cell_wise!(cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,cell_to_lposneg,own_to_local(cell_range))
-  fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
-  cell_wise_to_posneg_wise!(lpos_to_gpos,lneg_to_gneg,cell_to_gposneg,cell_to_lposneg,ghost_to_local(cell_range))
-
-  # Setup DoFs LocalIndices
-  local_indices_pos = permuted_variable_partition(nopos,lpos_to_gpos,lpos_to_owner;start=first_gpos)
-  local_indices_neg = permuted_variable_partition(noneg,lneg_to_gneg,lneg_to_owner;start=first_gneg)
-
-  return PRange(local_indices_pos), PRange(local_indices_neg)
-end
-
-"""
-    generate_gids_by_color(cell_gids::PRange, cell_to_lids, lid_to_color, ncolors)
-
-Similar to `generate_gids`, but uses a global partition given by `lid_to_color` to generate
-a different set of global ids per color. Returns:
-
-- a tuple with a `PRange` of `PermutedLocalIndices` per color.
-- a mapping `lid_to_clid` that maps local ids to local color ids
-- a mapping `color_to_clid_to_lid` that maps, for each color, local color ids to local ids.
-"""
-function generate_gids_by_color(
-  cell_range::PRange, 
-  cell_to_lids::AbstractArray{<:AbstractArray},
-  lid_to_color::AbstractArray,
-  ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
-)
-  cell_lids_to_data = allocate_cell_wise_vector(Int, cell_to_lids)
-  cache_fetch = fetch_vector_ghost_values_cache(cell_lids_to_data,partition(cell_range))
-
-  lid_to_owner, color_to_noids = map(
-    partition(cell_range),cell_to_lids,lid_to_color
-  ) do indices, cell_to_lids, lid_to_color
-    lid_to_owner = fill(zero(Int32),length(lid_to_color))
-    cache = array_cache(cell_to_lids)
-    for (cell, owner) in enumerate(local_to_owner(indices))
-      lids = getindex!(cache,cell_to_lids,cell)
-      for lid in lids
-        (lid < 0) && continue
-        lid_to_owner[lid] = max(owner,lid_to_owner[lid])
-      end
-    end
-
-    rank = part_id(indices)
-    color_to_noids = zeros(Int,ncolors)
-    for (owner, color) in zip(lid_to_owner,lid_to_color)
-      color_to_noids[color] += isequal(owner,rank)
-    end
-
-    return lid_to_owner, color_to_noids
-  end |> tuple_of_arrays
-
-  # Find first gid per color
-  color_to_fgid = map(1:ncolors) do c
-    scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
-  end |> to_parray_of_arrays
-
-  # Start exchanging dof owners
-  cell_lids_to_owner = dof_wise_to_cell_wise!(
-    cell_lids_to_data,lid_to_owner,cell_to_lids,own_to_local(cell_range)
-  )
-  t1 = fetch_vector_ghost_values!(cell_lids_to_owner,cache_fetch)
-
-  # Note: lid_to_owner is still not consistent, but owned data is correct
-  lid_to_gid, lid_to_clid, color_to_clid_to_lid = map(
-    partition(cell_range), lid_to_color, lid_to_owner, color_to_fgid
-  ) do indices, lid_to_color, lid_to_owner, color_to_fgid
-    
-    # Count number of dofs per color
-    rank = part_id(indices)
-    color_to_nldofs = zeros(Int,ncolors)
-    lid_to_clid = zeros(Int32,length(lid_to_color))
-    lid_to_gid = zeros(Int,length(lid_to_color))
-    for (lid, (color, owner)) in enumerate(zip(lid_to_color,lid_to_owner))
-      color_to_nldofs[color] += 1
-      lid_to_clid[lid] = color_to_nldofs[color]
-      if isequal(owner,rank)
-        lid_to_gid[lid] = color_to_fgid[color]
-        color_to_fgid[color] += 1
-      end
-    end
-
-    # Find clid to lid mapping
-    color_to_clid_to_lid = [fill(zero(Int32),n) for n in color_to_nldofs]
-    for (lid, (color, clid)) in enumerate(zip(lid_to_color,lid_to_clid))
-      color_to_clid_to_lid[color][clid] = lid
-    end
-    return lid_to_gid, lid_to_clid, color_to_clid_to_lid
-  end |> tuple_of_arrays
-
-  # Finish exchanging the dof owners.
-  wait(t1)
-  cell_wise_to_dof_wise!(
-    lid_to_owner,cell_lids_to_owner,cell_to_lids,ghost_to_local(cell_range)
-  )
-
-  # Start exchanging gids
-  cell_to_gids = dof_wise_to_cell_wise!(
-    cell_lids_to_data,lid_to_gid,cell_to_lids,own_to_local(cell_range)
-  )
-  t2 = fetch_vector_ghost_values!(cell_to_gids,cache_fetch)
-
-  # Finish exchanging the gids.
-  wait(t2)
-  map(
-    cell_to_lids,cell_to_gids,lid_to_gid,lid_to_owner,partition(cell_range)
-  ) do cell_to_lids,cell_to_gids,lid_to_gid,lid_to_owner,indices
-    cache = array_cache(cell_to_lids)
-    lcell_to_owner = local_to_owner(indices)
-    for cell in ghost_to_local(indices)
-      p = cell_to_gids.ptrs[cell]-1
-      lids = getindex!(cache,cell_to_lids,cell)
-      cell_owner = lcell_to_owner[cell]
-      for (i,lid) in enumerate(lids)
-        if (lid > 0) && isequal(lid_to_owner[lid],cell_owner)
-          lid_to_gid[lid] = cell_to_gids.data[i+p]
-        end
-      end
-    end
-  end
-
-  dof_wise_to_cell_wise!(cell_to_gids,lid_to_gid,cell_to_lids,own_to_local(cell_range))
-  fetch_vector_ghost_values!(cell_to_gids,cache_fetch) |> wait
-  cell_wise_to_dof_wise!(lid_to_gid,cell_to_gids,cell_to_lids,ghost_to_local(cell_range))
-
-  # Setup DoFs indices per color
-  color_to_indices = ntuple(ncolors) do c
-    c_noids  = map(Base.Fix2(getindex,c), color_to_noids)
-    c_fgid   = color_to_fgid[c]
-    c_gids   = map(getindex, lid_to_gid,   color_to_clid_to_lid[c])
-    c_owners = map(getindex, lid_to_owner, color_to_clid_to_lid[c])
-    PRange(permuted_variable_partition(c_noids, c_gids, c_owners; start=c_fgid))
-  end
-
-  return color_to_indices, lid_to_clid, color_to_clid_to_lid
-end
-
-"""
-    split_gids_by_color(gids::PRange, lid_to_color::AbstractArray, ncolors) -> NTuple{ncolors,PRange}
-
-Given a set of global ids and a mapping from local ids to colors, this function splits
-the global ids into different sets of global ids per color. Returns a tuple with a `PRange`
-of `PermutedLocalIndices` per color.
-"""
-function split_gids_by_color(
-  gids::PRange, lid_to_color::AbstractArray, 
-  ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
-)
-  color_to_nlids, color_to_noids = map(partition(gids), lid_to_color) do ids, lid_to_color
-    rank = part_id(ids)
-    lid_to_owner = local_to_owner(ids)
-    color_to_nlids = zeros(Int,ncolors)
-    color_to_noids = zeros(Int,ncolors)
-    for (owner,color) in zip(lid_to_owner,lid_to_color)
-      color_to_nlids[color] += 1
-      color_to_noids[color] += isequal(owner,rank)
-    end
-    return color_to_nlids, color_to_noids
-  end |> tuple_of_arrays
-
-  # I wish we could reduce arrays directly...
-  color_to_fgid = map(1:ncolors) do c
-    scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
-  end |> to_parray_of_arrays
-
-  lid_to_cgid = map(partition(gids), lid_to_color, color_to_fgid) do ids, lid_to_color, color_to_fgid
-    rank = part_id(ids)
-    lid_to_cgid = zeros(Int,length(lid_to_color))
-    color_to_offset = zeros(Int,ncolors)
-    for (lid,(owner,color)) in enumerate(zip(local_to_owner(ids),lid_to_color))
-      if isequal(owner,rank)
-        lid_to_cgid[lid] = color_to_fgid[color] + color_to_offset[color]
-        color_to_offset[color] += 1
-      end
-    end
-    return lid_to_cgid
-  end
-  consistent!(PVector(lid_to_cgid,partition(gids))) |> wait
-
-  # Pack per-color gid/owner arrays in a single pass over local dofs
-  all_c_gids, all_c_owners = map(
-    lid_to_color, lid_to_cgid, partition(gids), color_to_nlids
-  ) do lid_to_color, lid_to_cgid, ids, color_to_nlids
-    c_gids   = [zeros(Int,   n) for n in color_to_nlids]
-    c_owners = [zeros(Int32, n) for n in color_to_nlids]
-    offsets = zeros(Int, ncolors)
-    for (color, cgid, owner) in zip(lid_to_color, lid_to_cgid, local_to_owner(ids))
-      offsets[color] += 1
-      lid = offsets[color]
-      c_gids[color][lid]   = cgid
-      c_owners[color][lid] = owner
-    end
-    return c_gids, c_owners
-  end |> tuple_of_arrays
-
-  return ntuple(ncolors) do c
-    c_noids  = map(Base.Fix2(getindex,c), color_to_noids)
-    c_fgid   = color_to_fgid[c]
-    c_gids   = map(Base.Fix2(getindex,c), all_c_gids)
-    c_owners = map(Base.Fix2(getindex,c), all_c_owners)
-    PRange(permuted_variable_partition(c_noids, c_gids, c_owners; start=c_fgid))
-  end
-end
 
 # FEFunction related
 
@@ -1238,51 +738,39 @@ function FESpaces.numeric_loop_matrix_and_vector!(A,b,a::DistributedSparseMatrix
 end
 
 function FESpaces.collect_cell_matrix(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  a::DistributedDomainContribution
+  trial::DistributedFESpace, test::DistributedFESpace, matdata::DistributedDomainContribution
 )
-  map(collect_cell_matrix,local_views(trial),local_views(test),local_views(a))
+  map(collect_cell_matrix,local_views(trial),local_views(test),local_views(matdata))
 end
 
 function FESpaces.collect_cell_vector(
-  test::DistributedFESpace, a::DistributedDomainContribution
+  test::DistributedFESpace, vecdata::DistributedDomainContribution
 )
-  map(collect_cell_vector,local_views(test),local_views(a))
+  map(collect_cell_vector,local_views(test),local_views(vecdata))
 end
 
 function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  biform::DistributedDomainContribution,
-  liform::DistributedDomainContribution
+  trial::DistributedFESpace, test::DistributedFESpace,
+  matdata::DistributedDomainContribution, vecdata::DistributedDomainContribution
 )
   map(collect_cell_matrix_and_vector,
-    local_views(trial),
-    local_views(test),
-    local_views(biform),
-    local_views(liform)
+    local_views(trial), local_views(test),
+    local_views(matdata), local_views(vecdata)
   )
 end
 
 function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  biform::DistributedDomainContribution,
-  liform::DistributedDomainContribution,
-  uhd
+  trial::DistributedFESpace, test::DistributedFESpace,
+  matdata::DistributedDomainContribution, vecdata::DistributedDomainContribution, uhd
 )
   map(collect_cell_matrix_and_vector,
-    local_views(trial),
-    local_views(test),
-    local_views(biform),
-    local_views(liform),
-    local_views(uhd)
+    local_views(trial), local_views(test),
+    local_views(matdata), local_views(vecdata), local_views(uhd)
   )
 end
 
 function FESpaces.collect_cell_vector(
-  test::DistributedFESpace,l::Number
+  test::DistributedFESpace, l::Number
 )
   map(local_views(test)) do s
     collect_cell_vector(s,l)
@@ -1290,24 +778,19 @@ function FESpaces.collect_cell_vector(
 end
 
 function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  mat::DistributedDomainContribution,
-  l::Number
+  trial::DistributedFESpace, test::DistributedFESpace,
+  matdata::DistributedDomainContribution, l::Number
 )
-  map(local_views(trial),local_views(test),local_views(mat)) do u,v,m
+  map(local_views(trial),local_views(test),local_views(matdata)) do u,v,m
     collect_cell_matrix_and_vector(u,v,m,l)
   end
 end
 
 function FESpaces.collect_cell_matrix_and_vector(
-  trial::DistributedFESpace,
-  test::DistributedFESpace,
-  mat::DistributedDomainContribution,
-  l::Number,
-  uhd
+  trial::DistributedFESpace, test::DistributedFESpace,
+  matdata::DistributedDomainContribution, l::Number, uhd
 )
-  map(local_views(trial),local_views(test),local_views(mat),local_views(uhd)) do u,v,m,f
+  map(local_views(trial),local_views(test),local_views(matdata),local_views(uhd)) do u,v,m,f
     collect_cell_matrix_and_vector(u,v,m,l,f)
   end
 end
diff --git a/src/Geometry.jl b/src/Geometry.jl
index af872c47..7c49af11 100644
--- a/src/Geometry.jl
+++ b/src/Geometry.jl
@@ -957,12 +957,12 @@ function generate_cell_gids(dtrian::DistributedTriangulation)
   generate_cell_gids(dmodel,dtrian)
 end
 
-function generate_cell_gids(dmodel::DistributedDiscreteModel{Dm},
-                            dtrian::DistributedTriangulation{Dt}) where {Dm,Dt}
-
+function generate_cell_gids(
+  dmodel::DistributedDiscreteModel{Dm},
+  dtrian::DistributedTriangulation{Dt}
+) where {Dm,Dt}
   mgids = get_face_gids(dmodel,Dt)
-  covers_all_faces = _covers_all_faces(dmodel,dtrian)
-  if (covers_all_faces)
+  if _covers_all_faces(dmodel,dtrian)
     tgids = mgids
   else
     tcell_to_mcell = map(local_views(dtrian)) do trian
@@ -974,44 +974,3 @@ function generate_cell_gids(dmodel::DistributedDiscreteModel{Dm},
   end
   return tgids
 end
-
-function restrict_gids(gids::PRange, new_to_old_lid::AbstractArray)
-
-  n_own = map(partition(gids), new_to_old_lid) do ids, n2o_lid
-    rank = part_id(ids)
-    return count(isequal(rank), view(local_to_owner(ids), n2o_lid)) 
-  end
-
-  # Assign global ids to owned lids
-  first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
-  
-  old_lid_to_new_gid = map(first_gid,new_to_old_lid,partition(gids)) do first_gid, n2o_lid, ids
-    old_lid_to_new_gid = zeros(Int,local_length(ids))
-    old_lid_to_owner = local_to_owner(ids)
-    rank = part_id(ids)
-    gid = first_gid
-    for old in n2o_lid
-      if old_lid_to_owner[old] == rank
-        old_lid_to_new_gid[old] = gid
-        gid += 1
-      end
-    end
-    return old_lid_to_new_gid
-  end
-
-  consistent!(PVector(old_lid_to_new_gid,partition(gids))) |> wait
-
-  # Prepare new partition
-  n_gids = reduction(+,n_own,destination=:all,init=zero(eltype(n_own)))
-
-  new_indices = map(
-    n_gids, old_lid_to_new_gid, new_to_old_lid, partition(gids)
-  ) do n_gids, old_lid_to_new_gid, new_to_old_lid, ids
-    lid_to_gid = old_lid_to_new_gid[new_to_old_lid]
-    lid_to_owner  = local_to_owner(ids)[new_to_old_lid]
-    return LocalIndices(n_gids,part_id(ids),lid_to_gid,lid_to_owner)
-  end
-  _find_neighbours!(new_indices, partition(gids))
-
-  return PRange(new_indices)
-end
diff --git a/src/GridapDistributed.jl b/src/GridapDistributed.jl
index a1daddea..d3312203 100644
--- a/src/GridapDistributed.jl
+++ b/src/GridapDistributed.jl
@@ -55,6 +55,8 @@ include("Algebra.jl")
 
 include("Assembly.jl")
 
+include("Partitions.jl")
+
 include("Geometry.jl")
 
 include("CellData.jl")
diff --git a/src/GridapExtras.jl b/src/GridapExtras.jl
index e0723189..ba0bf326 100644
--- a/src/GridapExtras.jl
+++ b/src/GridapExtras.jl
@@ -1,13 +1,4 @@
 
-# Extensions to Gridap/Arrays/Tables.jl
-
-function generate_ptrs(vv::AbstractArray{<:AbstractArray{T}}) where T
-  ptrs = Vector{Int32}(undef,length(vv)+1)
-  Arrays._generate_data_and_ptrs_fill_ptrs!(ptrs,vv)
-  Arrays.length_to_ptrs!(ptrs)
-  return ptrs
-end
-
 # New type of Vector allocation
 
 """
diff --git a/src/MultiField.jl b/src/MultiField.jl
index 3cb7c272..b2ee9126 100644
--- a/src/MultiField.jl
+++ b/src/MultiField.jl
@@ -334,7 +334,7 @@ function generate_multi_field_gids(
     push!(f_p_flid_lid,p_flid_lid)
   end
   f_frange = map(get_free_dof_ids,f_dspace)
-  gids = generate_multi_field_gids(f_p_flid_lid,f_frange)
+  gids = vcat_gids(f_p_flid_lid,f_frange)
   return gids
 end
 
@@ -351,80 +351,6 @@ function generate_multi_field_gids(
   return BlockPRange(block_gids)
 end
 
-function generate_multi_field_gids(
-  f_p_flid_lid::AbstractVector{<:AbstractArray{<:AbstractVector}},
-  f_frange::AbstractVector{<:PRange})
-
-  f_p_fiset = map(local_views,f_frange)
-
-  v(x...) = collect(x)
-  p_f_fiset = map(v,f_p_fiset...)
-  p_f_flid_lid = map(v,f_p_flid_lid...)
-
-  # Find the first gid of the multifield space in each part
-  p_noids = map(f_fiset->sum(map(own_length,f_fiset)),p_f_fiset)
-  p_firstgid = scan(+,p_noids,type=:exclusive,init=one(eltype(p_noids)))
-
-  # Distributed gids to owned dofs
-  p_lid_gid, p_lid_part = map(p_f_flid_lid, p_f_fiset, p_firstgid) do f_flid_lid, f_fiset, firstgid
-    nlids    = sum(length, f_flid_lid)
-    lid_gid  = zeros(Int,   nlids)
-    lid_part = zeros(Int32, nlids)
-    gid = firstgid
-    for (flid_lid, fiset) in zip(f_flid_lid, f_fiset)
-      part = part_id(fiset)
-      for flid in own_to_local(fiset)
-        lid_gid[flid_lid[flid]]  = gid
-        lid_part[flid_lid[flid]] = part
-        gid += 1
-      end
-    end
-    return lid_gid, lid_part
-  end |> tuple_of_arrays
-
-  # Now we need to propagate to ghost. To this end we use the already available
-  # communicators in each of the single fields.
-  # We cannot use the cell wise communication since each field can be defined on an independent mesh.
-  f_aux_gids = map(frange->PVector{Vector{eltype(eltype(p_lid_gid))}}(undef,partition(frange)),f_frange)
-  f_aux_part = map(frange->PVector{Vector{eltype(eltype(p_lid_part))}}(undef,partition(frange)),f_frange)
-  propagate_to_ghost_multifield!(p_lid_gid,f_aux_gids,f_p_flid_lid,f_p_fiset)
-  propagate_to_ghost_multifield!(p_lid_part,f_aux_part,f_p_flid_lid,f_p_fiset)
-
-  p_iset = map(p_lid_gid,p_lid_part,p_noids,p_firstgid) do lid_to_gid, lid_to_owner, nodofs, firstgid
-    permuted_variable_partition(nodofs,lid_to_gid,lid_to_owner;start=firstgid)
-  end
-
-  f_p_parts_snd, f_p_parts_rcv = map(assembly_neighbors ∘ partition, f_frange) |> tuple_of_arrays
-  merge_neigs(f_neigs...) = sort(unique(vcat(f_neigs...)))
-  p_neigs_snd = map(merge_neigs,f_p_parts_snd...)
-  p_neigs_rcv = map(merge_neigs,f_p_parts_rcv...)
-
-  exchange_graph = ExchangeGraph(p_neigs_snd,p_neigs_rcv)
-  assembly_neighbors(p_iset; neighbors = exchange_graph)
-
-  return PRange(p_iset)
-end
-
-function propagate_to_ghost_multifield!(
-  p_lid_gid, f_gids, f_p_flid_lid, f_p_fiset
-)
-  for (gids, p_flid_lid, p_fiset) in zip(f_gids, f_p_flid_lid, f_p_fiset)
-    p_flid_gid = local_views(gids)
-    map(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
-      for flid in own_to_local(fiset)
-        flid_gid[flid] = lid_gid[flid_lid[flid]]
-      end
-    end
-    cache = fetch_vector_ghost_values_cache(partition(gids), p_fiset)
-    fetch_vector_ghost_values!(partition(gids), cache) |> wait
-    map(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
-      for flid in ghost_to_local(fiset)
-        lid_gid[flid_lid[flid]] = flid_gid[flid]
-      end
-    end
-  end
-end
-
 # BlockSparseMatrixAssemblers
 
 const DistributedBlockSparseMatrixAssembler{R,C} =
diff --git a/src/Partitions.jl b/src/Partitions.jl
new file mode 100644
index 00000000..9698ae61
--- /dev/null
+++ b/src/Partitions.jl
@@ -0,0 +1,634 @@
+
+# Cell-wise communication helpers
+
+function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids)
+  map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids
+    cache  = array_cache(cell_to_ldofs)
+    for cell in cell_ids
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,ldof) in enumerate(ldofs)
+        if ldof > 0
+          cwv.data[i+p] = dwv[ldof]
+        end
+      end
+    end
+  end
+  return cell_wise_vector
+end
+
+function posneg_wise_to_cell_wise!(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids)
+  map(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids) do cwv,pwv,nwv,cell_to_posneg,cell_ids
+    cache  = array_cache(cell_to_posneg)
+    for cell in cell_ids
+      lids = getindex!(cache,cell_to_posneg,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,lid) in enumerate(lids)
+        if lid > 0
+          cwv.data[i+p] = pwv[lid]
+        elseif lid < 0
+          cwv.data[i+p] = nwv[-lid]
+        end
+      end
+    end
+  end
+  return cell_wise_vector
+end
+
+function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids)
+  map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids
+    cache = array_cache(cell_to_ldofs)
+    for cell in cell_ids
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,ldof) in enumerate(ldofs)
+        if ldof > 0
+          dwv[ldof] = cwv.data[i+p]
+        end
+      end
+    end
+  end
+  return dof_wise_vector
+end
+
+function cell_wise_to_posneg_wise!(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids)
+  map(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids) do pwv,nwv,cwv,cell_to_posneg,cell_ids
+    cache = array_cache(cell_to_posneg)
+    for cell in cell_ids
+      lids = getindex!(cache,cell_to_posneg,cell)
+      p = cwv.ptrs[cell]-1
+      for (i,lid) in enumerate(lids)
+        if lid > 0
+          pwv[lid] = cwv.data[i+p]
+        elseif lid < 0
+          nwv[-lid] = cwv.data[i+p]
+        end
+      end
+    end
+  end
+  return pos_wise_vector, neg_wise_vector
+end
+
+function allocate_cell_wise_vector(T, cell_to_lids)
+  map(cell_to_lids) do cell_to_lids
+    ptrs = Arrays.generate_ptrs(cell_to_lids)
+    data = zeros(T,ptrs[end]-1)
+    JaggedArray(data,ptrs)
+  end
+end
+
+function dof_wise_to_cell_wise(
+  dof_wise_vector, cell_to_ldofs, cell_ids;
+  T = eltype(eltype(dof_wise_vector))
+)
+  cwv = allocate_cell_wise_vector(T,cell_to_ldofs)
+  dof_wise_to_cell_wise!(cwv,dof_wise_vector,cell_to_ldofs,cell_ids)
+  return cwv
+end
+
+function fetch_vector_ghost_values_cache(vector_partition,partition)
+  cache = PArrays.p_vector_cache(vector_partition,partition)
+  reverse(cache)
+end
+
+function fetch_vector_ghost_values!(vector_partition,cache)
+  assemble!((a,b)->b, vector_partition, cache)
+end
+
+# PRange generation from cell meshes
+
+"""
+    generate_gids(cell_gids::PRange, cell_to_lids, nlids) -> PRange
+
+Given a set of cell global ids, a distributed array of local tables mapping cells to local dof ids,
+and a distributed array with the number of local dofs in each partition, this function
+generates the global dof ids and returns them as a `PRange` of `PermutedLocalIndices`.
+
+Ignores negative local dof ids (usually used for Dirichlet dofs).
+"""
+function generate_gids(
+  cell_range::PRange,
+  cell_to_ldofs::AbstractArray{<:AbstractArray},
+  nldofs::AbstractArray{<:Integer}
+)
+  ranks = linear_indices(partition(cell_range))
+  cell_ldofs_to_data = allocate_cell_wise_vector(Int, cell_to_ldofs)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_ldofs_to_data,partition(cell_range))
+
+  # Find and count number owned dofs
+  ldof_to_owner, nodofs = map(partition(cell_range),cell_to_ldofs,nldofs) do indices,cell_to_ldofs,nldofs
+    ldof_to_owner = fill(Int32(0),nldofs)
+    cache = array_cache(cell_to_ldofs)
+    for (cell, owner) in enumerate(local_to_owner(indices))
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      for ldof in ldofs
+        if ldof > 0
+          # NOTE: this approach concentrates dofs in the last processor
+          ldof_to_owner[ldof] = max(owner,ldof_to_owner[ldof])
+        end
+      end
+    end
+    nodofs = count(isequal(part_id(indices)),ldof_to_owner)
+    ldof_to_owner, nodofs
+  end |> tuple_of_arrays
+
+  # Find the global range of owned dofs
+  first_gdof = scan(+,nodofs,type=:exclusive,init=one(eltype(nodofs)))
+
+  # Exchange the dof owners. Cell owner always has correct dof owner.
+  cell_ldofs_to_owner = dof_wise_to_cell_wise!(
+    cell_ldofs_to_data,ldof_to_owner,cell_to_ldofs,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_ldofs_to_owner,cache_fetch) |> wait
+  cell_wise_to_dof_wise!(
+    ldof_to_owner,cell_ldofs_to_owner,cell_to_ldofs,ghost_to_local(cell_range)
+  )
+
+  # Fill owned gids
+  ldof_to_gdof = map(ranks,first_gdof,ldof_to_owner) do rank,first_gdof,ldof_to_owner
+    offset = first_gdof-1
+    ldof_to_gdof = zeros(Int,length(ldof_to_owner))
+    odof = 0
+    for (ldof,owner) in enumerate(ldof_to_owner)
+      if owner == rank
+        odof += 1
+        ldof_to_gdof[ldof] = odof + offset
+      end
+    end
+    ldof_to_gdof
+  end
+
+  # Exchange gids
+  cell_to_gdofs = dof_wise_to_cell_wise!(
+    cell_ldofs_to_data,ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
+
+  # Fill ghost gids with exchanged information
+  map(
+    cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)
+  ) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices
+    cache = array_cache(cell_to_ldofs)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
+      p = cell_to_gdofs.ptrs[cell]-1
+      ldofs = getindex!(cache,cell_to_ldofs,cell)
+      cell_owner = lcell_to_owner[cell]
+      for (i,ldof) in enumerate(ldofs)
+        if (ldof > 0) && isequal(ldof_to_owner[ldof],cell_owner)
+          ldof_to_gdof[ldof] = cell_to_gdofs.data[i+p]
+        end
+      end
+    end
+  end
+
+  dof_wise_to_cell_wise!(cell_to_gdofs,ldof_to_gdof,cell_to_ldofs,own_to_local(cell_range))
+  fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
+  cell_wise_to_dof_wise!(ldof_to_gdof,cell_to_gdofs,cell_to_ldofs,ghost_to_local(cell_range))
+
+  # Setup DoFs LocalIndices
+  local_indices = permuted_variable_partition(
+    nodofs,ldof_to_gdof,ldof_to_owner;start=first_gdof
+  )
+
+  return PRange(local_indices)
+end
+
+"""
+    generate_posneg_gids(cell_gids::PRange, cell_to_lposneg, nlpos, nlneg) -> (PRange,PRange)
+
+Similar to `generate_gids`, but also handles negative local dof ids. Returns two sets of
+global ids: one for positive local ids and another for negative local ids.
+This can be used to generate simultaneously free and dirichlet dof global ids.
+"""
+function generate_posneg_gids(
+  cell_range::PRange,
+  cell_to_lposneg::AbstractArray{<:AbstractArray},
+  nlpos::AbstractArray{<:Integer},
+  nlneg::AbstractArray{<:Integer}
+)
+  ranks = linear_indices(partition(cell_range))
+  cell_lids_to_data = allocate_cell_wise_vector(Int, cell_to_lposneg)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_lids_to_data,partition(cell_range))
+
+  # Find and count number owned dofs
+  lpos_to_owner, lneg_to_owner, nopos, noneg = map(
+    partition(cell_range),cell_to_lposneg,nlpos,nlneg
+  ) do indices,cell_to_lposneg,nlpos,nlneg
+    lpos_to_owner = fill(zero(Int32),nlpos)
+    lneg_to_owner = fill(zero(Int32),nlneg)
+    cache = array_cache(cell_to_lposneg)
+    for (cell, owner) in enumerate(local_to_owner(indices))
+      lids = getindex!(cache,cell_to_lposneg,cell)
+      for lid in lids
+        if lid > 0
+          lpos_to_owner[lid] = max(owner,lpos_to_owner[lid])
+        elseif lid < 0
+          lneg_to_owner[-lid] = max(owner,lneg_to_owner[-lid])
+        end
+      end
+    end
+    rank = part_id(indices)
+    nopos = count(isequal(rank),lpos_to_owner)
+    noneg = count(isequal(rank),lneg_to_owner)
+    return lpos_to_owner, lneg_to_owner, nopos, noneg
+  end |> tuple_of_arrays
+
+  # Find the global range of owned dofs
+  first_gpos = scan(+,nopos,type=:exclusive,init=1)
+  first_gneg = scan(+,noneg,type=:exclusive,init=1)
+
+  # Exchange the dof owners. Cell owner always has correct dof owner.
+  cell_ldofs_to_owner = posneg_wise_to_cell_wise!(
+    cell_lids_to_data,lpos_to_owner,lneg_to_owner,cell_to_lposneg,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_ldofs_to_owner,cache_fetch) |> wait
+  cell_wise_to_posneg_wise!(
+    lpos_to_owner,lneg_to_owner,cell_ldofs_to_owner,cell_to_lposneg,ghost_to_local(cell_range)
+  )
+
+  # Fill owned gids
+  lpos_to_gpos = map(ranks,first_gpos,lpos_to_owner) do rank,first_gpos,lpos_to_owner
+    offset = first_gpos-1
+    lpos_to_gpos = zeros(Int,length(lpos_to_owner))
+    opos = 0
+    for (lpos,owner) in enumerate(lpos_to_owner)
+      if owner == rank
+        opos += 1
+        lpos_to_gpos[lpos] = opos + offset
+      end
+    end
+    lpos_to_gpos
+  end
+
+  lneg_to_gneg = map(ranks,first_gneg,lneg_to_owner) do rank,first_gneg,lneg_to_owner
+    offset = first_gneg-1
+    lneg_to_gneg = zeros(Int,length(lneg_to_owner))
+    oneg = 0
+    for (lneg,owner) in enumerate(lneg_to_owner)
+      if owner == rank
+        oneg += 1
+        lneg_to_gneg[lneg] = oneg + offset
+      end
+    end
+    lneg_to_gneg
+  end
+
+  # Exchange gids
+  cell_to_gposneg = posneg_wise_to_cell_wise!(
+    cell_lids_to_data,lpos_to_gpos,lneg_to_gneg,cell_to_lposneg,own_to_local(cell_range)
+  )
+  fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
+
+  # Fill ghost gids with exchanged information
+  map(
+    cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,partition(cell_range)
+  ) do cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,indices
+    cache = array_cache(cell_to_lposneg)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
+      p = cell_to_gposneg.ptrs[cell]-1
+      lids = getindex!(cache,cell_to_lposneg,cell)
+      cell_owner = lcell_to_owner[cell]
+      for (i,lid) in enumerate(lids)
+        if (lid > 0) && isequal(lpos_to_owner[lid],cell_owner)
+          lpos_to_gpos[lid] = cell_to_gposneg.data[i+p]
+        elseif (lid < 0) && isequal(lneg_to_owner[-lid],cell_owner)
+          lneg_to_gneg[-lid] = cell_to_gposneg.data[i+p]
+        end
+      end
+    end
+  end
+
+  posneg_wise_to_cell_wise!(cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,cell_to_lposneg,own_to_local(cell_range))
+  fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
+  cell_wise_to_posneg_wise!(lpos_to_gpos,lneg_to_gneg,cell_to_gposneg,cell_to_lposneg,ghost_to_local(cell_range))
+
+  # Setup DoFs LocalIndices
+  local_indices_pos = permuted_variable_partition(nopos,lpos_to_gpos,lpos_to_owner;start=first_gpos)
+  local_indices_neg = permuted_variable_partition(noneg,lneg_to_gneg,lneg_to_owner;start=first_gneg)
+
+  return PRange(local_indices_pos), PRange(local_indices_neg)
+end
+
+"""
+    generate_gids_by_color(cell_gids::PRange, cell_to_lids, lid_to_color, ncolors)
+
+Similar to `generate_gids`, but uses a global partition given by `lid_to_color` to generate
+a different set of global ids per color. Returns:
+
+- a tuple with a `PRange` of `PermutedLocalIndices` per color.
+- a mapping `lid_to_clid` that maps local ids to local color ids
+- a mapping `color_to_clid_to_lid` that maps, for each color, local color ids to local ids.
+"""
+function generate_gids_by_color(
+  cell_range::PRange,
+  cell_to_lids::AbstractArray{<:AbstractArray},
+  lid_to_color::AbstractArray,
+  ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
+)
+  cell_lids_to_data = allocate_cell_wise_vector(Int, cell_to_lids)
+  cache_fetch = fetch_vector_ghost_values_cache(cell_lids_to_data,partition(cell_range))
+
+  lid_to_owner, color_to_noids = map(
+    partition(cell_range),cell_to_lids,lid_to_color
+  ) do indices, cell_to_lids, lid_to_color
+    lid_to_owner = fill(zero(Int32),length(lid_to_color))
+    cache = array_cache(cell_to_lids)
+    for (cell, owner) in enumerate(local_to_owner(indices))
+      lids = getindex!(cache,cell_to_lids,cell)
+      for lid in lids
+        (lid < 0) && continue
+        lid_to_owner[lid] = max(owner,lid_to_owner[lid])
+      end
+    end
+
+    rank = part_id(indices)
+    color_to_noids = zeros(Int,ncolors)
+    for (owner, color) in zip(lid_to_owner,lid_to_color)
+      color_to_noids[color] += isequal(owner,rank)
+    end
+
+    return lid_to_owner, color_to_noids
+  end |> tuple_of_arrays
+
+  # Find first gid per color
+  color_to_fgid = map(1:ncolors) do c
+    scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
+  end |> to_parray_of_arrays
+
+  # Start exchanging dof owners
+  cell_lids_to_owner = dof_wise_to_cell_wise!(
+    cell_lids_to_data,lid_to_owner,cell_to_lids,own_to_local(cell_range)
+  )
+  t1 = fetch_vector_ghost_values!(cell_lids_to_owner,cache_fetch)
+
+  # Note: lid_to_owner is still not consistent, but owned data is correct
+  lid_to_gid, lid_to_clid, color_to_clid_to_lid = map(
+    partition(cell_range), lid_to_color, lid_to_owner, color_to_fgid
+  ) do indices, lid_to_color, lid_to_owner, color_to_fgid
+
+    # Count number of dofs per color
+    rank = part_id(indices)
+    color_to_nldofs = zeros(Int,ncolors)
+    lid_to_clid = zeros(Int32,length(lid_to_color))
+    lid_to_gid = zeros(Int,length(lid_to_color))
+    for (lid, (color, owner)) in enumerate(zip(lid_to_color,lid_to_owner))
+      color_to_nldofs[color] += 1
+      lid_to_clid[lid] = color_to_nldofs[color]
+      if isequal(owner,rank)
+        lid_to_gid[lid] = color_to_fgid[color]
+        color_to_fgid[color] += 1
+      end
+    end
+
+    # Find clid to lid mapping
+    color_to_clid_to_lid = [fill(zero(Int32),n) for n in color_to_nldofs]
+    for (lid, (color, clid)) in enumerate(zip(lid_to_color,lid_to_clid))
+      color_to_clid_to_lid[color][clid] = lid
+    end
+    return lid_to_gid, lid_to_clid, color_to_clid_to_lid
+  end |> tuple_of_arrays
+
+  # Finish exchanging the dof owners.
+  wait(t1)
+  cell_wise_to_dof_wise!(
+    lid_to_owner,cell_lids_to_owner,cell_to_lids,ghost_to_local(cell_range)
+  )
+
+  # Start exchanging gids
+  cell_to_gids = dof_wise_to_cell_wise!(
+    cell_lids_to_data,lid_to_gid,cell_to_lids,own_to_local(cell_range)
+  )
+  t2 = fetch_vector_ghost_values!(cell_to_gids,cache_fetch)
+
+  # Finish exchanging the gids.
+  wait(t2)
+  map(
+    cell_to_lids,cell_to_gids,lid_to_gid,lid_to_owner,partition(cell_range)
+  ) do cell_to_lids,cell_to_gids,lid_to_gid,lid_to_owner,indices
+    cache = array_cache(cell_to_lids)
+    lcell_to_owner = local_to_owner(indices)
+    for cell in ghost_to_local(indices)
+      p = cell_to_gids.ptrs[cell]-1
+      lids = getindex!(cache,cell_to_lids,cell)
+      cell_owner = lcell_to_owner[cell]
+      for (i,lid) in enumerate(lids)
+        if (lid > 0) && isequal(lid_to_owner[lid],cell_owner)
+          lid_to_gid[lid] = cell_to_gids.data[i+p]
+        end
+      end
+    end
+  end
+
+  dof_wise_to_cell_wise!(cell_to_gids,lid_to_gid,cell_to_lids,own_to_local(cell_range))
+  fetch_vector_ghost_values!(cell_to_gids,cache_fetch) |> wait
+  cell_wise_to_dof_wise!(lid_to_gid,cell_to_gids,cell_to_lids,ghost_to_local(cell_range))
+
+  # Setup DoFs indices per color
+  color_to_indices = ntuple(ncolors) do c
+    c_noids  = map(Base.Fix2(getindex,c), color_to_noids)
+    c_fgid   = color_to_fgid[c]
+    c_gids   = map(getindex, lid_to_gid,   color_to_clid_to_lid[c])
+    c_owners = map(getindex, lid_to_owner, color_to_clid_to_lid[c])
+    PRange(permuted_variable_partition(c_noids, c_gids, c_owners; start=c_fgid))
+  end
+
+  return color_to_indices, lid_to_clid, color_to_clid_to_lid
+end
+
+"""
+    split_gids_by_color(gids::PRange, lid_to_color::AbstractArray, ncolors) -> NTuple{ncolors,PRange}
+
+Given a set of global ids and a mapping from local ids to colors, this function splits
+the global ids into different sets of global ids per color. Returns a tuple with a `PRange`
+of `PermutedLocalIndices` per color.
+"""
+function split_gids_by_color(
+  gids::PRange, lid_to_color::AbstractArray,
+  ncolors = getany(reduction(max,map(maximum,lid_to_color);destination=:all))
+)
+  color_to_nlids, color_to_noids = map(partition(gids), lid_to_color) do ids, lid_to_color
+    rank = part_id(ids)
+    lid_to_owner = local_to_owner(ids)
+    color_to_nlids = zeros(Int,ncolors)
+    color_to_noids = zeros(Int,ncolors)
+    for (owner,color) in zip(lid_to_owner,lid_to_color)
+      color_to_nlids[color] += 1
+      color_to_noids[color] += isequal(owner,rank)
+    end
+    return color_to_nlids, color_to_noids
+  end |> tuple_of_arrays
+
+  # I wish we could reduce arrays directly...
+  color_to_fgid = map(1:ncolors) do c
+    scan(+,map(Base.Fix2(getindex,c), color_to_noids); type=:exclusive, init=1)
+  end |> to_parray_of_arrays
+
+  lid_to_cgid = map(partition(gids), lid_to_color, color_to_fgid) do ids, lid_to_color, color_to_fgid
+    rank = part_id(ids)
+    lid_to_cgid = zeros(Int,length(lid_to_color))
+    color_to_offset = zeros(Int,ncolors)
+    for (lid,(owner,color)) in enumerate(zip(local_to_owner(ids),lid_to_color))
+      if isequal(owner,rank)
+        lid_to_cgid[lid] = color_to_fgid[color] + color_to_offset[color]
+        color_to_offset[color] += 1
+      end
+    end
+    return lid_to_cgid
+  end
+  consistent!(PVector(lid_to_cgid,partition(gids))) |> wait
+
+  # Pack per-color gid/owner arrays in a single pass over local dofs
+  all_c_gids, all_c_owners = map(
+    lid_to_color, lid_to_cgid, partition(gids), color_to_nlids
+  ) do lid_to_color, lid_to_cgid, ids, color_to_nlids
+    c_gids   = [zeros(Int,   n) for n in color_to_nlids]
+    c_owners = [zeros(Int32, n) for n in color_to_nlids]
+    offsets = zeros(Int, ncolors)
+    for (color, cgid, owner) in zip(lid_to_color, lid_to_cgid, local_to_owner(ids))
+      offsets[color] += 1
+      lid = offsets[color]
+      c_gids[color][lid]   = cgid
+      c_owners[color][lid] = owner
+    end
+    return c_gids, c_owners
+  end |> tuple_of_arrays
+
+  return ntuple(ncolors) do c
+    c_noids  = map(Base.Fix2(getindex,c), color_to_noids)
+    c_fgid   = color_to_fgid[c]
+    c_gids   = map(Base.Fix2(getindex,c), all_c_gids)
+    c_owners = map(Base.Fix2(getindex,c), all_c_owners)
+    PRange(permuted_variable_partition(c_noids, c_gids, c_owners; start=c_fgid))
+  end
+end
+
+# PRange restriction
+
+function restrict_gids(gids::PRange, new_to_old_lid::AbstractArray)
+
+  n_own = map(partition(gids), new_to_old_lid) do ids, n2o_lid
+    rank = part_id(ids)
+    return count(isequal(rank), view(local_to_owner(ids), n2o_lid))
+  end
+
+  # Assign global ids to owned lids
+  first_gid = scan(+,n_own,type=:exclusive,init=one(eltype(n_own)))
+
+  old_lid_to_new_gid = map(first_gid,new_to_old_lid,partition(gids)) do first_gid, n2o_lid, ids
+    old_lid_to_new_gid = zeros(Int,local_length(ids))
+    old_lid_to_owner = local_to_owner(ids)
+    rank = part_id(ids)
+    gid = first_gid
+    for old in n2o_lid
+      if old_lid_to_owner[old] == rank
+        old_lid_to_new_gid[old] = gid
+        gid += 1
+      end
+    end
+    return old_lid_to_new_gid
+  end
+
+  consistent!(PVector(old_lid_to_new_gid,partition(gids))) |> wait
+
+  # Prepare new partition
+  n_gids = reduction(+,n_own,destination=:all,init=zero(eltype(n_own)))
+
+  new_indices = map(
+    n_gids, old_lid_to_new_gid, new_to_old_lid, partition(gids)
+  ) do n_gids, old_lid_to_new_gid, new_to_old_lid, ids
+    lid_to_gid   = old_lid_to_new_gid[new_to_old_lid]
+    lid_to_owner = local_to_owner(ids)[new_to_old_lid]
+    return LocalIndices(n_gids,part_id(ids),lid_to_gid,lid_to_owner)
+  end
+
+  p_snd, p_rcv = assembly_neighbors(partition(gids))
+  assembly_neighbors(new_indices; neighbors=ExchangeGraph(p_snd, p_rcv))
+
+  return PRange(new_indices)
+end
+
+# Multi-field PRange concatenation
+
+"""
+    vcat_gids(f_p_flid_lid, f_frange) -> PRange
+
+Concatenate N per-field PRanges into a single multi-field PRange.
+
+- `f_p_flid_lid`: for each field f, a distributed array mapping single-field local ids to multi-field local ids.
+- `f_frange`: per-field PRanges.
+"""
+function vcat_gids(
+  f_p_flid_lid::AbstractVector{<:AbstractArray{<:AbstractVector}},
+  f_frange::AbstractVector{<:PRange}
+)
+  f_p_fiset = map(local_views,f_frange)
+
+  v(x...) = collect(x)
+  p_f_fiset    = map(v,f_p_fiset...)
+  p_f_flid_lid = map(v,f_p_flid_lid...)
+
+  # Per-part own DOF count and first global ID
+  p_noids    = map(f_fiset->sum(map(own_length,f_fiset)),p_f_fiset)
+  p_firstgid = scan(+,p_noids,type=:exclusive,init=one(eltype(p_noids)))
+
+  # Fill owned gids and owners
+  p_lid_gid, p_lid_part = map(p_f_flid_lid, p_f_fiset, p_firstgid) do f_flid_lid, f_fiset, firstgid
+    nlids    = sum(length, f_flid_lid)
+    lid_gid  = zeros(Int,   nlids)
+    lid_part = zeros(Int32, nlids)
+    gid = firstgid
+    for (flid_lid, fiset) in zip(f_flid_lid, f_fiset)
+      part = part_id(fiset)
+      for flid in own_to_local(fiset)
+        lid_gid[flid_lid[flid]]  = gid
+        lid_part[flid_lid[flid]] = part
+        gid += 1
+      end
+    end
+    return lid_gid, lid_part
+  end |> tuple_of_arrays
+
+  # Propagate gid and owner to ghost DOFs via each field's communicator
+  f_aux_gids = map(frange->PVector{Vector{eltype(eltype(p_lid_gid))}}(undef,partition(frange)),f_frange)
+  f_aux_part = map(frange->PVector{Vector{eltype(eltype(p_lid_part))}}(undef,partition(frange)),f_frange)
+  _vcat_propagate_ghost!(p_lid_gid,f_aux_gids,f_p_flid_lid,f_p_fiset)
+  _vcat_propagate_ghost!(p_lid_part,f_aux_part,f_p_flid_lid,f_p_fiset)
+
+  # Build permuted partition
+  p_iset = map(p_lid_gid,p_lid_part,p_noids,p_firstgid) do lid_to_gid, lid_to_owner, nodofs, firstgid
+    permuted_variable_partition(nodofs,lid_to_gid,lid_to_owner;start=firstgid)
+  end
+
+  # Merge assembly neighbors from all fields
+  f_p_parts_snd, f_p_parts_rcv = map(assembly_neighbors ∘ partition, f_frange) |> tuple_of_arrays
+  merge_neigs(f_neigs...) = sort(unique(vcat(f_neigs...)))
+  p_neigs_snd = map(merge_neigs,f_p_parts_snd...)
+  p_neigs_rcv = map(merge_neigs,f_p_parts_rcv...)
+
+  exchange_graph = ExchangeGraph(p_neigs_snd,p_neigs_rcv)
+  assembly_neighbors(p_iset; neighbors=exchange_graph)
+
+  return PRange(p_iset)
+end
+
+function _vcat_propagate_ghost!(
+  p_lid_gid, f_gids, f_p_flid_lid, f_p_fiset
+)
+  for (gids, p_flid_lid, p_fiset) in zip(f_gids, f_p_flid_lid, f_p_fiset)
+    p_flid_gid = local_views(gids)
+    map(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
+      for flid in own_to_local(fiset)
+        flid_gid[flid] = lid_gid[flid_lid[flid]]
+      end
+    end
+    cache = fetch_vector_ghost_values_cache(partition(gids), p_fiset)
+    fetch_vector_ghost_values!(partition(gids), cache) |> wait
+    map(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
+      for flid in ghost_to_local(fiset)
+        lid_gid[flid_lid[flid]] = flid_gid[flid]
+      end
+    end
+  end
+end

From beffe9df5a6d2062ee372aa2b52bcdebc19184cd Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Thu, 14 May 2026 12:31:05 +1000
Subject: [PATCH 30/48] Minor

---
 src/Partitions.jl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/Partitions.jl b/src/Partitions.jl
index 9698ae61..8e3aebd8 100644
--- a/src/Partitions.jl
+++ b/src/Partitions.jl
@@ -597,9 +597,7 @@ function vcat_gids(
   _vcat_propagate_ghost!(p_lid_part,f_aux_part,f_p_flid_lid,f_p_fiset)
 
   # Build permuted partition
-  p_iset = map(p_lid_gid,p_lid_part,p_noids,p_firstgid) do lid_to_gid, lid_to_owner, nodofs, firstgid
-    permuted_variable_partition(nodofs,lid_to_gid,lid_to_owner;start=firstgid)
-  end
+  p_iset = permuted_variable_partition(p_noids,p_lid_gid,p_lid_part;start=p_firstgid)
 
   # Merge assembly neighbors from all fields
   f_p_parts_snd, f_p_parts_rcv = map(assembly_neighbors ∘ partition, f_frange) |> tuple_of_arrays

From 74040bb245d6dfcef251ce568f9a881c8f64bb14 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Thu, 14 May 2026 15:49:33 +1000
Subject: [PATCH 31/48] Bugfix in remove_ghost

---
 src/MultiField.jl    |  2 +-
 src/PArraysExtras.jl | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/MultiField.jl b/src/MultiField.jl
index b2ee9126..00400854 100644
--- a/src/MultiField.jl
+++ b/src/MultiField.jl
@@ -361,7 +361,7 @@ function FESpaces.SparseMatrixAssembler(
   local_vec_type,
   trial::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle},
   test::DistributedMultiFieldFESpace{<:BlockMultiFieldStyle},
-  par_strategy=Assembled()
+  par_strategy::FESpaces.AssemblyStrategy=Assembled()
 )
   NBr, SBr, Pr = MultiField.get_block_parameters(MultiFieldStyle(test))
   NBc, SBc, Pc = MultiField.get_block_parameters(MultiFieldStyle(trial))
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index b30aaf2a..bb2b23aa 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -154,11 +154,16 @@ function _filter_ghost(indices,gids,owners)
   return new_ghost_to_global, new_ghost_to_owner
 end
 
+# function PArrays.remove_ghost(indices::PermutedLocalIndices)
+#   n_global = global_length(indices) 
+#   own = OwnIndices(n_global,part_id(indices),own_to_global(indices))
+#   ghost = GhostIndices(n_global,Int[],Int32[])
+#   OwnAndGhostIndices(own,ghost)
+# end
+
 function PArrays.remove_ghost(indices::PermutedLocalIndices)
-  n_global = global_length(indices) 
-  own = OwnIndices(n_global,part_id(indices),own_to_global(indices))
-  ghost = GhostIndices(n_global,Int[],Int32[])
-  OwnAndGhostIndices(own,ghost)
+  @check indices.perm[own_to_local(indices)] == 1:own_length(indices)
+  return remove_ghost(unpermute(indices))
 end
 
 # This function computes a mapping among the local identifiers of a and b

From 718a94da3f5dadd126a850aab83572272e3f0d7c Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 09:17:50 +1000
Subject: [PATCH 32/48] Added back block assembly

---
 src/Assembly.jl                     | 90 ++++++++++++++++++++++++--
 src/PArraysExtras.jl                |  9 +++
 test/BlockPartitionedArraysTests.jl | 99 -----------------------------
 3 files changed, 95 insertions(+), 103 deletions(-)

diff --git a/src/Assembly.jl b/src/Assembly.jl
index 54103baf..c2f020a1 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -234,7 +234,7 @@ end
 function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled,<:AbstractVector})
   # This point MUST NEVER be reached. If reached there is an inconsistency
   # in the parallel code in charge of vector assembly
-  @assert false
+  @unreachable
 end
 
 function Algebra.create_from_nz(a::PVectorAllocation{<:Union{LocallyAssembled,SubAssembled}})
@@ -246,7 +246,7 @@ function Algebra.create_from_nz(a::PVectorAllocation{<:Assembled})
   rows = collect_touched_ids(a)
   b = rhs_callback(a, rows)
   t2 = assemble!(b)
-  t2 !== nothing && wait(t2)
+  !isnothing(t2) && wait(t2)
   return b
 end
 
@@ -304,7 +304,7 @@ function create_from_nz_locally_assembled(
   map(map_local_to_global!,I,test_ids)
   map(map_local_to_global!,J,trial_ids)
 
-  cols = filter_and_replace_ghost(map(unpermute,trial_ids),J)
+  cols = map(unpermute,trial_ids)
 
   map(map_global_to_local!,I,rows)
   map(map_global_to_local!,J,cols)
@@ -332,7 +332,7 @@ function create_from_nz_assembled(
 
   # Overlapped COO communication and vector assembly
   rows = filter_and_replace_ghost(map(unpermute,test_ids),I)
-  t = PartitionedArrays.assemble_coo!(I,J,V,rows)
+  t = PArrays.assemble_coo!(I,J,V,rows)
   b = callback(rows)
   wait(t)
 
@@ -367,3 +367,85 @@ function create_from_nz_subassembled(
 
   return A, b
 end
+
+# Block matrix assembly (Assembled strategy)
+#
+# Gridap's generic create_from_nz(a::ArrayBlock) calls create_from_nz per block
+# independently, producing different row/col PRanges per block → mortar check fails.
+# These dispatches intercept the block case and compute SHARED PRanges by unioning
+# the nonzero indices across all blocks in each block-row and block-col.
+
+function rhs_callback(b::VectorBlock{<:PVectorAllocation{<:Assembled}}, rows_da)
+  b_vecs = [rhs_callback(b.array[i], rows_da[i]) for i in 1:length(rows_da)]
+  mortar(b_vecs)
+end
+
+function Algebra.create_from_nz(a::MatrixBlock{<:PSparseMatrixAllocation{<:Assembled}})
+  A, = create_from_nz_block_assembled(a)
+  return A
+end
+
+function Algebra.create_from_nz(
+  a::MatrixBlock{<:PSparseMatrixAllocation{<:Assembled}},
+  b::VectorBlock{<:PVectorAllocation{<:Assembled}}
+)
+  callback = Base.Fix1(rhs_callback,b)
+  async_callback = assemble!
+  A, B = create_from_nz_block_assembled(a, callback, async_callback)
+  return A, B
+end
+
+function create_from_nz_block_assembled(
+  a              :: MatrixBlock{<:PSparseMatrixAllocation{<:Assembled}},
+  callback       :: Function = rows_da -> nothing,
+  async_callback :: Function = b_vecs  -> [empty_async_task]
+)
+  NBr, NBc  = size(a.array)
+  block_ids = CartesianIndices(a.array)
+
+  I, J, V = map(get_allocations,a.array) |> tuple_of_arrays
+
+  test_ids  = [partition(axes(a.array[i,i],1)) for i in 1:NBr]
+  trial_ids = [partition(axes(a.array[j,j],2)) for j in 1:NBc]
+
+  for i in 1:NBr, j in 1:NBc
+    map(map_local_to_global!,I[i,j],test_ids[i])
+    map(map_local_to_global!,J[i,j],trial_ids[j])
+  end
+
+  # Shared row partition per block-row: union I[i,:] across all columns
+  rows_da = [filter_and_replace_ghost(I[i,:],test_ids[i]) for i in 1:NBr]
+
+  # COO communication overlapped with callback (e.g. RHS local repartition)
+  ts = [
+    PArrays.assemble_coo!(I[i,j],J[i,j],V[i,j],rows_da[i]) for i in 1:NBr, j in 1:NBc
+  ]
+  b = callback(rows_da)
+  foreach(wait,ts)
+
+  # Kick off async RHS communication, overlapped with column partition computation
+  t2s = async_callback(b)
+
+  # Shared col partition per block-col: union J[:,j] across all rows
+  cols_da = [filter_and_replace_ghost(J[:,j],trial_ids[j]) for j in 1:NBc]
+
+  foreach(wait,t2s)
+
+  for i in 1:NBr, j in 1:NBc
+    map(map_global_to_local!,I[i,j],rows_da[i])
+    map(map_global_to_local!,J[i,j],cols_da[j])
+  end
+
+  # Build each block's PSparseMatrix using the shared row/col partitions.
+  # All blocks in block-row i store the same rows_da[i] object, so
+  # partition(axes(A,1)) === rows_da[i] for all j → mortar === check passes.
+  assembled = true
+  block_mats = map(block_ids) do idx
+    i,j = idx[1],idx[2]
+    a_sys  = change_axes(a.array[idx],(PRange(rows_da[i]),PRange(cols_da[j])))
+    values = map(create_from_nz,local_views(a_sys))
+    PSparseMatrix(values,rows_da[i],cols_da[j],assembled)
+  end
+
+  return mortar(block_mats), b
+end
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index bb2b23aa..539204f6 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -117,6 +117,15 @@ function filter_and_replace_ghost(indices,gids)
   return new_indices
 end
 
+# Block variant: union a slice of distributed gid arrays (e.g. I[i,:] or J[:,j])
+# before calling the standard filter_and_replace_ghost.
+function filter_and_replace_ghost(gids_block::Vector{<:AbstractArray}, ids)
+  union_gids = map(to_parray_of_arrays(gids_block)) do chunks
+    vcat(chunks...)
+  end
+  filter_and_replace_ghost(map(unpermute,ids), union_gids)
+end
+
 # Same as PartitionedArrays.filter_ghost, but we do not exclude ghost indices that 
 # belong to `indices`. This could eventually be a flag in the original function.
 function _filter_ghost(indices,gids,owners)
diff --git a/test/BlockPartitionedArraysTests.jl b/test/BlockPartitionedArraysTests.jl
index 6afd0969..826b1195 100644
--- a/test/BlockPartitionedArraysTests.jl
+++ b/test/BlockPartitionedArraysTests.jl
@@ -77,103 +77,4 @@ function main(distribute, parts)
   @test norm(v4) ≈ 6.0 * sqrt(2*n_global)
 end
 
-indices = map(ranks) do r
-  if r == 1
-    own_gids = [1,2,3,4,5]
-    ghost_gids   = [6,7]
-    ghost_owners = [2,2]
-  else
-    own_gids = [6,7,8,9,10]
-    ghost_gids   = [5]
-    ghost_owners = [1]
-  end
-  own_idx   = OwnIndices(10,r,own_gids)
-  ghost_idx = GhostIndices(10,ghost_gids,ghost_owners)
-  OwnAndGhostIndices(own_idx,ghost_idx)
-end
-
-block_range = BlockPRange([PRange(indices),PRange(indices)])
-
-_v = PVector{OwnAndGhostVectors{Vector{Float64}}}(undef,indices)
-v = BlockPArray([_v,_v],(block_range,))
-fill!(v,1.0)
-
-_m = map(CartesianIndices((2,2))) do I
-  i,j = I[1],I[2]
-  local_mats = map(ranks,indices) do r, ind
-    n = local_length(ind)
-    if i==j && r == 1
-      SparseMatrixCSC(n,n,Int[1,3,5,7,9,10,11,13],Int[1,2,2,3,3,4,4,5,5,6,6,7],fill(1.0,12))
-    elseif i==j && r == 2
-      SparseMatrixCSC(n,n,Int[1,2,4,6,8,10,11],Int[1,1,2,2,3,3,4,4,5,6],fill(1.0,10))
-    else
-      SparseMatrixCSC(n,n,fill(Int(1),n+1),Int[],Float64.([]))
-    end
-  end
-  PSparseMatrix(local_mats,indices,indices)
-end;
-m = BlockPArray(_m,(block_range,block_range))
-
-x = similar(_v)
-mul!(x,_m[1,1],_v)
-
-# BlockPRange
-
-@test blocklength(block_range) == 2
-@test blocksize(block_range) == (2,)
-
-# AbstractArray API
-
-__v = similar(v,block_range)
-__m = similar(m,(block_range,block_range))
-fill!(__v,1.0)
-
-__v = __v .+ 1.0
-__v = __v .- 1.0
-__v = __v .* 1.0
-__v = __v ./ 1.0
-
-# PartitionedArrays API
-
-consistent!(__v) |> wait
-t = assemble!(__v)
-assemble!(__m) |> wait
-fetch(t);
-
-PartitionedArrays.centralize(m)
-
-partition(v)
-partition(m)
-local_values(v)
-own_values(v)
-ghost_values(v)
-own_ghost_values(m)
-ghost_own_values(m)
-
-# LinearAlgebra API
-fill!(v,1.0)
-x = similar(v)
-mul!(x,m,v)
-consistent!(x) |> wait
-
-@test dot(v,x) ≈ 36
-norm(v)
-copy!(x,v)
-
-LinearAlgebra.fillstored!(__m,1.0)
-
-__v = BlockPVector{Vector{Float64}}(undef,block_range)
-#__m = BlockPMatrix{SparseMatrixCSC{Float64,Int64}}(undef,block_range,block_range)
-
-maximum(abs,v)
-minimum(abs,v)
-
-# GridapDistributed API
-v_parts = local_views(v)
-m_parts = local_views(m)
-
-v_parts = local_views(v,block_range)
-m_parts = local_views(m,block_range,block_range)
-
-
 end
\ No newline at end of file

From 9998a2ebf9901d62b681385d7d4038316a68a4be Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 10:54:15 +1000
Subject: [PATCH 33/48] Changed Base.map to Base.foreach whenever possible

---
 src/Adaptivity.jl             |    2 +-
 src/Assembly.jl               |   24 +-
 src/BlockPartitionedArrays.jl |   12 +-
 src/CellData.jl               |    6 +-
 src/FESpaces.jl               |   14 +-
 src/Geometry.jl               |    2 +-
 src/PArraysExtras.jl          |   30 +-
 src/Partitions.jl             |   16 +-
 src/Pullbacks.jl              |    6 +-
 src/_Algebra_old.jl           | 1238 ---------------------------------
 10 files changed, 48 insertions(+), 1302 deletions(-)
 delete mode 100644 src/_Algebra_old.jl

diff --git a/src/Adaptivity.jl b/src/Adaptivity.jl
index 05d285ab..b2a37046 100644
--- a/src/Adaptivity.jl
+++ b/src/Adaptivity.jl
@@ -760,7 +760,7 @@ function Adaptivity.coarsen(fmodel::DistributedDiscreteModel, ptopo::Distributed
 
   # First, some preliminary checks
   fgids = partition(get_cell_gids(fmodel))
-  map(local_views(ptopo), fgids) do ptopo, fids
+  foreach(local_views(ptopo), fgids) do ptopo, fids
     patch_cells = Geometry.get_patch_cells(ptopo)
     lcell_to_ocell = local_to_own(fids)
     ocell_to_lcell = own_to_local(fids)
diff --git a/src/Assembly.jl b/src/Assembly.jl
index c2f020a1..0feeb692 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -301,13 +301,13 @@ function create_from_nz_locally_assembled(
   rows = map(remove_ghost,map(unpermute,test_ids))
   b = callback(rows)
 
-  map(map_local_to_global!,I,test_ids)
-  map(map_local_to_global!,J,trial_ids)
+  foreach(map_local_to_global!,I,test_ids)
+  foreach(map_local_to_global!,J,trial_ids)
 
   cols = map(unpermute,trial_ids)
 
-  map(map_global_to_local!,I,rows)
-  map(map_global_to_local!,J,cols)
+  foreach(map_global_to_local!,I,rows)
+  foreach(map_global_to_local!,J,cols)
 
   assembled = true
   a_sys = change_axes(a,(PRange(rows),PRange(cols)))
@@ -327,8 +327,8 @@ function create_from_nz_assembled(
   trial_ids = partition(axes(a,2))
 
   # convert I and J to global dof ids
-  map(map_local_to_global!,I,test_ids)
-  map(map_local_to_global!,J,trial_ids)
+  foreach(map_local_to_global!,I,test_ids)
+  foreach(map_local_to_global!,J,trial_ids)
 
   # Overlapped COO communication and vector assembly
   rows = filter_and_replace_ghost(map(unpermute,test_ids),I)
@@ -340,8 +340,8 @@ function create_from_nz_assembled(
   t2 = async_callback(b)
   cols = filter_and_replace_ghost(map(unpermute,trial_ids),J)
 
-  map(map_global_to_local!,I,rows)
-  map(map_global_to_local!,J,cols)
+  foreach(map_global_to_local!,I,rows)
+  foreach(map_global_to_local!,J,cols)
 
   assembled = true
   a_sys = change_axes(a,(PRange(rows),PRange(cols)))
@@ -409,8 +409,8 @@ function create_from_nz_block_assembled(
   trial_ids = [partition(axes(a.array[j,j],2)) for j in 1:NBc]
 
   for i in 1:NBr, j in 1:NBc
-    map(map_local_to_global!,I[i,j],test_ids[i])
-    map(map_local_to_global!,J[i,j],trial_ids[j])
+    foreach(map_local_to_global!,I[i,j],test_ids[i])
+    foreach(map_local_to_global!,J[i,j],trial_ids[j])
   end
 
   # Shared row partition per block-row: union I[i,:] across all columns
@@ -432,8 +432,8 @@ function create_from_nz_block_assembled(
   foreach(wait,t2s)
 
   for i in 1:NBr, j in 1:NBc
-    map(map_global_to_local!,I[i,j],rows_da[i])
-    map(map_global_to_local!,J[i,j],cols_da[j])
+    foreach(map_global_to_local!,I[i,j],rows_da[i])
+    foreach(map_global_to_local!,J[i,j],cols_da[j])
   end
 
   # Build each block's PSparseMatrix using the shared row/col partitions.
diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl
index d754f5ae..6b5cbaff 100644
--- a/src/BlockPartitionedArrays.jl
+++ b/src/BlockPartitionedArrays.jl
@@ -179,7 +179,7 @@ function Base.copyto!(y::BlockPMatrix,x::BlockPMatrix)
 end
 
 function Base.fill!(a::BlockPVector,v)
-  map(blocks(a)) do a
+  foreach(blocks(a)) do a
     fill!(a,v)
   end
   return a
@@ -216,14 +216,14 @@ function Base.all(f::Function,x::BlockPVector)
 end
 
 function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
-  map(blocks(x),blocks(y)) do x,y
+  foreach(blocks(x),blocks(y)) do x,y
     LinearAlgebra.axpy!(α,x,y)
   end
   return y
 end
 
 function LinearAlgebra.rmul!(a::BlockPVector,v::Number)
-  map(ai->rmul!(ai,v),blocks(a))
+  foreach(ai->rmul!(ai,v),blocks(a))
   return a
 end
 
@@ -372,7 +372,7 @@ function LinearAlgebra.norm(v::BlockPVector,p::Real=2)
 end
 
 function LinearAlgebra.fillstored!(a::BlockPMatrix,v)
-  map(blocks(a)) do a
+  foreach(blocks(a)) do a
     LinearAlgebra.fillstored!(a,v)
   end
   return a
@@ -382,7 +382,7 @@ function Algebra.axpy_entries!(
   α::Number, A::BlockPMatrix, B::BlockPMatrix;
   check::Bool=true
 )
-  map(blocks(A),blocks(B)) do A, B
+  foreach(blocks(A),blocks(B)) do A, B
     Algebra.axpy_entries!(α,A,B;check)
   end
   return B
@@ -440,6 +440,6 @@ function Base.materialize(b::BlockPBroadcasted)
 end
 
 function Base.materialize!(a::BlockPArray,b::BlockPBroadcasted)
-  map(Base.materialize!,blocks(a),blocks(b))
+  foreach(Base.materialize!,blocks(a),blocks(b))
   return a
 end
diff --git a/src/CellData.jl b/src/CellData.jl
index 0c82a000..1115d806 100644
--- a/src/CellData.jl
+++ b/src/CellData.jl
@@ -502,9 +502,11 @@ function DiracDelta{0}(model::DistributedDiscreteModel;tags)
   degree = 0
   DiracDelta{0}(model,degree;tags=tags)
 end
+
 function (d::DistributedDiracDelta)(f)
- evaluate(d,f)
+  evaluate(d,f)
 end
+
 function Gridap.Arrays.evaluate!(cache,d::DistributedDiracDelta,f)
- ∫(f)*d.dΓ
+  ∫(f)*d.dΓ
 end
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index f403324c..73e980f9 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -279,7 +279,7 @@ function FESpaces.interpolate!(
   free_values::AbstractVector,
   f::DistributedSingleFieldFESpace, 
   isconsistent=false)
-  map(f.spaces,local_views(free_values)) do V,vec
+  foreach(f.spaces,local_views(free_values)) do V,vec
     interpolate!(u,vec,V)
   end
   FEFunction(f,free_values,isconsistent)
@@ -288,9 +288,9 @@ end
 function FESpaces.interpolate!(
   u::DistributedCellField,
   free_values::AbstractVector,
-  f::DistributedSingleFieldFESpace, 
+  f::DistributedSingleFieldFESpace,
   isconsistent=false)
-  map(local_views(u),f.spaces,local_views(free_values)) do u, V,vec
+  foreach(local_views(u),f.spaces,local_views(free_values)) do u, V,vec
     interpolate!(u,vec,V)
   end
   FEFunction(f,free_values,isconsistent)
@@ -306,7 +306,7 @@ function FESpaces.interpolate_dirichlet!(
   u, free_values::AbstractVector,
   dirichlet_values::AbstractArray{<:AbstractVector},
   f::DistributedSingleFieldFESpace, isconsistent=false)
-  map(f.spaces,local_views(free_values),dirichlet_values) do V,fvec,dvec
+  foreach(f.spaces,local_views(free_values),dirichlet_values) do V,fvec,dvec
     interpolate_dirichlet!(u,fvec,dvec,V)
   end
   FEFunction(f,free_values,dirichlet_values,isconsistent)
@@ -316,7 +316,7 @@ function FESpaces.interpolate_dirichlet!(
   u::DistributedCellField, free_values::AbstractVector,
   dirichlet_values::AbstractArray{<:AbstractVector},
   f::DistributedSingleFieldFESpace, isconsistent=false)
-  map(local_views(u), f.spaces,local_views(free_values),dirichlet_values) do u,V,fvec,dvec
+  foreach(local_views(u), f.spaces,local_views(free_values),dirichlet_values) do u,V,fvec,dvec
     interpolate_dirichlet!(u,fvec,dvec,V)
   end
   FEFunction(f,free_values,dirichlet_values,isconsistent)
@@ -332,7 +332,7 @@ function FESpaces.interpolate_everywhere!(
   u, free_values::AbstractVector,
   dirichlet_values::AbstractArray{<:AbstractVector},
   f::DistributedSingleFieldFESpace, isconsistent=false)
-  map(f.spaces,local_views(free_values),dirichlet_values) do V,fvec,dvec
+  foreach(f.spaces,local_views(free_values),dirichlet_values) do V,fvec,dvec
     interpolate_everywhere!(u,fvec,dvec,V)
   end
   FEFunction(f,free_values,dirichlet_values,isconsistent)
@@ -342,7 +342,7 @@ function FESpaces.interpolate_everywhere!(
   u::DistributedCellField, free_values::AbstractVector,
   dirichlet_values::AbstractArray{<:AbstractVector},
   f::DistributedSingleFieldFESpace, isconsistent=false)
-  map(local_views(u),f.spaces,local_views(free_values),dirichlet_values) do ui,V,fvec,dvec
+  foreach(local_views(u),f.spaces,local_views(free_values),dirichlet_values) do ui,V,fvec,dvec
     interpolate_everywhere!(ui,fvec,dvec,V)
   end
   FEFunction(f,free_values,dirichlet_values,isconsistent)
diff --git a/src/Geometry.jl b/src/Geometry.jl
index 7c49af11..1d6ec322 100644
--- a/src/Geometry.jl
+++ b/src/Geometry.jl
@@ -122,7 +122,7 @@ function _setup_consistent_faces!(topo::DistributedGridTopology, dimfrom::Intege
     JaggedArray(lfrom_to_lto.data, lfrom_to_lto.ptrs)
   end
   wait(consistent!(PVector(lfrom_to_gto, gids_from)))
-  map(lfrom_to_gto, gids_to) do lfrom_to_gto, gids_to
+  foreach(lfrom_to_gto, gids_to) do lfrom_to_gto, gids_to
     to_local!(lfrom_to_gto.data, gids_to)
   end
   return nothing
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 539204f6..7711d612 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -215,7 +215,7 @@ const empty_async_task = PartitionedArrays.FakeTask(() -> nothing)
 
 function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
   @check matching_local_indices(partition(axes(x,1)),partition(axes(y,1)))
-  map(partition(x),partition(y)) do x,y
+  foreach(partition(x),partition(y)) do x,y
     LinearAlgebra.axpy!(α,x,y)
   end
   consistent!(y) |> wait
@@ -232,7 +232,7 @@ function Algebra.axpy_entries!(
 # same (i.e can't use ===). Checking the index partitions would then be costly...
   @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
   @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
-  map(partition(A),partition(B)) do A, B
+  foreach(partition(A),partition(B)) do A, B
     Algebra.axpy_entries!(α,A,B;check)
   end
   return B
@@ -262,31 +262,13 @@ end
 # To local/to global for blocks 
 
 function to_local_indices!(I,ids::PRange;kwargs...)
-  map(to_local!,I,partition(ids))
+  foreach(to_local!,I,partition(ids))
+  return I
 end
 
 function to_global_indices!(I,ids::PRange;kwargs...)
-  map(to_global!,I,partition(ids))
-end
-for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners]
-  @eval begin
-    function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...)
-      map($f,I,ids)
-    end
-
-    function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows)
-      @check ax ∈ [:rows,:cols]
-      block_ids = CartesianIndices(I)
-      map(block_ids) do id
-        i = id[1]; j = id[2];
-        if ax == :rows
-          $f(I[i,j],ids[i])
-        else
-          $f(I[i,j],ids[j])
-        end
-      end
-    end
-  end
+  foreach(to_global!,I,partition(ids))
+  return I
 end
 
 # This type is required because MPIArray from PArrays 
diff --git a/src/Partitions.jl b/src/Partitions.jl
index 8e3aebd8..a0e541f1 100644
--- a/src/Partitions.jl
+++ b/src/Partitions.jl
@@ -2,7 +2,7 @@
 # Cell-wise communication helpers
 
 function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids)
-  map(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids
+  foreach(cell_wise_vector,dof_wise_vector,cell_to_ldofs,cell_ids) do cwv,dwv,cell_to_ldofs,cell_ids
     cache  = array_cache(cell_to_ldofs)
     for cell in cell_ids
       ldofs = getindex!(cache,cell_to_ldofs,cell)
@@ -18,7 +18,7 @@ function dof_wise_to_cell_wise!(cell_wise_vector,dof_wise_vector,cell_to_ldofs,c
 end
 
 function posneg_wise_to_cell_wise!(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids)
-  map(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids) do cwv,pwv,nwv,cell_to_posneg,cell_ids
+  foreach(cell_wise_vector,pos_wise_vector,neg_wise_vector,cell_to_posneg,cell_ids) do cwv,pwv,nwv,cell_to_posneg,cell_ids
     cache  = array_cache(cell_to_posneg)
     for cell in cell_ids
       lids = getindex!(cache,cell_to_posneg,cell)
@@ -36,7 +36,7 @@ function posneg_wise_to_cell_wise!(cell_wise_vector,pos_wise_vector,neg_wise_vec
 end
 
 function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids)
-  map(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids
+  foreach(dof_wise_vector,cell_wise_vector,cell_to_ldofs,cell_ids) do dwv,cwv,cell_to_ldofs,cell_ids
     cache = array_cache(cell_to_ldofs)
     for cell in cell_ids
       ldofs = getindex!(cache,cell_to_ldofs,cell)
@@ -52,7 +52,7 @@ function cell_wise_to_dof_wise!(dof_wise_vector,cell_wise_vector,cell_to_ldofs,c
 end
 
 function cell_wise_to_posneg_wise!(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids)
-  map(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids) do pwv,nwv,cwv,cell_to_posneg,cell_ids
+  foreach(pos_wise_vector,neg_wise_vector,cell_wise_vector,cell_to_posneg,cell_ids) do pwv,nwv,cwv,cell_to_posneg,cell_ids
     cache = array_cache(cell_to_posneg)
     for cell in cell_ids
       lids = getindex!(cache,cell_to_posneg,cell)
@@ -165,7 +165,7 @@ function generate_gids(
   fetch_vector_ghost_values!(cell_to_gdofs,cache_fetch) |> wait
 
   # Fill ghost gids with exchanged information
-  map(
+  foreach(
     cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,partition(cell_range)
   ) do cell_to_ldofs,cell_to_gdofs,ldof_to_gdof,ldof_to_owner,indices
     cache = array_cache(cell_to_ldofs)
@@ -281,7 +281,7 @@ function generate_posneg_gids(
   fetch_vector_ghost_values!(cell_to_gposneg,cache_fetch) |> wait
 
   # Fill ghost gids with exchanged information
-  map(
+  foreach(
     cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,partition(cell_range)
   ) do cell_to_lposneg,cell_to_gposneg,lpos_to_gpos,lneg_to_gneg,lpos_to_owner,lneg_to_owner,indices
     cache = array_cache(cell_to_lposneg)
@@ -616,14 +616,14 @@ function _vcat_propagate_ghost!(
 )
   for (gids, p_flid_lid, p_fiset) in zip(f_gids, f_p_flid_lid, f_p_fiset)
     p_flid_gid = local_views(gids)
-    map(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
+    foreach(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
       for flid in own_to_local(fiset)
         flid_gid[flid] = lid_gid[flid_lid[flid]]
       end
     end
     cache = fetch_vector_ghost_values_cache(partition(gids), p_fiset)
     fetch_vector_ghost_values!(partition(gids), cache) |> wait
-    map(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
+    foreach(p_flid_gid, p_flid_lid, p_lid_gid, p_fiset) do flid_gid, flid_lid, lid_gid, fiset
       for flid in ghost_to_local(fiset)
         lid_gid[flid_lid[flid]] = flid_gid[flid]
       end
diff --git a/src/Pullbacks.jl b/src/Pullbacks.jl
index 420f89fe..eded2413 100644
--- a/src/Pullbacks.jl
+++ b/src/Pullbacks.jl
@@ -127,7 +127,7 @@ function FESpaces.compute_facet_owners(model::DistributedDiscreteModel)
   end
 
   # Map local owners to global ids
-  map(facet_to_owner, cell_ids) do facet_to_owner, cell_ids
+  foreach(facet_to_owner, cell_ids) do facet_to_owner, cell_ids
     lid_to_gid = local_to_global(cell_ids)
     for f in eachindex(facet_to_owner)
       lid = facet_to_owner[f]
@@ -138,9 +138,9 @@ function FESpaces.compute_facet_owners(model::DistributedDiscreteModel)
   # Communicate true owners across processes
   wait(consistent!(PVector(facet_to_owner, facet_ids)))
 
-  # Non-local owners will be set to zero, which 
+  # Non-local owners will be set to zero, which
   # will trigger a sign flip (which is the correct behaviour)
-  map(facet_to_owner, cell_ids) do facet_to_owner, cell_ids
+  foreach(facet_to_owner, cell_ids) do facet_to_owner, cell_ids
     gid_to_lid = global_to_local(cell_ids)
     for f in eachindex(facet_to_owner)
       gid = facet_to_owner[f]
diff --git a/src/_Algebra_old.jl b/src/_Algebra_old.jl
deleted file mode 100644
index e5aefd6a..00000000
--- a/src/_Algebra_old.jl
+++ /dev/null
@@ -1,1238 +0,0 @@
-
-# Vector allocation 
-
-function Algebra.allocate_vector(::Type{<:PVector{V}},ids::PRange) where {V}
-  PVector{V}(undef,partition(ids))
-end
-
-function Algebra.allocate_vector(::Type{<:BlockPVector{V}},ids::BlockPRange) where {V}
-  BlockPVector{V}(undef,ids)
-end
-
-function Algebra.allocate_in_range(matrix::PSparseMatrix)
-  V = Vector{eltype(matrix)}
-  allocate_in_range(PVector{V},matrix)
-end
-
-function Algebra.allocate_in_domain(matrix::PSparseMatrix)
-  V = Vector{eltype(matrix)}
-  allocate_in_domain(PVector{V},matrix)
-end
-
-function Algebra.allocate_in_range(matrix::BlockPMatrix)
-  V = Vector{eltype(matrix)}
-  allocate_in_range(BlockPVector{V},matrix)
-end
-
-function Algebra.allocate_in_domain(matrix::BlockPMatrix)
-  V = Vector{eltype(matrix)}
-  allocate_in_domain(BlockPVector{V},matrix)
-end
-
-# PartitionedArrays extras
-
-function LinearAlgebra.axpy!(α,x::PVector,y::PVector)
-  @check partition(axes(x,1)) === partition(axes(y,1))
-  map(partition(x),partition(y)) do x,y
-    LinearAlgebra.axpy!(α,x,y)
-  end
-  consistent!(y) |> wait
-  return y
-end
-
-function LinearAlgebra.axpy!(α,x::BlockPVector,y::BlockPVector)
-  map(blocks(x),blocks(y)) do x,y
-    LinearAlgebra.axpy!(α,x,y)
-  end
-  return y
-end
-
-function Algebra.axpy_entries!(
-  α::Number, A::PSparseMatrix, B::PSparseMatrix;
-  check::Bool=true
-)
-# We should definitely check here that the index partitions are the same. 
-# However: Because the different matrices are assembled separately, the objects are not the 
-# same (i.e can't use ===). Checking the index partitions would then be costly...
-  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,1)),partition(axes(B,1))))
-  @assert reduce(&,map(PArrays.matching_local_indices,partition(axes(A,2)),partition(axes(B,2))))
-  map(partition(A),partition(B)) do A, B
-    Algebra.axpy_entries!(α,A,B;check)
-  end
-  return B
-end
-
-function Algebra.axpy_entries!(
-  α::Number, A::BlockPMatrix, B::BlockPMatrix;
-  check::Bool=true
-)
-  map(blocks(A),blocks(B)) do A, B
-    Algebra.axpy_entries!(α,A,B;check)
-  end
-  return B
-end
-
-# TODO: Can we remove this?
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.ArrayCounter,axes)
-  @notimplemented
-end
-
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.CounterCOO{T,A}, axes::A) where {T,A}
-  b = Algebra.CounterCOO{T}(axes)
-  b.nnz = a.nnz
-  b
-end
-
-# This might go to Gridap in the future. We keep it here for the moment.
-function change_axes(a::Algebra.AllocationCOO{T,A}, axes::A) where {T,A}
-  counter = change_axes(a.counter,axes)
-  Algebra.AllocationCOO(counter,a.I,a.J,a.V)
-end
-
-# Array of PArrays -> PArray of Arrays 
-function to_parray_of_arrays(a::AbstractArray{<:MPIArray})
-  indices = linear_indices(first(a))
-  map(indices) do i
-    map(a) do aj
-      PartitionedArrays.getany(aj)
-    end
-  end
-end
-
-function to_parray_of_arrays(a::AbstractArray{<:DebugArray})
-  indices = linear_indices(first(a))
-  map(indices) do i
-    map(a) do aj
-      aj.items[i]
-    end
-  end
-end
-
-# This type is required because MPIArray from PArrays 
-# cannot be instantiated with a NULL communicator
-struct MPIVoidVector{T} <: AbstractVector{T}
-  comm::MPI.Comm
-  function MPIVoidVector(::Type{T}) where {T}
-    new{T}(MPI.COMM_NULL)
-  end
-end
-
-Base.size(a::MPIVoidVector) = (0,)
-Base.IndexStyle(::Type{<:MPIVoidVector}) = IndexLinear()
-function Base.getindex(a::MPIVoidVector,i::Int)
-  error("Indexing of MPIVoidVector not possible.")
-end
-function Base.setindex!(a::MPIVoidVector,v,i::Int)
-  error("Indexing of MPIVoidVector not possible.")
-end
-function Base.show(io::IO,k::MIME"text/plain",data::MPIVoidVector)
-  println(io,"MPIVoidVector")
-end
-
-function num_parts(comm::MPI.Comm)
-  if comm != MPI.COMM_NULL
-    nparts = MPI.Comm_size(comm)
-  else
-    nparts = -1
-  end
-  nparts
-end
-@inline num_parts(comm::MPIArray) = num_parts(comm.comm)
-@inline num_parts(comm::DebugArray) = length(comm.items)
-@inline num_parts(comm::MPIVoidVector) = num_parts(comm.comm)
-
-function get_part_id(comm::MPI.Comm)
-  if comm != MPI.COMM_NULL
-    id = MPI.Comm_rank(comm)+1
-  else
-    id = -1
-  end
-  id
-end
-@inline get_part_id(comm::MPIArray) = get_part_id(comm.comm)
-@inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm)
-
-"""
-    i_am_in(comm::MPIArray)
-    i_am_in(comm::DebugArray)
-  
-  Returns `true` if the processor is part of the subcommunicator `comm`.
-"""
-function i_am_in(comm::MPI.Comm)
-  get_part_id(comm) >=0
-end
-@inline i_am_in(comm::MPIArray) = i_am_in(comm.comm)
-@inline i_am_in(comm::MPIVoidVector) = i_am_in(comm.comm)
-@inline i_am_in(comm::DebugArray) = true
-
-function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_parts; default=nothing)
-  x_new = map(new_parts) do p
-    if isa(x,MPIArray)
-      PartitionedArrays.getany(x)
-    elseif isa(x,DebugArray) && (p <= length(x.items))
-      x.items[p]
-    else
-      default
-    end
-  end
-  return x_new
-end
-
-function generate_subparts(parts::MPIArray,new_comm_size)
-  root_comm = parts.comm
-  root_size = MPI.Comm_size(root_comm)
-  rank = MPI.Comm_rank(root_comm)
-
-  @static if isdefined(MPI,:MPI_UNDEFINED)
-    mpi_undefined = MPI.MPI_UNDEFINED[]
-  else
-    mpi_undefined = MPI.API.MPI_UNDEFINED[]
-  end
-  
-  if root_size == new_comm_size
-    return parts
-  else
-    if rank < new_comm_size
-      comm = MPI.Comm_split(root_comm,0,0)
-      return distribute_with_mpi(LinearIndices((new_comm_size,));comm=comm,duplicate_comm=false)
-    else
-      comm = MPI.Comm_split(root_comm,mpi_undefined,mpi_undefined)
-      return MPIVoidVector(eltype(parts))
-    end
-  end
-end
-
-function generate_subparts(parts::DebugArray,new_comm_size)
-  DebugArray(LinearIndices((new_comm_size,)))
-end
-
-# local_views
-
-function local_views(a)
-  @abstractmethod
-end
-
-function get_parts(a)
-  return linear_indices(local_views(a))
-end
-
-function local_views(a::AbstractVector,rows)
-  @notimplemented
-end
-
-function local_views(a::AbstractMatrix,rows,cols)
-  @notimplemented
-end
-
-local_views(a::AbstractArray) = a
-local_views(a::PRange) = partition(a)
-local_views(a::PVector) = partition(a)
-local_views(a::PSparseMatrix) = partition(a)
-
-function local_views(a::BlockPRange)
-  map(blocks(a)) do a
-    local_views(a)
-  end |> to_parray_of_arrays
-end
-
-function local_views(a::BlockPArray)
-  vals = map(blocks(a)) do a
-    local_views(a)
-  end |> to_parray_of_arrays
-  return map(mortar,vals)
-end
-
-# change_ghost
-
-function change_ghost(a::PVector{T},ids::PRange;is_consistent=false,make_consistent=false) where T
-  same_partition = (a.index_partition === partition(ids))
-  a_new = same_partition ? a : change_ghost(T,a,ids)
-  if make_consistent && (!same_partition || !is_consistent)
-    consistent!(a_new) |> wait
-  end
-  return a_new
-end
-
-function change_ghost(::Type{<:AbstractVector},a::PVector,ids::PRange)
-  a_new = similar(a,eltype(a),(ids,))
-  # Equivalent to copy!(a_new,a) but does not check that owned indices match
-  map(copy!,own_values(a_new),own_values(a))
-  return a_new
-end
-
-function change_ghost(::Type{<:OwnAndGhostVectors},a::PVector,ids::PRange)
-  values = map(own_values(a),partition(ids)) do own_vals,ids
-    ghost_vals = fill(zero(eltype(a)),ghost_length(ids))
-    perm = PartitionedArrays.local_permutation(ids)
-    OwnAndGhostVectors(own_vals,ghost_vals,perm)
-  end
-  return PVector(values,partition(ids))
-end
-
-function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_consistent=false)
-  vals = map(blocks(a),blocks(ids)) do a, ids
-    change_ghost(a,ids;is_consistent=is_consistent,make_consistent=make_consistent)
-  end
-  return BlockPVector(vals,ids)
-end
-
-# TODO: Can we remove this?
-# This function computes a mapping among the local identifiers of a and b
-# for which the corresponding global identifiers are both in a and b. 
-# Note that the haskey check is necessary because in the general case 
-# there might be gids in b which are not present in a
-function find_local_to_local_map(a::AbstractLocalIndices,b::AbstractLocalIndices)
-  a_local_to_b_local = fill(Int32(-1),local_length(a))
-  b_local_to_global  = local_to_global(b)
-  a_global_to_local  = global_to_local(a)
-  for blid in 1:local_length(b)
-    gid = b_local_to_global[blid]
-    if a_global_to_local[gid] != zero(eltype(a_global_to_local))
-      alid = a_global_to_local[gid]
-      a_local_to_b_local[alid] = blid
-    end  
-  end
-  a_local_to_b_local
-end
-
-# TODO: Can we remove this?
-# This type is required in order to be able to access the local portion 
-# of distributed sparse matrices and vectors using local indices from the 
-# distributed test and trial spaces
-struct LocalView{T,N,A,B} <:AbstractArray{T,N}
-  plids_to_value::A
-  d_to_lid_to_plid::B
-  local_size::NTuple{N,Int}
-  function LocalView(
-    plids_to_value::AbstractArray{T,N},d_to_lid_to_plid::NTuple{N}) where {T,N}
-    A = typeof(plids_to_value)
-    B = typeof(d_to_lid_to_plid)
-    local_size = map(length,d_to_lid_to_plid)
-    new{T,N,A,B}(plids_to_value,d_to_lid_to_plid,local_size)
-  end
-end
-
-Base.size(a::LocalView) = a.local_size
-Base.IndexStyle(::Type{<:LocalView}) = IndexCartesian()
-function Base.getindex(a::LocalView{T,N},lids::Vararg{Integer,N}) where {T,N}
-  plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid)
-  if all(i->i>0,plids)
-    a.plids_to_value[plids...]
-  else
-    zero(T)
-  end
-end
-function Base.setindex!(a::LocalView{T,N},v,lids::Vararg{Integer,N}) where {T,N}
-  plids = map(_lid_to_plid,lids,a.d_to_lid_to_plid)
-  @check all(i->i>0,plids) "You are trying to set a value that is not stored in the local portion"
-  a.plids_to_value[plids...] = v
-end
-
-function _lid_to_plid(lid,lid_to_plid)
-  plid = lid_to_plid[lid]
-  plid
-end
-
-function local_views(a::PVector,new_rows::PRange)
-  old_rows = axes(a,1)
-  if partition(old_rows) === partition(new_rows)
-    partition(a)
-  else
-    map(partition(a),partition(old_rows),partition(new_rows)) do vector_partition,old_rows,new_rows
-      LocalView(vector_partition,(find_local_to_local_map(new_rows,old_rows),))
-    end
-  end
-end
-
-function local_views(a::PSparseMatrix,new_rows::PRange,new_cols::PRange)
-  old_rows, old_cols = axes(a)
-  if (partition(old_rows) === partition(new_rows) && partition(old_cols) === partition(new_cols) )
-    partition(a)
-  else
-    map(partition(a),
-        partition(old_rows),partition(old_cols),
-        partition(new_rows),partition(new_cols)) do matrix_partition,old_rows,old_cols,new_rows,new_cols
-      rl2lmap = find_local_to_local_map(new_rows,old_rows)
-      cl2lmap = find_local_to_local_map(new_cols,old_cols)
-      LocalView(matrix_partition,(rl2lmap,cl2lmap))
-    end
-  end
-end
-
-function local_views(a::BlockPVector,new_rows::BlockPRange)
-  vals = map(local_views,blocks(a),blocks(new_rows)) |> to_parray_of_arrays
-  return map(mortar,vals)
-end
-
-function local_views(a::BlockPMatrix,new_rows::BlockPRange,new_cols::BlockPRange)
-  vals = map(CartesianIndices(blocksize(a))) do I
-    local_views(blocks(a)[I],blocks(new_rows)[I],blocks(new_cols)[I])
-  end |> to_parray_of_arrays
-  return map(mortar,vals)
-end
-
-# PSparseMatrix assembly
-
-struct Assembled <: FESpaces.AssemblyStrategy end
-struct SubAssembled <: FESpaces.AssemblyStrategy end
-struct LocallyAssembled <: FESpaces.AssemblyStrategy end
-
-struct PSparseMatrixBuilderCOO{T,B}
-  local_matrix_type::Type{T}
-  par_strategy::B
-end
-
-function Algebra.nz_counter(
-  builder::PSparseMatrixBuilderCOO{A}, axs::Tuple{<:PRange,<:PRange}
-) where A
-  rows, cols = axs # test ids, trial ids
-  counters = map(partition(rows),partition(cols)) do r,c
-    axs = (Base.OneTo(local_length(r)),Base.OneTo(local_length(c)))
-    Algebra.CounterCOO{A}(axs)
-  end
-  DistributedCounterCOO(builder.par_strategy,counters,rows,cols)
-end
-
-function Algebra.get_array_type(::PSparseMatrixBuilderCOO{Tv}) where Tv
-  T = eltype(Tv)
-  return PSparseMatrix{T}
-end
-
-
-"""
-"""
-struct DistributedCounterCOO{A,B,C,D} <: GridapType
-  par_strategy::A
-  counters    ::B
-  test_gids   ::C
-  trial_gids  ::D
-  function DistributedCounterCOO(
-    par_strategy,
-    counters::AbstractArray{<:Algebra.CounterCOO},
-    test_gids::PRange,
-    trial_gids::PRange
-  )
-    A = typeof(par_strategy)
-    B = typeof(counters)
-    C = typeof(test_gids)
-    D = typeof(trial_gids)
-    new{A,B,C,D}(par_strategy,counters,test_gids,trial_gids)
-  end
-end
-
-function local_views(a::DistributedCounterCOO)
-  return a.counters
-end
-
-function local_views(a::DistributedCounterCOO,test_gids,trial_gids)
-  @check PArrays.matching_local_indices(test_gids,a.test_gids)
-  @check PArrays.matching_local_indices(trial_gids,a.trial_gids)
-  return a.counters
-end
-
-function Algebra.nz_allocation(a::DistributedCounterCOO)
-  allocs = map(nz_allocation,a.counters) # Array{AllocationCOO}
-  DistributedAllocationCOO(a.par_strategy,allocs,a.test_gids,a.trial_gids)
-end
-
-struct DistributedAllocationCOO{A,B,C,D} <: GridapType
-  par_strategy::A
-  allocs::B
-  test_gids::C
-  trial_gids::D
-  function DistributedAllocationCOO(
-    par_strategy,
-    allocs::AbstractArray{<:Algebra.AllocationCOO},
-    test_gids::PRange,
-    trial_gids::PRange)
-    A = typeof(par_strategy)
-    B = typeof(allocs)
-    C = typeof(test_gids)
-    D = typeof(trial_gids)
-    new{A,B,C,D}(par_strategy,allocs,test_gids,trial_gids)
-  end
-end
-
-function change_axes(
-  a::DistributedAllocationCOO{A,B,<:PRange,<:PRange},
-  axes::Tuple{<:PRange,<:PRange}
-) where {A,B}
-  local_axes = map(partition(axes[1]),partition(axes[2])) do rows,cols
-    (Base.OneTo(local_length(rows)), Base.OneTo(local_length(cols)))
-  end
-  allocs = map(change_axes,a.allocs,local_axes)
-  DistributedAllocationCOO(a.par_strategy,allocs,axes[1],axes[2])
-end
-
-function change_axes(
-  a::MatrixBlock{<:DistributedAllocationCOO},
-  axes::Tuple{<:Vector,<:Vector}
-)
-  block_ids  = CartesianIndices(a.array)
-  rows, cols = axes
-  array = map(block_ids) do I
-    change_axes(a[I],(rows[I[1]],cols[I[2]]))
-  end
-  return ArrayBlock(array,a.touched)
-end
-
-function local_views(a::DistributedAllocationCOO)
-  a.allocs
-end
-
-function local_views(a::DistributedAllocationCOO,test_gids,trial_gids)
-  @check test_gids === a.test_gids
-  @check trial_gids === a.trial_gids
-  a.allocs
-end
-
-function local_views(a::MatrixBlock{<:DistributedAllocationCOO})
-  array = map(local_views,a.array) |> to_parray_of_arrays
-  return map(ai -> ArrayBlock(ai,a.touched),array)
-end
-
-function get_allocations(a::DistributedAllocationCOO)
-  I,J,V = map(local_views(a)) do alloc
-    alloc.I, alloc.J, alloc.V
-  end |> tuple_of_arrays
-  return I,J,V
-end
-
-function get_allocations(a::ArrayBlock{<:DistributedAllocationCOO})
-  tuple_of_array_of_parrays = map(get_allocations,a.array) |> tuple_of_arrays
-  return tuple_of_array_of_parrays
-end
-
-get_test_gids(a::DistributedAllocationCOO)  = a.test_gids
-get_trial_gids(a::DistributedAllocationCOO) = a.trial_gids
-get_test_gids(a::ArrayBlock{<:DistributedAllocationCOO})  = map(get_test_gids,diag(a.array))
-get_trial_gids(a::ArrayBlock{<:DistributedAllocationCOO}) = map(get_trial_gids,diag(a.array))
-
-function Algebra.create_from_nz(a::PSparseMatrix)
-  # For LocallyAssembled the underlying Exchanger should
-  # not have ghost layer making assemble! do nothing (TODO check)
-  assemble!(a) |> wait
-  a
-end
-
-function Algebra.create_from_nz(a::DistributedAllocationCOO{<:LocallyAssembled})
-  f(x) = nothing
-  A, = _fa_create_from_nz_with_callback(f,a)
-  return A
-end
-
-function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:LocallyAssembled}})
-  f(x) = nothing
-  A, = _fa_create_from_nz_with_callback(f,a)
-  return A
-end
-
-function _fa_create_from_nz_with_callback(callback,a)
-
-  # Recover some data
-  I,J,V = get_allocations(a)
-  test_gids  = get_test_gids(a)
-  trial_gids = get_trial_gids(a)
-
-  rows = _setup_prange(test_gids,I;ghost=false,ax=:rows)
-  b = callback(rows)
-
-  # convert I and J to global dof ids
-  to_global_indices!(I,test_gids;ax=:rows)
-  to_global_indices!(J,trial_gids;ax=:cols)
-
-  # Create the range for cols
-  cols = _setup_prange(trial_gids,J;ax=:cols)
-
-  # Convert again I,J to local numeration
-  to_local_indices!(I,rows;ax=:rows)
-  to_local_indices!(J,cols;ax=:cols)
-
-  A = _setup_matrix(I,J,V,rows,cols)
-  return A, b
-end
-
-function Algebra.create_from_nz(a::DistributedAllocationCOO{<:Assembled})
-  f(x) = nothing
-  A, = _sa_create_from_nz_with_callback(f,f,a,nothing)
-  return A
-end
-
-function Algebra.create_from_nz(a::ArrayBlock{<:DistributedAllocationCOO{<:Assembled}})
-  f(x) = nothing
-  A, = _sa_create_from_nz_with_callback(f,f,a,nothing)
-  return A
-end
-
-function _sa_create_from_nz_with_callback(callback,async_callback,a,b)
-  # Recover some data
-  I,J,V = get_allocations(a)
-  test_gids = get_test_gids(a)
-  trial_gids = get_trial_gids(a)
-
-  # convert I and J to global dof ids
-  to_global_indices!(I,test_gids;ax=:rows)
-  to_global_indices!(J,trial_gids;ax=:cols)
-
-  # Create the Prange for the rows
-  rows = _setup_prange(test_gids,I;ax=:rows)
-  
-  # Move (I,J,V) triplets to the owner process of each row I.
-  # The triplets are accompanyed which Jo which is the process column owner
-  Jo = get_gid_owners(J,trial_gids;ax=:cols)
-  t  = _assemble_coo!(I,J,V,rows;owners=Jo)
-
-  # Here we can overlap computations
-  # This is a good place to overlap since
-  # sending the matrix rows is a lot of data
-  if !isa(b,Nothing)
-    bprange=_setup_prange_from_pvector_allocation(b)
-    b = callback(bprange)
-  end
-
-  # Wait the transfer to finish
-  wait(t)
-
-  # Create the Prange for the cols
-  cols = _setup_prange(trial_gids,J;ax=:cols,owners=Jo)
-
-  # Overlap rhs communications with CSC compression
-  t2 = async_callback(b)
-
-  # Convert again I,J to local numeration
-  to_local_indices!(I,rows;ax=:rows)
-  to_local_indices!(J,cols;ax=:cols)
-
-  A = _setup_matrix(a,I,J,V,rows,cols)
-
-  # Wait the transfer to finish
-  if !isa(t2,Nothing)
-    wait(t2)
-  end
-  
-  return A, b
-end
-
-
-# PVector assembly 
-
-struct PVectorBuilder{T,B}
-  local_vector_type::Type{T}
-  par_strategy::B
-end
-
-function Algebra.nz_counter(builder::PVectorBuilder,axs::Tuple{<:PRange})
-  T = builder.local_vector_type
-  rows, = axs
-  counters = map(partition(rows)) do rows
-    axs = (Base.OneTo(local_length(rows)),)
-    nz_counter(ArrayBuilder(T),axs)
-  end
-  PVectorCounter(builder.par_strategy,counters,rows)
-end
-
-function Algebra.get_array_type(::PVectorBuilder{Tv}) where Tv
-  T = eltype(Tv)
-  return PVector{T}
-end
-
-struct PVectorCounter{A,B,C}
-  par_strategy::A
-  counters::B
-  test_gids::C
-end
-
-Algebra.LoopStyle(::Type{<:PVectorCounter}) = DoNotLoop()
-
-function local_views(a::PVectorCounter)
-  a.counters
-end
-
-function local_views(a::PVectorCounter,rows)
-  @check rows === a.test_gids
-  a.counters
-end
-
-function Arrays.nz_allocation(a::PVectorCounter{<:LocallyAssembled})
-  dofs = a.test_gids
-  values = map(nz_allocation,a.counters)
-  PVectorAllocationTrackOnlyValues(a.par_strategy,values,dofs)
-end
-
-struct PVectorAllocationTrackOnlyValues{A,B,C}
-  par_strategy::A
-  values::B
-  test_gids::C
-end
-
-function local_views(a::PVectorAllocationTrackOnlyValues)
-  a.values
-end
-
-function local_views(a::PVectorAllocationTrackOnlyValues,rows)
-  @check rows === a.test_gids
-  a.values
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:LocallyAssembled})
-  rows = _setup_prange_without_ghosts(a.test_gids)
-  _rhs_callback(a,rows)
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackOnlyValues{<:Assembled})
-  # This point MUST NEVER be reached. If reached there is an inconsistency
-  # in the parallel code in charge of vector assembly
-  @assert false
-end
-
-function _rhs_callback(row_partitioned_vector_partition,rows)
-  # The ghost values in row_partitioned_vector_partition are 
-  # aligned with the FESpace but not with the ghost values in the rows of A
-  b_fespace = PVector(row_partitioned_vector_partition.values,
-                      partition(row_partitioned_vector_partition.test_gids))
-
-  # This one is aligned with the rows of A
-  b = similar(b_fespace,eltype(b_fespace),(rows,))
-
-  # First transfer owned values
-  b .= b_fespace
-
-  # Now transfer ghost
-  function transfer_ghost(b,b_fespace,ids,ids_fespace)
-    num_ghosts_vec = ghost_length(ids)
-    gho_to_loc_vec = ghost_to_local(ids)
-    loc_to_glo_vec = local_to_global(ids)
-    gid_to_lid_fe  = global_to_local(ids_fespace)
-    for ghost_lid_vec in 1:num_ghosts_vec
-      lid_vec     = gho_to_loc_vec[ghost_lid_vec]
-      gid         = loc_to_glo_vec[lid_vec]
-      lid_fespace = gid_to_lid_fe[gid]
-      b[lid_vec] = b_fespace[lid_fespace]
-    end
-  end
-  map(
-    transfer_ghost,
-    partition(b),
-    partition(b_fespace),
-    b.index_partition,
-    b_fespace.index_partition
-  )
-
-  return b
-end
-
-function Algebra.create_from_nz(a::PVector)
-  assemble!(a) |> wait
-  return a
-end
-
-function Algebra.create_from_nz(
-  a::DistributedAllocationCOO{<:LocallyAssembled},
-  c_fespace::PVectorAllocationTrackOnlyValues{<:LocallyAssembled}
-)
-
-  function callback(rows)
-    _rhs_callback(c_fespace,rows)
-  end
-
-  A,b = _fa_create_from_nz_with_callback(callback,a)
-  return A,b
-end
-
-struct PVectorAllocationTrackTouchedAndValues{A,B,C}
-  allocations::A
-  values::B
-  test_gids::C
-end
-
-function Algebra.create_from_nz(
-  a::DistributedAllocationCOO{<:Assembled},
-  b::PVectorAllocationTrackTouchedAndValues
-)
-
-  function callback(rows)
-    _rhs_callback(b,rows)
-  end
-
-  function async_callback(b)
-    # now we can assemble contributions
-    assemble!(b)
-  end
-
-  A,b = _sa_create_from_nz_with_callback(callback,async_callback,a,b)
-  return A,b
-end
-
-struct ArrayAllocationTrackTouchedAndValues{A}
-  touched::Vector{Bool}
-  values::A
-end
-
-Gridap.Algebra.LoopStyle(::Type{<:ArrayAllocationTrackTouchedAndValues}) = Gridap.Algebra.Loop()
-
-function local_views(a::PVectorAllocationTrackTouchedAndValues,rows)
-  @check PArrays.matching_local_indices(rows, a.test_gids)
-  a.allocations
-end
-
-@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j)
-  @notimplemented
-end
-@inline function Arrays.add_entry!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i)
-  if i > 0
-    a.touched[i]=true
-    if !isnothing(v)
-      a.values[i]=c(v,a.values[i])
-    end
-  end
-  nothing
-end
-@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i,j)
-  @notimplemented
-end
-@inline function Arrays.add_entries!(c::Function,a::ArrayAllocationTrackTouchedAndValues,v,i)
-  if !isa(v,Nothing)
-    for (ve,ie) in zip(v,i)
-      Arrays.add_entry!(c,a,ve,ie)
-    end
-  else
-    for ie in i
-      Arrays.add_entry!(c,a,nothing,ie)
-    end
-  end
-  nothing
-end
-
-
-function _setup_touched_and_allocations_arrays(values)
-  touched = map(values) do values
-    fill!(Vector{Bool}(undef,length(values)),false)
-  end
-  allocations = map(values,touched) do values,touched
-   ArrayAllocationTrackTouchedAndValues(touched,values)
-  end
-  touched, allocations
-end
-
-function Arrays.nz_allocation(
-  a::DistributedCounterCOO{<:Assembled},
-  b::PVectorCounter{<:Assembled}
-)
-  A      = nz_allocation(a)
-  dofs   = b.test_gids
-  values = map(nz_allocation,b.counters)
-  touched, allocations = _setup_touched_and_allocations_arrays(values)
-  B = PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
-  return A, B
-end
-
-function Arrays.nz_allocation(a::PVectorCounter{<:Assembled})
-  dofs = a.test_gids
-  values = map(nz_allocation,a.counters)
-  touched,allocations=_setup_touched_and_allocations_arrays(values)
-  return PVectorAllocationTrackTouchedAndValues(allocations,values,dofs)
-end
-
-function local_views(a::PVectorAllocationTrackTouchedAndValues)
-  a.allocations
-end
-
-function _setup_prange_from_pvector_allocation(a::PVectorAllocationTrackTouchedAndValues)
-
-  # Find the ghost rows
-  allocations = local_views(a.allocations)
-  indices = partition(a.test_gids)
-  I_ghost_lids_to_dofs_ghost_lids = map(allocations, indices) do allocation, indices
-    dofs_lids_touched = findall(allocation.touched)
-    loc_to_gho = local_to_ghost(indices)
-    n_I_ghost_lids = count((x)->loc_to_gho[x]!=0,dofs_lids_touched)
-    I_ghost_lids = Vector{Int32}(undef,n_I_ghost_lids)
-    cur = 1
-    for lid in dofs_lids_touched
-      dof_lid = loc_to_gho[lid]
-      if dof_lid != 0
-        I_ghost_lids[cur] = dof_lid
-        cur = cur+1
-      end
-    end
-    I_ghost_lids
-  end
-
-  ghost_to_global, ghost_to_owner = map(
-    find_gid_and_owner,I_ghost_lids_to_dofs_ghost_lids,indices) |> tuple_of_arrays
-
-  ngids = length(a.test_gids)
-  _setup_prange_impl_(ngids,indices,ghost_to_global,ghost_to_owner)
-end
-
-function Algebra.create_from_nz(a::PVectorAllocationTrackTouchedAndValues)
-  rows = _setup_prange_from_pvector_allocation(a)
-  b    = _rhs_callback(a,rows)
-  t2   = assemble!(b)
-  # Wait the transfer to finish
-  if t2 !== nothing
-    wait(t2)
-  end
-  return b
-end
-
-# Common Assembly Utilities
-function first_gdof_from_ids(ids)
-  if own_length(ids) == 0
-    return 1
-  end
-  lid_to_gid   = local_to_global(ids) 
-  owned_to_lid = own_to_local(ids)
-  return Int(lid_to_gid[first(owned_to_lid)])
-end
-
-function find_gid_and_owner(ighost_to_jghost,jindices)
-  jghost_to_local  = ghost_to_local(jindices)
-  jlocal_to_global = local_to_global(jindices)
-  jlocal_to_owner  = local_to_owner(jindices)
-  ighost_to_jlocal = sort(view(jghost_to_local,ighost_to_jghost))
-
-  ighost_to_global = jlocal_to_global[ighost_to_jlocal]
-  ighost_to_owner  = jlocal_to_owner[ighost_to_jlocal]
-  return ighost_to_global, ighost_to_owner
-end
-
-# The given ids are assumed to be a sub-set of the lids
-function ghost_lids_touched(a::AbstractLocalIndices,gids::AbstractVector{<:Integer})
-  glo_to_loc = global_to_local(a)
-  loc_to_gho = local_to_ghost(a)
-  
-  # First pass: Allocate
-  i = 0
-  ghost_lids_touched = fill(false,ghost_length(a))
-  for gid in gids
-    lid = glo_to_loc[gid]
-    ghost_lid = loc_to_gho[lid]
-    if ghost_lid > 0 && !ghost_lids_touched[ghost_lid]
-      ghost_lids_touched[ghost_lid] = true
-      i += 1
-    end
-  end
-  gids_ghost_lid_to_ghost_lid = Vector{Int32}(undef,i)
-
-  # Second pass: fill 
-  i = 1
-  fill!(ghost_lids_touched,false)
-  for gid in gids
-    lid = glo_to_loc[gid]
-    ghost_lid = loc_to_gho[lid]
-    if ghost_lid > 0 && !ghost_lids_touched[ghost_lid]
-      ghost_lids_touched[ghost_lid] = true
-      gids_ghost_lid_to_ghost_lid[i] = ghost_lid
-      i += 1
-    end
-  end
-
-  return gids_ghost_lid_to_ghost_lid
-end
-
-# Find the neighbours of partition1 trying 
-# to use those in partition2 as a hint 
-function _find_neighbours!(partition1, partition2)
-  partition2_snd, partition2_rcv = assembly_neighbors(partition2)
-  partition2_graph = ExchangeGraph(partition2_snd, partition2_rcv)
-  return assembly_neighbors(partition1; neighbors=partition2_graph)
-end
-
-# to_global! & to_local! analogs, needed for dispatching in block assembly
-
-function to_local_indices!(I,ids::PRange;kwargs...)
-  map(to_local!,I,partition(ids))
-end
-
-function to_global_indices!(I,ids::PRange;kwargs...)
-  map(to_global!,I,partition(ids))
-end
-
-function get_gid_owners(I,ids::PRange;kwargs...)
-  map(I,partition(ids)) do I, indices 
-    glo_to_loc = global_to_local(indices) 
-    loc_to_own = local_to_owner(indices)
-    map(x->loc_to_own[glo_to_loc[x]], I)
-  end 
-end
-
-for f in [:to_local_indices!, :to_global_indices!, :get_gid_owners]
-  @eval begin
-    function $f(I::Vector,ids::AbstractVector{<:PRange};kwargs...)
-      map($f,I,ids)
-    end
-
-    function $f(I::Matrix,ids::AbstractVector{<:PRange};ax=:rows)
-      @check ax ∈ [:rows,:cols]
-      block_ids = CartesianIndices(I)
-      map(block_ids) do id
-        i = id[1]; j = id[2];
-        if ax == :rows
-          $f(I[i,j],ids[i])
-        else
-          $f(I[i,j],ids[j])
-        end
-      end
-    end
-  end
-end
-
-
-function _setup_matrix(
-  a,I,J,V,rows::PRange,cols::PRange
-)
-  _rows = PRange(map(remove_ghost,partition(rows)))
-  asys = change_axes(a,(_rows,cols))
-  values = map(create_from_nz,local_views(asys))
-  PSparseMatrix(values,partition(_rows),partition(cols),true)
-end
-
-function _setup_matrix(a,I,J,V,rows::Vector{<:PRange},cols::Vector{<:PRange})
-  block_ids  = CartesianIndices(I)
-  block_mats = map(block_ids) do id
-    i = id[1]; j = id[2];
-    _setup_matrix(a[i,j],I[i,j],J[i,j],V[i,j],rows[i],cols[j])
-  end
-  return mortar(block_mats)
-end
-
-# _assemble_coo! : local coo triplets + global PRange -> Global coo values
-
-function _assemble_coo!(I,J,V,rows::PRange;owners=nothing)
-  if isa(owners,Nothing)
-    PArrays.assemble_coo!(I,J,V,partition(rows))
-  else
-    assemble_coo_with_column_owner!(I,J,V,partition(rows),owners)
-  end
-end
-
-function _assemble_coo!(I,J,V,rows::Vector{<:PRange};owners=nothing)
-  block_ids = CartesianIndices(I)
-  map(block_ids) do id
-    i = id[1]; j = id[2];
-    if isa(owners,Nothing)
-      _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i])
-    else
-      _assemble_coo!(I[i,j],J[i,j],V[i,j],rows[i],owners=owners[i,j])
-    end
-  end
-end
-
-function assemble_coo_with_column_owner!(I,J,V,row_partition,Jown)
-  """
-    Returns three JaggedArrays with the coo triplets
-    to be sent to the corresponding owner parts in parts_snd
-  """
-  function setup_snd(part,parts_snd,row_lids,coo_entries_with_column_owner)
-    global_to_local_row = global_to_local(row_lids)
-    local_row_to_owner = local_to_owner(row_lids)
-    owner_to_i = Dict(( owner=>i for (i,owner) in enumerate(parts_snd) ))
-    ptrs = zeros(Int32,length(parts_snd)+1)
-    k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner
-    for k in 1:length(k_gi)
-      gi = k_gi[k]
-      li = global_to_local_row[gi]
-      owner = local_row_to_owner[li]
-      if owner != part
-        ptrs[owner_to_i[owner]+1] +=1
-      end
-    end
-    PArrays.length_to_ptrs!(ptrs)
-    gi_snd_data = zeros(eltype(k_gi),ptrs[end]-1)
-    gj_snd_data = zeros(eltype(k_gj),ptrs[end]-1)
-    jo_snd_data = zeros(eltype(k_jo),ptrs[end]-1)
-    v_snd_data = zeros(eltype(k_v),ptrs[end]-1)
-    for k in 1:length(k_gi)
-      gi = k_gi[k]
-      li = global_to_local_row[gi]
-      owner = local_row_to_owner[li]
-      if owner != part
-        gj = k_gj[k]
-        v = k_v[k]
-        p = ptrs[owner_to_i[owner]]
-        gi_snd_data[p] = gi
-        gj_snd_data[p] = gj
-        jo_snd_data[p] = k_jo[k]
-        v_snd_data[p]  = v
-        k_v[k] = zero(v)
-        ptrs[owner_to_i[owner]] += 1
-      end
-    end
-    PArrays.rewind_ptrs!(ptrs)
-    gi_snd = JaggedArray(gi_snd_data,ptrs)
-    gj_snd = JaggedArray(gj_snd_data,ptrs)
-    jo_snd = JaggedArray(jo_snd_data,ptrs)
-    v_snd = JaggedArray(v_snd_data,ptrs)
-    gi_snd, gj_snd, jo_snd, v_snd
-  end
-  """
-    Pushes to coo_entries_with_column_owner the tuples 
-    gi_rcv,gj_rcv,jo_rcv,v_rcv received from remote processes
-  """
-  function setup_rcv!(coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv)
-    k_gi, k_gj, k_jo, k_v = coo_entries_with_column_owner
-    current_n = length(k_gi)
-    new_n = current_n + length(gi_rcv.data)
-    resize!(k_gi,new_n)
-    resize!(k_gj,new_n)
-    resize!(k_jo,new_n)
-    resize!(k_v,new_n)
-    for p in 1:length(gi_rcv.data)
-        k_gi[current_n+p] = gi_rcv.data[p]
-        k_gj[current_n+p] = gj_rcv.data[p]
-        k_jo[current_n+p] = jo_rcv.data[p]
-        k_v[current_n+p] = v_rcv.data[p]
-    end
-  end
-  part = linear_indices(row_partition)
-  parts_snd, parts_rcv = assembly_neighbors(row_partition)
-  coo_entries_with_column_owner = map(tuple,I,J,Jown,V)
-  gi_snd, gj_snd, jo_snd, v_snd = map(setup_snd,part,parts_snd,row_partition,coo_entries_with_column_owner) |> tuple_of_arrays
-  graph = ExchangeGraph(parts_snd,parts_rcv)
-  t1 = exchange(gi_snd,graph)
-  t2 = exchange(gj_snd,graph)
-  t3 = exchange(jo_snd,graph)
-  t4 = exchange(v_snd,graph)
-  @async begin
-      gi_rcv = fetch(t1)
-      gj_rcv = fetch(t2)
-      jo_rcv = fetch(t3)
-      v_rcv = fetch(t4)
-      map(setup_rcv!,coo_entries_with_column_owner,gi_rcv,gj_rcv,jo_rcv,v_rcv)
-      I,J,Jown,V
-  end
-end
-
-# dofs_gids_prange can be either test_gids or trial_gids
-# In the former case, gids is a vector of global test dof identifiers, while in the 
-# latter, a vector of global trial dof identifiers
-function _setup_prange(dofs_gids_prange::PRange,gids;ghost=true,owners=nothing,kwargs...)
-  if !ghost
-    _setup_prange_without_ghosts(dofs_gids_prange)
-  elseif isa(owners,Nothing)
-    _setup_prange_with_ghosts(dofs_gids_prange,gids)
-  else
-    _setup_prange_with_ghosts(dofs_gids_prange,gids,owners)
-  end
-end
-
-function _setup_prange(
-  dofs_gids_prange::AbstractVector{<:PRange},
-  gids::AbstractMatrix;
-  ax=:rows,ghost=true,owners=nothing
-)
-  @check ax ∈ (:rows,:cols)
-  block_ids = LinearIndices(dofs_gids_prange)
-  pvcat(x) = map(xi -> vcat(xi...), to_parray_of_arrays(x))
-
-  gids_union, owners_union = map(block_ids,dofs_gids_prange) do id, prange
-    gids_slice = (ax == :rows) ? gids[id,:] : gids[:,id]
-    gids_union = pvcat(gids_slice)
-
-    owners_union = nothing
-    if !isnothing(owners)
-      owners_slice = (ax == :rows) ? owners[id,:] : owners[:,id]
-      owners_union = pvcat(owners_slice)
-    end
-
-    return gids_union, owners_union
-  end |> tuple_of_arrays
-  
-  return map((p,g,o) -> _setup_prange(p,g;ghost=ghost,owners=o),dofs_gids_prange,gids_union,owners_union)
-end
-
-# Create PRange for the rows of the linear system
-# without local ghost dofs as per required in the 
-# LocallyAssembled() parallel assembly strategy 
-function _setup_prange_without_ghosts(dofs_gids_prange::PRange)
-  ngdofs = length(dofs_gids_prange)
-  indices = map(partition(dofs_gids_prange)) do dofs_indices 
-    owner = part_id(dofs_indices)
-    own_indices = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
-    ghost_indices = GhostIndices(ngdofs,Int64[],Int32[])
-    OwnAndGhostIndices(own_indices,ghost_indices)
-  end
-  return PRange(indices)
-end
-
-# Here we are assuming that the sparse communication graph underlying test_dofs_gids_partition
-# is a superset of the one underlying indices. This is (has to be) true for the rows of the linear system.
-# The precondition required for the consistency of any parallel assembly process in GridapDistributed 
-# is that each processor can determine locally with a single layer of ghost cells the global indices and associated 
-# processor owners of the rows that it touches after assembly of integration terms posed on locally-owned entities 
-# (i.e., either cells or faces). 
-function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids)
-  ngdofs = length(dofs_gids_prange)
-  dofs_gids_partition = partition(dofs_gids_prange)
-
-  # Selected ghost ids -> dof PRange ghost ids
-  gids_ghost_lids_to_dofs_ghost_lids = map(ghost_lids_touched,dofs_gids_partition,gids)
-
-  # Selected ghost ids -> [global dof ids, owner processor ids]
-  gids_ghost_to_global, gids_ghost_to_owner = map(
-    find_gid_and_owner,gids_ghost_lids_to_dofs_ghost_lids,dofs_gids_partition
-  ) |> tuple_of_arrays
-
-  return _setup_prange_impl_(ngdofs,dofs_gids_partition,gids_ghost_to_global,gids_ghost_to_owner)
-end
-
-# Here we cannot assume that the sparse communication graph underlying 
-# trial_dofs_gids_partition is a superset of the one underlying indices.
-# Here we chould check whether it is included and call _find_neighbours!()
-# if this is the case. At present, we are not taking advantage of this, 
-# but let the parallel scalable algorithm to compute the reciprocal to do the job. 
-function _setup_prange_with_ghosts(dofs_gids_prange::PRange,gids,owners)
-  ngdofs = length(dofs_gids_prange)
-  dofs_gids_partition = partition(dofs_gids_prange)
-
-  # Selected ghost ids -> [global dof ids, owner processor ids]
-  gids_ghost_to_global, gids_ghost_to_owner = map(
-    gids,owners,dofs_gids_partition) do gids, owners, indices
-    ghost_touched   = Dict{Int,Bool}()
-    ghost_to_global = Int64[] 
-    ghost_to_owner  = Int64[]
-    me = part_id(indices)
-    for (j,jo) in zip(gids,owners)
-      if jo != me
-        if !haskey(ghost_touched,j)
-          push!(ghost_to_global,j)
-          push!(ghost_to_owner,jo)
-          ghost_touched[j] = true
-        end
-      end
-    end
-    ghost_to_global, ghost_to_owner
-  end |> tuple_of_arrays
-
-  return _setup_prange_impl_(
-    ngdofs,
-    dofs_gids_partition,
-    gids_ghost_to_global,
-    gids_ghost_to_owner;
-    discover_neighbours=false
-  )
-end 
-
-function _setup_prange_impl_(
-  ngdofs,
-  dofs_gids_partition,
-  gids_ghost_to_global,
-  gids_ghost_to_owner;
-  discover_neighbours=true
-)
-  indices = map(dofs_gids_partition, 
-                gids_ghost_to_global, 
-                gids_ghost_to_owner) do dofs_indices, ghost_to_global, ghost_to_owner 
-    owner = part_id(dofs_indices)
-    own_indices   = OwnIndices(ngdofs,owner,own_to_global(dofs_indices))
-    ghost_indices = GhostIndices(ngdofs,ghost_to_global,ghost_to_owner)
-    OwnAndGhostIndices(own_indices,ghost_indices)
-  end
-  if discover_neighbours
-    _find_neighbours!(indices,dofs_gids_partition)
-  end
-  return PRange(indices)
-end

From 2408692579943fcb1b44de23ec58dac4404ce6b8 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 14:01:37 +1000
Subject: [PATCH 34/48] strart fixing adaptivity

---
 src/Adaptivity.jl     | 16 ++++++++++------
 src/Redistribution.jl |  4 +++-
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/Adaptivity.jl b/src/Adaptivity.jl
index b2a37046..b214894b 100644
--- a/src/Adaptivity.jl
+++ b/src/Adaptivity.jl
@@ -683,15 +683,15 @@ function refine_cell_gids(
   t = exchange(child_gids_rcv,graph)
   child_gids_snd = fetch(t)
   
-  # We finally can create the global numeration of the fine cells by piecing together: 
+  # We finally can create the global numeration of the fine cells by piecing together:
   #   1. The (local ids,global ids) of the owned fine cells
   #   2. The (owners,local ids,global ids) of the ghost fine cells
-  fgids = map(
+  local2global, local2owner = map(
     ranks,f_own_to_local,own_fgids,parts_snd,lids_snd,child_gids_snd
   ) do rank,own_lids,own_gids,nbors,ghost_lids,ghost_gids
 
     own2global = own_to_global(own_gids)
-  
+
     n_own   = length(own_lids)
     n_ghost = length(ghost_lids.data)
     local2global = fill(0,n_own+n_ghost)
@@ -702,7 +702,7 @@ function refine_cell_gids(
       local2global[lid] = own2global[oid]
       local2owner[lid]  = rank
     end
-    
+
     # Ghost cells
     for (n,nbor) in enumerate(nbors)
       for i in ghost_lids.ptrs[n]:ghost_lids.ptrs[n+1]-1
@@ -712,9 +712,13 @@ function refine_cell_gids(
         local2owner[lid]  = nbor
       end
     end
-    return LocalIndices(num_f_gids,rank,local2global,local2owner)
-  end
+    return local2global, local2owner
+  end |> tuple_of_arrays
 
+  fgids = permuted_variable_partition(
+    num_f_owned_cells, local2global, local2owner;
+    n_global=num_f_gids, start=first_f_gid
+  )
   return PRange(fgids)
 end
 
diff --git a/src/Redistribution.jl b/src/Redistribution.jl
index f3d8a8f2..ebdc41c9 100644
--- a/src/Redistribution.jl
+++ b/src/Redistribution.jl
@@ -649,7 +649,9 @@ function p_vector_redistribution_cache(values, indices, indices_red)
   buffers_snd, buffers_rcv = map(
     PartitionedArrays.assembly_buffers,values,lids_snd,lids_rcv
   ) |> tuple_of_arrays
-  caches = map(PartitionedArrays.VectorAssemblyCache,nbors_snd, nbors_rcv,lids_snd, lids_rcv,buffers_snd,buffers_rcv)
+  graph = ExchangeGraph(nbors_snd, nbors_rcv)
+  exchange_setup = PartitionedArrays.setup_exchange(buffers_rcv, buffers_snd, graph)
+  caches = map(PartitionedArrays.VectorAssemblyCache,nbors_snd, nbors_rcv,lids_snd, lids_rcv,buffers_snd,buffers_rcv,exchange_setup)
   return caches
 end
 

From 6f79d0da6c978467de685d0bf50061e6506939e5 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 14:13:31 +1000
Subject: [PATCH 35/48] More adaptivity tests

---
 src/Adaptivity.jl     | 20 +++++++++++---------
 src/Redistribution.jl |  4 +++-
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/Adaptivity.jl b/src/Adaptivity.jl
index b214894b..cb49dd85 100644
--- a/src/Adaptivity.jl
+++ b/src/Adaptivity.jl
@@ -604,12 +604,12 @@ function refine_cell_gids(
   ranks = linear_indices(cgids)
 
   # Create own numbering (without ghosts)
-  num_f_owned_cells = map(length,f_own_to_local)
-  num_f_gids = reduce(+,num_f_owned_cells)
-  first_f_gid = scan(+,num_f_owned_cells,type=:exclusive,init=1)
+  num_f_oids  = map(length,f_own_to_local)
+  num_f_gids  = reduction(+,num_f_oids,destination=:all,init=0)
+  first_f_gid = scan(+,num_f_oids,type=:exclusive,init=1)
   
-  own_fgids = map(ranks,first_f_gid,num_f_owned_cells) do rank,first_f_gid,num_f_owned_cells
-    f_o2g = collect(first_f_gid:first_f_gid+num_f_owned_cells-1)
+  own_fgids = map(ranks,num_f_gids,first_f_gid,num_f_oids) do rank,num_f_gids,first_f_gid,num_f_oids
+    f_o2g = collect(first_f_gid:first_f_gid+num_f_oids-1)
     own   = OwnIndices(num_f_gids,rank,f_o2g)
     ghost = GhostIndices(num_f_gids) # No ghosts
     return OwnAndGhostIndices(own,ghost)
@@ -628,7 +628,9 @@ function refine_cell_gids(
   # collect two keys: 
   #   1. The global id of the coarse parent
   #   2. The child id of the fine cell
-  parent_gids_snd, child_ids_snd = map(cgids,cmodels,fmodels,lids_snd) do cgids,cmodel,fmodel,lids_snd
+  parent_gids_snd, child_ids_snd = map(
+    cgids,cmodels,fmodels,lids_snd
+  ) do cgids,cmodel,fmodel,lids_snd
     glue = get_adaptivity_glue(fmodel)
     f2c_map = glue.n2o_faces_map[Dc+1]
     child_map = glue.n2o_cell_to_child_id
@@ -656,8 +658,8 @@ function refine_cell_gids(
   # We process the received keys, and collect the global ids of the fine cells
   # that have been requested by our neighbors.
   child_gids_rcv = map(
-    cgids,own_fgids,f_own_to_local,cmodels,fmodels,parent_gids_rcv,child_ids_rcv
-  ) do cgids,own_fgids,f_own_to_local,cmodel,fmodel,parent_gids_rcv,child_ids_rcv
+    cgids,own_fgids,f_own_to_local,fmodels,parent_gids_rcv,child_ids_rcv
+  ) do cgids,own_fgids,f_own_to_local,fmodel,parent_gids_rcv,child_ids_rcv
     glue = get_adaptivity_glue(fmodel)
     c2f_map = glue.o2n_faces_map
     child_map = glue.n2o_cell_to_child_id
@@ -716,7 +718,7 @@ function refine_cell_gids(
   end |> tuple_of_arrays
 
   fgids = permuted_variable_partition(
-    num_f_owned_cells, local2global, local2owner;
+    num_f_oids, local2global, local2owner;
     n_global=num_f_gids, start=first_f_gid
   )
   return PRange(fgids)
diff --git a/src/Redistribution.jl b/src/Redistribution.jl
index ebdc41c9..a20e2723 100644
--- a/src/Redistribution.jl
+++ b/src/Redistribution.jl
@@ -651,7 +651,9 @@ function p_vector_redistribution_cache(values, indices, indices_red)
   ) |> tuple_of_arrays
   graph = ExchangeGraph(nbors_snd, nbors_rcv)
   exchange_setup = PartitionedArrays.setup_exchange(buffers_rcv, buffers_snd, graph)
-  caches = map(PartitionedArrays.VectorAssemblyCache,nbors_snd, nbors_rcv,lids_snd, lids_rcv,buffers_snd,buffers_rcv,exchange_setup)
+  caches = map(nbors_snd, nbors_rcv, lids_snd, lids_rcv, buffers_snd, buffers_rcv) do ns, nr, ls, lr, bs, br
+    PartitionedArrays.VectorAssemblyCache(ns, nr, ls, lr, bs, br, exchange_setup)
+  end
   return caches
 end
 

From 37c9c9be3f618dbeccb94a8de587c9f5a21c7cbc Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 14:36:59 +1000
Subject: [PATCH 36/48] Fixed MacroDiscreteModel

---
 src/MacroDiscreteModels.jl | 108 +++++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 46 deletions(-)

diff --git a/src/MacroDiscreteModels.jl b/src/MacroDiscreteModels.jl
index c08392ec..e1cdf48a 100644
--- a/src/MacroDiscreteModels.jl
+++ b/src/MacroDiscreteModels.jl
@@ -49,7 +49,8 @@ end
 # DistributedDiscreteModel API
 
 function get_cell_gids(macro_model::MacroDiscreteModel)
-  return linear_indices(local_views(macro_model.model))
+  ranks = linear_indices(local_views(macro_model.model))
+  return PRange(uniform_partition(ranks,length(ranks)))
 end
 
 # TODO: Reuse macro_model.interface_gids
@@ -74,15 +75,17 @@ function get_local_dimranges(macro_model::MacroDiscreteModel{Dc}) where Dc
 end
 
 """
-  Returns a JaggedArray of JaggedArrays such that we have
+    classify_interfaces(model::DistributedDiscreteModel{Dc};sort_faces=true)
+
+Returns a JaggedArray of JaggedArrays such that we have
 
     [interface dimension][interface lid][face dimension] -> interface dfaces lids
-  
-  I.e we bundle interfaces of the same dimension together (macro-faces, macro-edges, macro-nodes), 
-  and for each interface we return a JaggedArray with the d-faces in the interface.
 
-  If `sort_faces` is true, the faces in each interface are sorted by gid. If false, the faces
-  are by default sorted by lid.
+I.e we bundle interfaces of the same dimension together (macro-faces, macro-edges, macro-nodes), 
+and for each interface we return a JaggedArray with the d-faces in the interface.
+
+If `sort_faces` is true, the faces in each interface are sorted by gid. If false, the faces
+are by default sorted by lid.
 """
 function classify_interfaces(model::DistributedDiscreteModel{Dc};sort_faces=true) where Dc
   ranks = linear_indices(local_views(model))
@@ -166,22 +169,22 @@ function classify_interfaces(model::DistributedDiscreteModel{Dc};sort_faces=true
 end
 
 """
-    function generate_interface_gids(nbors,keys) -> gids
+    generate_interface_gids(nbors,keys) -> gids
 
-  Generates a global numbering for the interfaces, given two input arrays per processor:
+Generates a global numbering for the interfaces, given two input arrays per processor:
 
-    - `nbors`: For each interface, the minimum rank of the neighboring processors.
-    - `keys`: For each interface, the minimum gid of the faces in the interface.
-  
-  Then the gids are assigned in the following way: 
-  We iterate over the processors in ascending order. For each processor, we iterate over the local
-  interfaces. Then: 
-
-    - If the `nbor` processor of an interface has a higher (or equal) rank than the current processor, it means we
-    haven't assigned a gid to that interface yet. So we assign a new gid to the current interface.
-    - If the `nbor` processor of an interface has a lower rank than the current processor, it means we 
-    already assigned a gid to that interface (while iterating over `nbor`). So we look for a matching `key`
-    in the `keys` array of the `nbor` processor, and assign the same gid to the current interface.
+- `nbors`: For each interface, the minimum rank of the neighboring processors.
+- `keys`: For each interface, the minimum gid of the faces in the interface.
+
+Then the gids are assigned in the following way: 
+We iterate over the processors in ascending order. For each processor, we iterate over the local
+interfaces. Then: 
+
+- If the `nbor` processor of an interface has a higher (or equal) rank than the current processor, it means we
+haven't assigned a gid to that interface yet. So we assign a new gid to the current interface.
+- If the `nbor` processor of an interface has a lower rank than the current processor, it means we 
+already assigned a gid to that interface (while iterating over `nbor`). So we look for a matching `key`
+in the `keys` array of the `nbor` processor, and assign the same gid to the current interface.
 """
 function generate_interface_gids(nbors,keys)
   # Gather to main
@@ -222,21 +225,27 @@ function generate_interface_gids(nbors,keys)
   n_glob = emit(_n_glob)
 
   # Create PRange
-  ranks = linear_indices(igids)
-  indices = map(ranks,n_glob,igids,nbors) do rank,n_glob, igids, nbors
-    LocalIndices(n_glob,rank,igids,nbors)
+  n_own = map(linear_indices(nbors), nbors) do rank, nbors
+    count(==(rank), nbors)
   end
-  return PRange(indices)
+  return PRange(permuted_variable_partition(n_own, igids, nbors; n_global=n_glob))
 end
 
 """
-  Given a model and a set of local interfaces for each model, returns
-    - `nbors`: For each interface, the minimum rank of the neighboring processors.
-    - `keys`: For each interface, the minimum gid of the faces in the interface.
+    generate_nbors_and_keys(
+      model::DistributedDiscreteModel{Dc}, d_to_interfaces;
+      is_sorted = false, dimensions = 0:Dc-1
+    )
+
+Given a model and a set of local interfaces for each model, returns
+
+- `nbors`: For each interface, the minimum rank of the neighboring processors.
+- `keys`: For each interface, the minimum gid of the faces in the interface.
 
-  Options: 
-   - `is_sorted`: If true, the keys are assumed to be sorted by gid. This allows some optimization.
-   - `dimensions`: List/Set of interface dimensions to be considered. 
+Options: 
+
+- `is_sorted`: If true, the keys are assumed to be sorted by gid. This allows some optimization.
+- `dimensions`: List/Set of interface dimensions to be considered. 
 """
 function generate_nbors_and_keys(
   model::DistributedDiscreteModel{Dc},
@@ -303,13 +312,16 @@ function generate_nbors_and_keys(
 end
 
 """
-  Returns a global (consistent) face labeling for the macro model. Requires communication.
+    get_global_face_labeling(macro_model::MacroDiscreteModel{Dc})
+
+Returns a global (consistent) face labeling for the macro model. Requires communication.
 
-  The face labeling contains the following tags: 
-    - `Interior_i` : Faces which are interior to processor `i`.
-    - `Interface_j`: Faces which belong to interface `j` (global id).
-    - `Interiors`  : Union of all `Interior_i` tags.
-    - `Interfaces` : Union of all `Interface_j` tags.
+The face labeling contains the following tags: 
+
+- `Interior_i` : Faces which are interior to processor `i`.
+- `Interface_j`: Faces which belong to interface `j` (global id).
+- `Interiors`  : Union of all `Interior_i` tags.
+- `Interfaces` : Union of all `Interface_j` tags.
 """
 function get_global_face_labeling(macro_model::MacroDiscreteModel{Dc}) where Dc
   model = macro_model.model
@@ -374,15 +386,19 @@ function get_global_face_labeling(macro_model::MacroDiscreteModel{Dc}) where Dc
 end
 
 """
-  Returns a local face labeling for the macro model. Does not require communication.
-  WARNING: This is NOT consistent, it is sub-assembled. The same face might have 
-           different labels in different processors.
-
-  The face labeling contains the following tags: 
-    - `Interior`   : Faces which are interior to the processor.
-    - `Exterior`   : Faces which are exterior to the processor.
-    - `Interface_i`: Faces which belong to interface `i` (local id).
-    - `Interfaces` : Union of all `Interface_i` tags.
+    get_local_face_labeling(macro_model::MacroDiscreteModel{Dc})
+
+Returns a local face labeling for the macro model. Does not require communication.
+
+WARNING: This is NOT consistent, it is sub-assembled. The same face might have 
+         different labels in different processors.
+
+The face labeling contains the following tags: 
+
+- `Interior`   : Faces which are interior to the processor.
+- `Exterior`   : Faces which are exterior to the processor.
+- `Interface_i`: Faces which belong to interface `i` (local id).
+- `Interfaces` : Union of all `Interface_i` tags.
 """
 function get_local_face_labeling(macro_model::MacroDiscreteModel{Dc}) where Dc
   model = macro_model.model

From 6345e55d193d9a16f2bd21e2b0255411ba25c7ce Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 14:50:11 +1000
Subject: [PATCH 37/48] Fixed ConstantFESpaces

---
 src/FESpaces.jl | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 73e980f9..3895d6f8 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -572,10 +572,26 @@ end
 
 # Constant FESpace
 
+# Creates a partition with 1 global DOF, owned by `owner`, ghost on all other ranks.
+function constant_partition(ranks, owner)
+  indices = map(ranks) do rank
+    if rank == owner
+      own   = OwnIndices(1, rank, Int[1])
+      ghost = GhostIndices(1, Int[], Int32[])
+    else
+      own   = OwnIndices(1, rank, Int[])
+      ghost = GhostIndices(1, Int[1], Int32[owner])
+    end
+    global_to_owner = [rank]
+    return OwnAndGhostIndices(own, ghost, global_to_owner)
+  end
+  return indices
+end
+
 """
     ConstantFESpace(
-      model::DistributedDiscreteModel; 
-      constraint_type=:global, 
+      model::DistributedDiscreteModel;
+      constraint_type=:global,
       kwargs...
     )
 
@@ -604,16 +620,12 @@ function FESpaces.ConstantFESpace(
 
   # Single dof, owned by processor 1 (ghost for all other processors)
   nranks = length(spaces)
-  cell_gids = get_cell_gids(model)
-  indices = map(partition(cell_gids)) do cell_indices
-    me = part_id(cell_indices)
-    if constraint_type == :global
-      LocalIndices(1,me,Int[1],Int32[1])
-    else
-      LocalIndices(nranks,me,Int[me],Int32[me])
-    end
+  ranks = linear_indices(partition(get_cell_gids(model)))
+  gids = if constraint_type == :global
+    PRange(constant_partition(ranks, 1))
+  else
+    PRange(uniform_partition(ranks, nranks))
   end
-  gids = PRange(indices)
 
   trian = DistributedTriangulation(map(get_triangulation,spaces),model)
   vector_type = _find_vector_type(spaces,gids)

From 1a521c37a93eaef741004fd9b692ba21223796e7 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 15:00:19 +1000
Subject: [PATCH 38/48] Minor

---
 src/Partitions.jl | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/Partitions.jl b/src/Partitions.jl
index a0e541f1..fe327267 100644
--- a/src/Partitions.jl
+++ b/src/Partitions.jl
@@ -533,15 +533,13 @@ function restrict_gids(gids::PRange, new_to_old_lid::AbstractArray)
   consistent!(PVector(old_lid_to_new_gid,partition(gids))) |> wait
 
   # Prepare new partition
-  n_gids = reduction(+,n_own,destination=:all,init=zero(eltype(n_own)))
-
-  new_indices = map(
-    n_gids, old_lid_to_new_gid, new_to_old_lid, partition(gids)
-  ) do n_gids, old_lid_to_new_gid, new_to_old_lid, ids
-    lid_to_gid   = old_lid_to_new_gid[new_to_old_lid]
-    lid_to_owner = local_to_owner(ids)[new_to_old_lid]
-    return LocalIndices(n_gids,part_id(ids),lid_to_gid,lid_to_owner)
-  end
+  n_gids     = reduction(+,n_own,destination=:all,init=zero(eltype(n_own)))
+  new_gids   = map(getindex, old_lid_to_new_gid, new_to_old_lid)
+  new_owners = map(getindex, local_to_owner(gids), new_to_old_lid)
+
+  new_indices = permuted_variable_partition(
+    n_own, new_gids, new_owners; n_global=n_gids, start=first_gid
+  )
 
   p_snd, p_rcv = assembly_neighbors(partition(gids))
   assembly_neighbors(new_indices; neighbors=ExchangeGraph(p_snd, p_rcv))

From 8e63110e1cda807aef6213490ba5017b47400158 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 15:31:49 +1000
Subject: [PATCH 39/48] Minor

---
 src/BlockPartitionedArrays.jl | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/BlockPartitionedArrays.jl b/src/BlockPartitionedArrays.jl
index 6b5cbaff..8825c81f 100644
--- a/src/BlockPartitionedArrays.jl
+++ b/src/BlockPartitionedArrays.jl
@@ -92,7 +92,9 @@ function BlockPMatrix{V}(::UndefInitializer,rows::BlockPRange,cols::BlockPRange)
   vals = map(block_ids) do I
     r = block_rows[I[1]]
     c = block_cols[I[2]]
-    PSparseMatrix{V}(undef,partition(r),partition(c))
+    psparse(partition(r),partition(c);assembled=true) do row_indices,col_indices
+      PartitionedArrays.allocate_local_values(V,row_indices,col_indices)
+    end
   end
   return BlockPMatrix(vals,rows,cols)
 end
@@ -124,7 +126,12 @@ function Base.similar(a::BlockPMatrix,::Type{T},inds::Tuple{<:BlockPRange,<:Bloc
   vals = map(CartesianIndices(blocksize(a))) do I
     rows = inds[1].ranges[I[1]]
     cols = inds[2].ranges[I[2]]
-    similar(a.blocks[I],T,(rows,cols))
+    new_partition = map(
+      PartitionedArrays.partition(a.blocks[I]),partition(rows),partition(cols)
+    ) do local_mat,row_inds,col_inds
+      PartitionedArrays.allocate_local_values(local_mat,T,row_inds,col_inds)
+    end
+    PSparseMatrix(new_partition,partition(rows),partition(cols),false)
   end
   return BlockPArray(vals,inds)
 end
@@ -134,7 +141,9 @@ function Base.similar(::Type{<:BlockPMatrix{V,T,A}},inds::Tuple{<:BlockPRange,<:
   cols = blocks(inds[2])
   values = map(CartesianIndices((length(rows),length(cols)))) do I
     i,j = I[1],I[2]
-    return similar(A,(rows[i],cols[j]))
+    psparse(partition(rows[i]),partition(cols[j]);assembled=true) do row_inds,col_inds
+      PartitionedArrays.allocate_local_values(V,row_inds,col_inds)
+    end
   end
   return BlockPArray(values,inds)
 end

From 5fb4fd57c238971f9fbef7d440452b0b39188580 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 15:34:25 +1000
Subject: [PATCH 40/48] Fix MPI warning

---
 test/mpi/runtests.jl | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/test/mpi/runtests.jl b/test/mpi/runtests.jl
index 80f1cca4..cd114d23 100644
--- a/test/mpi/runtests.jl
+++ b/test/mpi/runtests.jl
@@ -8,14 +8,13 @@ testdir = joinpath(mpidir, "..")
 repodir = joinpath(testdir, "..")
 
 function run_driver(procs, file)
-  mpiexec() do cmd
-    if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
-      run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --code-coverage=user --project=$repodir $(joinpath(mpidir, file))`)
-    else
-      run(`$cmd -n $procs $(Base.julia_cmd()) --code-coverage=user --project=$repodir $(joinpath(mpidir, file))`)
-    end
-    @test true
+  cmd = mpiexec()
+  if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
+    run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --code-coverage=user --project=$repodir $(joinpath(mpidir, file))`)
+  else
+    run(`$cmd -n $procs $(Base.julia_cmd()) --code-coverage=user --project=$repodir $(joinpath(mpidir, file))`)
   end
+  @test true
 end
 
 run_driver(4, "runtests_np4.jl")

From b7eec29247dcd9a25ecd10b79068bb8872bda24f Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Fri, 15 May 2026 17:44:33 +1000
Subject: [PATCH 41/48] Branching in MPI tests now happens before mpi processes
 get spawned. One MPI process is spawned by testset

---
 test/mpi/AdaptivityCartesianTests.jl          |  4 +
 test/mpi/AdaptivityMultiFieldTests.jl         |  4 +
 test/mpi/AdaptivityTests.jl                   |  4 +
 test/mpi/AdaptivityUnstructuredTests.jl       |  4 +
 test/mpi/AutodiffTests.jl                     |  5 +
 test/mpi/BlockPartitionedArraysTests.jl       |  5 +
 test/mpi/BlockSparseMatrixAssemblersTests.jl  |  5 +
 test/mpi/CellDataTests.jl                     |  5 +
 test/mpi/ConstantFESpacesTests.jl             |  4 +
 test/mpi/FESpacesTests.jl                     |  5 +
 test/mpi/GeometryTests.jl                     |  5 +
 test/mpi/HcurlProjectionTests.jl              |  5 +
 test/mpi/HeatEquationTests.jl                 |  5 +
 test/mpi/MacroDiscreteModelsTests.jl          |  4 +
 test/mpi/MultiFieldTests.jl                   |  5 +
 test/mpi/PLaplacianTests.jl                   |  5 +
 test/mpi/PeriodicBCsTests.jl                  |  5 +
 test/mpi/PoissonTests.jl                      |  5 +
 test/mpi/PolytopalCoarseningTests.jl          |  4 +
 test/mpi/StokesHdivDGTests.jl                 |  5 +
 test/mpi/StokesOpenBoundaryTests.jl           |  5 +
 test/mpi/SurfaceCouplingTests.jl              |  5 +
 .../mpi/TransientDistributedCellFieldTests.jl |  5 +
 ...ientMultiFieldDistributedCellFieldTests.jl |  5 +
 test/mpi/VisualizationTests.jl                |  4 +
 test/mpi/ZeroMeanFESpacesTests.jl             |  5 +
 test/mpi/drivers/AdaptivityCartesianTests.jl  | 14 +++
 test/mpi/drivers/AdaptivityMultiFieldTests.jl | 14 +++
 test/mpi/drivers/AdaptivityTests.jl           | 14 +++
 .../drivers/AdaptivityUnstructuredTests.jl    | 14 +++
 test/mpi/drivers/AutodiffTests.jl             | 17 ++++
 .../drivers/BlockPartitionedArraysTests.jl    | 17 ++++
 .../BlockSparseMatrixAssemblersTests.jl       | 17 ++++
 test/mpi/drivers/CellDataTests.jl             | 17 ++++
 test/mpi/drivers/ConstantFESpacesTests.jl     | 14 +++
 test/mpi/drivers/FESpacesTests.jl             | 17 ++++
 test/mpi/drivers/GeometryTests.jl             | 17 ++++
 test/mpi/drivers/HcurlProjectionTests.jl      | 17 ++++
 test/mpi/drivers/HeatEquationTests.jl         | 17 ++++
 test/mpi/drivers/MacroDiscreteModelsTests.jl  | 14 +++
 test/mpi/drivers/MultiFieldTests.jl           | 17 ++++
 test/mpi/drivers/PLaplacianTests.jl           | 17 ++++
 test/mpi/drivers/PeriodicBCsTests.jl          | 17 ++++
 test/mpi/drivers/PoissonTests.jl              | 17 ++++
 test/mpi/drivers/PolytopalCoarseningTests.jl  | 14 +++
 test/mpi/drivers/StokesHdivDGTests.jl         | 17 ++++
 test/mpi/drivers/StokesOpenBoundaryTests.jl   | 17 ++++
 test/mpi/drivers/SurfaceCouplingTests.jl      | 17 ++++
 .../TransientDistributedCellFieldTests.jl     | 17 ++++
 ...ientMultiFieldDistributedCellFieldTests.jl | 17 ++++
 test/mpi/drivers/VisualizationTests.jl        | 14 +++
 test/mpi/drivers/ZeroMeanFESpacesTests.jl     | 17 ++++
 test/mpi/run_mpi_driver.jl                    | 13 +++
 test/mpi/runtests.jl                          | 62 +++++++++---
 test/mpi/runtests_np4.jl                      | 54 ----------
 test/mpi/runtests_np4_body.jl                 | 99 -------------------
 test/sequential/runtests.jl                   | 11 ++-
 57 files changed, 608 insertions(+), 171 deletions(-)
 create mode 100644 test/mpi/AdaptivityCartesianTests.jl
 create mode 100644 test/mpi/AdaptivityMultiFieldTests.jl
 create mode 100644 test/mpi/AdaptivityTests.jl
 create mode 100644 test/mpi/AdaptivityUnstructuredTests.jl
 create mode 100644 test/mpi/AutodiffTests.jl
 create mode 100644 test/mpi/BlockPartitionedArraysTests.jl
 create mode 100644 test/mpi/BlockSparseMatrixAssemblersTests.jl
 create mode 100644 test/mpi/CellDataTests.jl
 create mode 100644 test/mpi/ConstantFESpacesTests.jl
 create mode 100644 test/mpi/FESpacesTests.jl
 create mode 100644 test/mpi/GeometryTests.jl
 create mode 100644 test/mpi/HcurlProjectionTests.jl
 create mode 100644 test/mpi/HeatEquationTests.jl
 create mode 100644 test/mpi/MacroDiscreteModelsTests.jl
 create mode 100644 test/mpi/MultiFieldTests.jl
 create mode 100644 test/mpi/PLaplacianTests.jl
 create mode 100644 test/mpi/PeriodicBCsTests.jl
 create mode 100644 test/mpi/PoissonTests.jl
 create mode 100644 test/mpi/PolytopalCoarseningTests.jl
 create mode 100644 test/mpi/StokesHdivDGTests.jl
 create mode 100644 test/mpi/StokesOpenBoundaryTests.jl
 create mode 100644 test/mpi/SurfaceCouplingTests.jl
 create mode 100644 test/mpi/TransientDistributedCellFieldTests.jl
 create mode 100644 test/mpi/TransientMultiFieldDistributedCellFieldTests.jl
 create mode 100644 test/mpi/VisualizationTests.jl
 create mode 100644 test/mpi/ZeroMeanFESpacesTests.jl
 create mode 100644 test/mpi/drivers/AdaptivityCartesianTests.jl
 create mode 100644 test/mpi/drivers/AdaptivityMultiFieldTests.jl
 create mode 100644 test/mpi/drivers/AdaptivityTests.jl
 create mode 100644 test/mpi/drivers/AdaptivityUnstructuredTests.jl
 create mode 100644 test/mpi/drivers/AutodiffTests.jl
 create mode 100644 test/mpi/drivers/BlockPartitionedArraysTests.jl
 create mode 100644 test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl
 create mode 100644 test/mpi/drivers/CellDataTests.jl
 create mode 100644 test/mpi/drivers/ConstantFESpacesTests.jl
 create mode 100644 test/mpi/drivers/FESpacesTests.jl
 create mode 100644 test/mpi/drivers/GeometryTests.jl
 create mode 100644 test/mpi/drivers/HcurlProjectionTests.jl
 create mode 100644 test/mpi/drivers/HeatEquationTests.jl
 create mode 100644 test/mpi/drivers/MacroDiscreteModelsTests.jl
 create mode 100644 test/mpi/drivers/MultiFieldTests.jl
 create mode 100644 test/mpi/drivers/PLaplacianTests.jl
 create mode 100644 test/mpi/drivers/PeriodicBCsTests.jl
 create mode 100644 test/mpi/drivers/PoissonTests.jl
 create mode 100644 test/mpi/drivers/PolytopalCoarseningTests.jl
 create mode 100644 test/mpi/drivers/StokesHdivDGTests.jl
 create mode 100644 test/mpi/drivers/StokesOpenBoundaryTests.jl
 create mode 100644 test/mpi/drivers/SurfaceCouplingTests.jl
 create mode 100644 test/mpi/drivers/TransientDistributedCellFieldTests.jl
 create mode 100644 test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl
 create mode 100644 test/mpi/drivers/VisualizationTests.jl
 create mode 100644 test/mpi/drivers/ZeroMeanFESpacesTests.jl
 create mode 100644 test/mpi/run_mpi_driver.jl
 delete mode 100644 test/mpi/runtests_np4.jl
 delete mode 100644 test/mpi/runtests_np4_body.jl

diff --git a/test/mpi/AdaptivityCartesianTests.jl b/test/mpi/AdaptivityCartesianTests.jl
new file mode 100644
index 00000000..6d37f685
--- /dev/null
+++ b/test/mpi/AdaptivityCartesianTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AdaptivityCartesianTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/AdaptivityMultiFieldTests.jl b/test/mpi/AdaptivityMultiFieldTests.jl
new file mode 100644
index 00000000..88e55359
--- /dev/null
+++ b/test/mpi/AdaptivityMultiFieldTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AdaptivityMultiFieldTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/AdaptivityTests.jl b/test/mpi/AdaptivityTests.jl
new file mode 100644
index 00000000..89a8c60f
--- /dev/null
+++ b/test/mpi/AdaptivityTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AdaptivityTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/AdaptivityUnstructuredTests.jl b/test/mpi/AdaptivityUnstructuredTests.jl
new file mode 100644
index 00000000..8c453c0b
--- /dev/null
+++ b/test/mpi/AdaptivityUnstructuredTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AdaptivityUnstructuredTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/AutodiffTests.jl b/test/mpi/AutodiffTests.jl
new file mode 100644
index 00000000..ecd6c67a
--- /dev/null
+++ b/test/mpi/AutodiffTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "AutodiffTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/BlockPartitionedArraysTests.jl b/test/mpi/BlockPartitionedArraysTests.jl
new file mode 100644
index 00000000..22baf713
--- /dev/null
+++ b/test/mpi/BlockPartitionedArraysTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "BlockPartitionedArraysTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/BlockSparseMatrixAssemblersTests.jl b/test/mpi/BlockSparseMatrixAssemblersTests.jl
new file mode 100644
index 00000000..309c8056
--- /dev/null
+++ b/test/mpi/BlockSparseMatrixAssemblersTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "BlockSparseMatrixAssemblersTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/CellDataTests.jl b/test/mpi/CellDataTests.jl
new file mode 100644
index 00000000..415466fa
--- /dev/null
+++ b/test/mpi/CellDataTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "CellDataTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/ConstantFESpacesTests.jl b/test/mpi/ConstantFESpacesTests.jl
new file mode 100644
index 00000000..68be3c3a
--- /dev/null
+++ b/test/mpi/ConstantFESpacesTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "ConstantFESpacesTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/FESpacesTests.jl b/test/mpi/FESpacesTests.jl
new file mode 100644
index 00000000..de718e04
--- /dev/null
+++ b/test/mpi/FESpacesTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "FESpacesTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/GeometryTests.jl b/test/mpi/GeometryTests.jl
new file mode 100644
index 00000000..fc2db280
--- /dev/null
+++ b/test/mpi/GeometryTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "GeometryTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/HcurlProjectionTests.jl b/test/mpi/HcurlProjectionTests.jl
new file mode 100644
index 00000000..323ffa14
--- /dev/null
+++ b/test/mpi/HcurlProjectionTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "HcurlProjectionTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/HeatEquationTests.jl b/test/mpi/HeatEquationTests.jl
new file mode 100644
index 00000000..417de562
--- /dev/null
+++ b/test/mpi/HeatEquationTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "HeatEquationTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/MacroDiscreteModelsTests.jl b/test/mpi/MacroDiscreteModelsTests.jl
new file mode 100644
index 00000000..c3a7e0e0
--- /dev/null
+++ b/test/mpi/MacroDiscreteModelsTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "MacroDiscreteModelsTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/MultiFieldTests.jl b/test/mpi/MultiFieldTests.jl
new file mode 100644
index 00000000..c9dc8241
--- /dev/null
+++ b/test/mpi/MultiFieldTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "MultiFieldTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/PLaplacianTests.jl b/test/mpi/PLaplacianTests.jl
new file mode 100644
index 00000000..fa361328
--- /dev/null
+++ b/test/mpi/PLaplacianTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "PLaplacianTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/PeriodicBCsTests.jl b/test/mpi/PeriodicBCsTests.jl
new file mode 100644
index 00000000..6c6d2ed8
--- /dev/null
+++ b/test/mpi/PeriodicBCsTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "PeriodicBCsTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/PoissonTests.jl b/test/mpi/PoissonTests.jl
new file mode 100644
index 00000000..e8da2e38
--- /dev/null
+++ b/test/mpi/PoissonTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "PoissonTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/PolytopalCoarseningTests.jl b/test/mpi/PolytopalCoarseningTests.jl
new file mode 100644
index 00000000..8eef0370
--- /dev/null
+++ b/test/mpi/PolytopalCoarseningTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "PolytopalCoarseningTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/StokesHdivDGTests.jl b/test/mpi/StokesHdivDGTests.jl
new file mode 100644
index 00000000..be60561f
--- /dev/null
+++ b/test/mpi/StokesHdivDGTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "StokesHdivDGTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/StokesOpenBoundaryTests.jl b/test/mpi/StokesOpenBoundaryTests.jl
new file mode 100644
index 00000000..6f6a4f41
--- /dev/null
+++ b/test/mpi/StokesOpenBoundaryTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "StokesOpenBoundaryTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/SurfaceCouplingTests.jl b/test/mpi/SurfaceCouplingTests.jl
new file mode 100644
index 00000000..c6ab8156
--- /dev/null
+++ b/test/mpi/SurfaceCouplingTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "SurfaceCouplingTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/TransientDistributedCellFieldTests.jl b/test/mpi/TransientDistributedCellFieldTests.jl
new file mode 100644
index 00000000..ef96b0d8
--- /dev/null
+++ b/test/mpi/TransientDistributedCellFieldTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "TransientDistributedCellFieldTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/TransientMultiFieldDistributedCellFieldTests.jl b/test/mpi/TransientMultiFieldDistributedCellFieldTests.jl
new file mode 100644
index 00000000..3c780107
--- /dev/null
+++ b/test/mpi/TransientMultiFieldDistributedCellFieldTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "TransientMultiFieldDistributedCellFieldTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/VisualizationTests.jl b/test/mpi/VisualizationTests.jl
new file mode 100644
index 00000000..f97402cd
--- /dev/null
+++ b/test/mpi/VisualizationTests.jl
@@ -0,0 +1,4 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "VisualizationTests.jl")
+run_mpi_driver(file; procs=4)
diff --git a/test/mpi/ZeroMeanFESpacesTests.jl b/test/mpi/ZeroMeanFESpacesTests.jl
new file mode 100644
index 00000000..c0bf280a
--- /dev/null
+++ b/test/mpi/ZeroMeanFESpacesTests.jl
@@ -0,0 +1,5 @@
+using MPI
+include("run_mpi_driver.jl")
+file = joinpath(@__DIR__, "drivers", "ZeroMeanFESpacesTests.jl")
+run_mpi_driver(file; procs=4)
+run_mpi_driver(file; procs=1)
diff --git a/test/mpi/drivers/AdaptivityCartesianTests.jl b/test/mpi/drivers/AdaptivityCartesianTests.jl
new file mode 100644
index 00000000..50193fd2
--- /dev/null
+++ b/test/mpi/drivers/AdaptivityCartesianTests.jl
@@ -0,0 +1,14 @@
+module AdaptivityCartesianMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AdaptivityCartesianTests.jl"))
+
+with_mpi() do distribute
+  AdaptivityCartesianTests.main(distribute)
+end
+
+end # module
diff --git a/test/mpi/drivers/AdaptivityMultiFieldTests.jl b/test/mpi/drivers/AdaptivityMultiFieldTests.jl
new file mode 100644
index 00000000..98d4a0fc
--- /dev/null
+++ b/test/mpi/drivers/AdaptivityMultiFieldTests.jl
@@ -0,0 +1,14 @@
+module AdaptivityMultiFieldMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AdaptivityMultiFieldTests.jl"))
+
+with_mpi() do distribute
+  AdaptivityMultiFieldTests.main(distribute)
+end
+
+end # module
diff --git a/test/mpi/drivers/AdaptivityTests.jl b/test/mpi/drivers/AdaptivityTests.jl
new file mode 100644
index 00000000..795d40d1
--- /dev/null
+++ b/test/mpi/drivers/AdaptivityTests.jl
@@ -0,0 +1,14 @@
+module AdaptivityMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AdaptivityTests.jl"))
+
+with_mpi() do distribute
+  AdaptivityTests.main(distribute)
+end
+
+end # module
diff --git a/test/mpi/drivers/AdaptivityUnstructuredTests.jl b/test/mpi/drivers/AdaptivityUnstructuredTests.jl
new file mode 100644
index 00000000..f9cd1c05
--- /dev/null
+++ b/test/mpi/drivers/AdaptivityUnstructuredTests.jl
@@ -0,0 +1,14 @@
+module AdaptivityUnstructuredMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AdaptivityUnstructuredTests.jl"))
+
+with_mpi() do distribute
+  AdaptivityUnstructuredTests.main(distribute)
+end
+
+end # module
diff --git a/test/mpi/drivers/AutodiffTests.jl b/test/mpi/drivers/AutodiffTests.jl
new file mode 100644
index 00000000..9dc994b9
--- /dev/null
+++ b/test/mpi/drivers/AutodiffTests.jl
@@ -0,0 +1,17 @@
+module AutodiffMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "AutodiffTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  AutodiffTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/BlockPartitionedArraysTests.jl b/test/mpi/drivers/BlockPartitionedArraysTests.jl
new file mode 100644
index 00000000..587b7a97
--- /dev/null
+++ b/test/mpi/drivers/BlockPartitionedArraysTests.jl
@@ -0,0 +1,17 @@
+module BlockPartitionedArraysMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "BlockPartitionedArraysTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  BlockPartitionedArraysTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl b/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl
new file mode 100644
index 00000000..944e159a
--- /dev/null
+++ b/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl
@@ -0,0 +1,17 @@
+module BlockSparseMatrixAssemblersMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "BlockSparseMatrixAssemblersTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  BlockSparseMatrixAssemblersTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/CellDataTests.jl b/test/mpi/drivers/CellDataTests.jl
new file mode 100644
index 00000000..5eb0f8c9
--- /dev/null
+++ b/test/mpi/drivers/CellDataTests.jl
@@ -0,0 +1,17 @@
+module CellDataMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "CellDataTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  CellDataTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/ConstantFESpacesTests.jl b/test/mpi/drivers/ConstantFESpacesTests.jl
new file mode 100644
index 00000000..4166326c
--- /dev/null
+++ b/test/mpi/drivers/ConstantFESpacesTests.jl
@@ -0,0 +1,14 @@
+module ConstantFESpacesMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "ConstantFESpacesTests.jl"))
+
+with_mpi() do distribute
+  ConstantFESpacesTests.main(distribute, (2, 2))
+end
+
+end # module
diff --git a/test/mpi/drivers/FESpacesTests.jl b/test/mpi/drivers/FESpacesTests.jl
new file mode 100644
index 00000000..9c34b287
--- /dev/null
+++ b/test/mpi/drivers/FESpacesTests.jl
@@ -0,0 +1,17 @@
+module FESpacesMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "FESpacesTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  FESpacesTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/GeometryTests.jl b/test/mpi/drivers/GeometryTests.jl
new file mode 100644
index 00000000..b5a5a926
--- /dev/null
+++ b/test/mpi/drivers/GeometryTests.jl
@@ -0,0 +1,17 @@
+module GeometryMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "GeometryTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  GeometryTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/HcurlProjectionTests.jl b/test/mpi/drivers/HcurlProjectionTests.jl
new file mode 100644
index 00000000..7bec0458
--- /dev/null
+++ b/test/mpi/drivers/HcurlProjectionTests.jl
@@ -0,0 +1,17 @@
+module HcurlProjectionMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "HcurlProjectionTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  HcurlProjectionTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/HeatEquationTests.jl b/test/mpi/drivers/HeatEquationTests.jl
new file mode 100644
index 00000000..f3d2f600
--- /dev/null
+++ b/test/mpi/drivers/HeatEquationTests.jl
@@ -0,0 +1,17 @@
+module HeatEquationMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "HeatEquationTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  HeatEquationTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/MacroDiscreteModelsTests.jl b/test/mpi/drivers/MacroDiscreteModelsTests.jl
new file mode 100644
index 00000000..543feb1c
--- /dev/null
+++ b/test/mpi/drivers/MacroDiscreteModelsTests.jl
@@ -0,0 +1,14 @@
+module MacroDiscreteModelsMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "MacroDiscreteModelsTests.jl"))
+
+with_mpi() do distribute
+  MacroDiscreteModelsTests.main(distribute, (2, 2))
+end
+
+end # module
diff --git a/test/mpi/drivers/MultiFieldTests.jl b/test/mpi/drivers/MultiFieldTests.jl
new file mode 100644
index 00000000..ed048e57
--- /dev/null
+++ b/test/mpi/drivers/MultiFieldTests.jl
@@ -0,0 +1,17 @@
+module MultiFieldMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "MultiFieldTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  MultiFieldTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/PLaplacianTests.jl b/test/mpi/drivers/PLaplacianTests.jl
new file mode 100644
index 00000000..7278317c
--- /dev/null
+++ b/test/mpi/drivers/PLaplacianTests.jl
@@ -0,0 +1,17 @@
+module PLaplacianMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "PLaplacianTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  PLaplacianTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/PeriodicBCsTests.jl b/test/mpi/drivers/PeriodicBCsTests.jl
new file mode 100644
index 00000000..7e8e280d
--- /dev/null
+++ b/test/mpi/drivers/PeriodicBCsTests.jl
@@ -0,0 +1,17 @@
+module PeriodicBCsMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "PeriodicBCsTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  PeriodicBCsTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/PoissonTests.jl b/test/mpi/drivers/PoissonTests.jl
new file mode 100644
index 00000000..b291670a
--- /dev/null
+++ b/test/mpi/drivers/PoissonTests.jl
@@ -0,0 +1,17 @@
+module PoissonMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "PoissonTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  PoissonTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/PolytopalCoarseningTests.jl b/test/mpi/drivers/PolytopalCoarseningTests.jl
new file mode 100644
index 00000000..97bd6163
--- /dev/null
+++ b/test/mpi/drivers/PolytopalCoarseningTests.jl
@@ -0,0 +1,14 @@
+module PolytopalCoarseningMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "PolytopalCoarseningTests.jl"))
+
+with_mpi() do distribute
+  PolytopalCoarseningTests.main(distribute, (2, 2))
+end
+
+end # module
diff --git a/test/mpi/drivers/StokesHdivDGTests.jl b/test/mpi/drivers/StokesHdivDGTests.jl
new file mode 100644
index 00000000..7c4c3532
--- /dev/null
+++ b/test/mpi/drivers/StokesHdivDGTests.jl
@@ -0,0 +1,17 @@
+module StokesHdivDGMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "StokesHdivDGTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  StokesHdivDGTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/StokesOpenBoundaryTests.jl b/test/mpi/drivers/StokesOpenBoundaryTests.jl
new file mode 100644
index 00000000..9c88d28d
--- /dev/null
+++ b/test/mpi/drivers/StokesOpenBoundaryTests.jl
@@ -0,0 +1,17 @@
+module StokesOpenBoundaryMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "StokesOpenBoundaryTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  StokesOpenBoundaryTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/SurfaceCouplingTests.jl b/test/mpi/drivers/SurfaceCouplingTests.jl
new file mode 100644
index 00000000..8ab27451
--- /dev/null
+++ b/test/mpi/drivers/SurfaceCouplingTests.jl
@@ -0,0 +1,17 @@
+module SurfaceCouplingMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "SurfaceCouplingTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  SurfaceCouplingTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/TransientDistributedCellFieldTests.jl b/test/mpi/drivers/TransientDistributedCellFieldTests.jl
new file mode 100644
index 00000000..bfd42001
--- /dev/null
+++ b/test/mpi/drivers/TransientDistributedCellFieldTests.jl
@@ -0,0 +1,17 @@
+module TransientDistributedCellFieldMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "TransientDistributedCellFieldTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  TransientDistributedCellFieldTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl b/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl
new file mode 100644
index 00000000..4c4328be
--- /dev/null
+++ b/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl
@@ -0,0 +1,17 @@
+module TransientMultiFieldDistributedCellFieldMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "TransientMultiFieldDistributedCellFieldTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  TransientMultiFieldDistributedCellFieldTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/drivers/VisualizationTests.jl b/test/mpi/drivers/VisualizationTests.jl
new file mode 100644
index 00000000..49bf425c
--- /dev/null
+++ b/test/mpi/drivers/VisualizationTests.jl
@@ -0,0 +1,14 @@
+module VisualizationMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "VisualizationTests.jl"))
+
+with_mpi() do distribute
+  VisualizationTests.main(distribute, (2, 2))
+end
+
+end # module
diff --git a/test/mpi/drivers/ZeroMeanFESpacesTests.jl b/test/mpi/drivers/ZeroMeanFESpacesTests.jl
new file mode 100644
index 00000000..918ba416
--- /dev/null
+++ b/test/mpi/drivers/ZeroMeanFESpacesTests.jl
@@ -0,0 +1,17 @@
+module ZeroMeanFESpacesMPIDriver
+
+using GridapDistributed
+using PartitionedArrays
+const PArrays = PartitionedArrays
+using MPI
+
+include(joinpath(@__DIR__, "..", "..", "ZeroMeanFESpacesTests.jl"))
+
+nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+parts = nprocs == 4 ? (2, 2) : (1, 1)
+
+with_mpi() do distribute
+  ZeroMeanFESpacesTests.main(distribute, parts)
+end
+
+end # module
diff --git a/test/mpi/run_mpi_driver.jl b/test/mpi/run_mpi_driver.jl
new file mode 100644
index 00000000..d84a629c
--- /dev/null
+++ b/test/mpi/run_mpi_driver.jl
@@ -0,0 +1,13 @@
+using MPI
+using Test
+function run_mpi_driver(file; procs)
+  repodir = normpath(joinpath(@__DIR__, "..", ".."))
+  mpiexec() do cmd
+    if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
+      run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --project=$repodir $(file)`)
+    else
+      run(`$cmd -n $procs $(Base.julia_cmd()) --project=$repodir $(file)`)
+    end
+    @test true
+  end
+end
diff --git a/test/mpi/runtests.jl b/test/mpi/runtests.jl
index cd114d23..7c97eb9f 100644
--- a/test/mpi/runtests.jl
+++ b/test/mpi/runtests.jl
@@ -1,23 +1,59 @@
 module MPITests
 
-using MPI
 using Test
+using MPI
 
-mpidir  = @__DIR__
-testdir = joinpath(mpidir, "..")
-repodir = joinpath(testdir, "..")
+TESTCASE = get(ENV, "TESTCASE", "all")
+@info "Running MPI tests with TESTCASE=$TESTCASE"
+
+if TESTCASE ∈ ("all", "mpi-geometry")
+  @time @testset "Geometry" begin include("GeometryTests.jl") end
+  @time @testset "CellData" begin include("CellDataTests.jl") end
+end
 
-function run_driver(procs, file)
-  cmd = mpiexec()
-  if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
-    run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --code-coverage=user --project=$repodir $(joinpath(mpidir, file))`)
-  else
-    run(`$cmd -n $procs $(Base.julia_cmd()) --code-coverage=user --project=$repodir $(joinpath(mpidir, file))`)
+if TESTCASE ∈ ("all", "mpi-fespaces")
+  @time @testset "FESpaces"         begin include("FESpacesTests.jl") end
+  @time @testset "MultiField"       begin include("MultiFieldTests.jl") end
+  @time @testset "ZeroMeanFESpaces" begin include("ZeroMeanFESpacesTests.jl") end
+  @time @testset "PeriodicBCs"      begin include("PeriodicBCsTests.jl") end
+  @time @testset "ConstantFESpaces" begin include("ConstantFESpacesTests.jl") end
+end
+
+if TESTCASE ∈ ("all", "mpi-physics")
+  @time @testset "Poisson"            begin include("PoissonTests.jl") end
+  @time @testset "PLaplacian"         begin include("PLaplacianTests.jl") end
+  @time @testset "SurfaceCoupling"    begin include("SurfaceCouplingTests.jl") end
+  @time @testset "StokesOpenBoundary" begin include("StokesOpenBoundaryTests.jl") end
+  @time @testset "StokesHdivDG"       begin include("StokesHdivDGTests.jl") end
+  @time @testset "HcurlProjection"    begin include("HcurlProjectionTests.jl") end
+end
+
+if TESTCASE ∈ ("all", "mpi-transient")
+  @time @testset "TransientDistributedCellField" begin
+    include("TransientDistributedCellFieldTests.jl")
+  end
+  @time @testset "TransientMultiFieldDistributedCellField" begin
+    include("TransientMultiFieldDistributedCellFieldTests.jl")
   end
-  @test true
+  @time @testset "HeatEquation" begin include("HeatEquationTests.jl") end
+end
+
+if TESTCASE ∈ ("all", "mpi-adaptivity")
+  @time @testset "AdaptivityTests"             begin include("AdaptivityTests.jl") end
+  @time @testset "AdaptivityCartesianTests"    begin include("AdaptivityCartesianTests.jl") end
+  @time @testset "AdaptivityMultiFieldTests"   begin include("AdaptivityMultiFieldTests.jl") end
+  @time @testset "AdaptivityUnstructuredTests" begin include("AdaptivityUnstructuredTests.jl") end
+  @time @testset "PolytopalCoarsening"         begin include("PolytopalCoarseningTests.jl") end
 end
 
-run_driver(4, "runtests_np4.jl")
-run_driver(1, "runtests_np4.jl") # Check that the degenerated case works
+if TESTCASE ∈ ("all", "mpi-misc")
+  @time @testset "BlockSparseMatrixAssemblers" begin
+    include("BlockSparseMatrixAssemblersTests.jl")
+  end
+  @time @testset "BlockPartitionedArrays" begin include("BlockPartitionedArraysTests.jl") end
+  @time @testset "Visualization"          begin include("VisualizationTests.jl") end
+  @time @testset "Autodiff"               begin include("AutodiffTests.jl") end
+  @time @testset "MacroDiscreteModels"    begin include("MacroDiscreteModelsTests.jl") end
+end
 
 end # module
diff --git a/test/mpi/runtests_np4.jl b/test/mpi/runtests_np4.jl
deleted file mode 100644
index d2946152..00000000
--- a/test/mpi/runtests_np4.jl
+++ /dev/null
@@ -1,54 +0,0 @@
-module NP4
-# All tests running on either 1 or 4 procs here
-
-using GridapDistributed
-using PartitionedArrays
-const PArrays = PartitionedArrays
-using MPI
-
-if !MPI.Initialized()
-  MPI.Init()
-end
-
-include("../CellDataTests.jl")
-include("../FESpacesTests.jl")
-include("../GeometryTests.jl")
-include("../MultiFieldTests.jl")
-include("../PLaplacianTests.jl")
-include("../PoissonTests.jl")
-include("../PeriodicBCsTests.jl")
-include("../SurfaceCouplingTests.jl")
-include("../TransientDistributedCellFieldTests.jl")
-include("../TransientMultiFieldDistributedCellFieldTests.jl")
-include("../ZeroMeanFESpacesTests.jl")
-include("../HeatEquationTests.jl")
-include("../StokesOpenBoundaryTests.jl")
-include("../StokesHdivDGTests.jl")
-include("../AdaptivityTests.jl")
-include("../AdaptivityCartesianTests.jl")
-include("../AdaptivityUnstructuredTests.jl")
-include("../AdaptivityMultiFieldTests.jl")
-include("../PolytopalCoarseningTests.jl")
-include("../BlockSparseMatrixAssemblersTests.jl")
-include("../BlockPartitionedArraysTests.jl")
-include("../VisualizationTests.jl")
-include("../AutodiffTests.jl")
-include("../ConstantFESpacesTests.jl")
-include("../MacroDiscreteModelsTests.jl")
-include("../HcurlProjectionTests.jl")
-
-include("runtests_np4_body.jl")
-
-if MPI.Comm_size(MPI.COMM_WORLD) == 4
-  with_mpi() do distribute
-    all_tests(distribute, (2,2))
-  end
-elseif MPI.Comm_size(MPI.COMM_WORLD) == 1
-  with_mpi() do distribute
-    all_tests(distribute, (1,1))
-  end
-else
-  MPI.Abort(MPI.COMM_WORLD, 0)
-end
-
-end #module
diff --git a/test/mpi/runtests_np4_body.jl b/test/mpi/runtests_np4_body.jl
deleted file mode 100644
index cc3753ed..00000000
--- a/test/mpi/runtests_np4_body.jl
+++ /dev/null
@@ -1,99 +0,0 @@
-function all_tests(distribute, parts)
-  TESTCASE = get(ENV, "TESTCASE", "all")
-
-  ranks = distribute(LinearIndices((prod(parts),)))
-
-  t = PArrays.PTimer(ranks, verbose=true)
-  PArrays.tic!(t)
-
-  if TESTCASE ∈ ("all", "mpi-geometry")
-    GeometryTests.main(distribute, parts)
-    PArrays.toc!(t, "Geometry")
-
-    CellDataTests.main(distribute, parts)
-    PArrays.toc!(t, "CellData")
-  end
-
-  if TESTCASE ∈ ("all", "mpi-fespaces")
-    FESpacesTests.main(distribute, parts)
-    PArrays.toc!(t, "FESpaces")
-
-    MultiFieldTests.main(distribute, parts)
-    PArrays.toc!(t, "MultiField")
-
-    ZeroMeanFESpacesTests.main(distribute, parts)
-    PArrays.toc!(t, "ZeroMeanFESpaces")
-
-    PeriodicBCsTests.main(distribute, parts)
-    PArrays.toc!(t, "PeriodicBCs")
-
-    if prod(parts) == 4
-      ConstantFESpacesTests.main(distribute, parts)
-      PArrays.toc!(t, "ConstantFESpaces")
-    end
-  end
-
-  if TESTCASE ∈ ("all", "mpi-physics")
-    PoissonTests.main(distribute, parts)
-    PArrays.toc!(t, "Poisson")
-
-    PLaplacianTests.main(distribute, parts)
-    PArrays.toc!(t, "PLaplacian")
-
-    SurfaceCouplingTests.main(distribute, parts)
-    PArrays.toc!(t, "SurfaceCoupling")
-
-    StokesOpenBoundaryTests.main(distribute, parts)
-    PArrays.toc!(t, "StokesOpenBoundary")
-
-    StokesHdivDGTests.main(distribute, parts)
-    PArrays.toc!(t, "StokesHdivDG")
-
-    HcurlProjectionTests.main(distribute, parts)
-    PArrays.toc!(t, "HcurlProjection")
-    
-  end
-
-  if TESTCASE ∈ ("all", "mpi-transient")
-    TransientDistributedCellFieldTests.main(distribute, parts)
-    PArrays.toc!(t, "TransientDistributedCellField")
-
-    TransientMultiFieldDistributedCellFieldTests.main(distribute, parts)
-    PArrays.toc!(t, "TransientMultiFieldDistributedCellField")
-
-    HeatEquationTests.main(distribute, parts)
-    PArrays.toc!(t, "HeatEquation")
-  end
-
-  if TESTCASE ∈ ("all", "mpi-adaptivity")
-    if prod(parts) == 4
-      AdaptivityTests.main(distribute)
-      AdaptivityCartesianTests.main(distribute)
-      AdaptivityMultiFieldTests.main(distribute)
-      AdaptivityUnstructuredTests.main(distribute)
-      PolytopalCoarseningTests.main(distribute, parts)
-      PArrays.toc!(t, "Adaptivity")
-    end
-  end
-
-  if TESTCASE ∈ ("all", "mpi-misc")
-    BlockSparseMatrixAssemblersTests.main(distribute, parts)
-    BlockPartitionedArraysTests.main(distribute, parts)
-    PArrays.toc!(t, "BlockSparseMatrixAssemblers")
-
-    if prod(parts) == 4
-      VisualizationTests.main(distribute, parts)
-      PArrays.toc!(t, "Visualization")
-    end
-
-    AutodiffTests.main(distribute, parts)
-    PArrays.toc!(t, "Autodiff")
-
-    if prod(parts) == 4
-      MacroDiscreteModelsTests.main(distribute, parts)
-      PArrays.toc!(t, "MacroDiscreteModels")
-    end
-  end
-
-  isempty(t.timings) || display(t)
-end
diff --git a/test/sequential/runtests.jl b/test/sequential/runtests.jl
index aee13a6d..90370037 100644
--- a/test/sequential/runtests.jl
+++ b/test/sequential/runtests.jl
@@ -3,6 +3,7 @@ module SequentialTests
 using Test
 
 TESTCASE = get(ENV, "TESTCASE", "all")
+@info "Running SEQUENTIAL tests with TESTCASE=$TESTCASE"
 
 if TESTCASE ∈ ("all", "seq-geometry")
   @time @testset "Geometry" begin include("GeometryTests.jl") end
@@ -19,11 +20,11 @@ if TESTCASE ∈ ("all", "seq-fespaces")
 end
 
 if TESTCASE ∈ ("all", "seq-physics")
-  @time @testset "Poisson"                     begin include("PoissonTests.jl") end
-  @time @testset "PLaplacian"                  begin include("PLaplacianTests.jl") end
-  @time @testset "SurfaceCouplingTests"        begin include("SurfaceCouplingTests.jl") end
-  @time @testset "StokesHdivDGTests"           begin include("StokesHdivDGTests.jl") end
-  @time @testset "StokesOpenBoundary"          begin include("StokesOpenBoundaryTests.jl") end
+  @time @testset "Poisson"              begin include("PoissonTests.jl") end
+  @time @testset "PLaplacian"           begin include("PLaplacianTests.jl") end
+  @time @testset "SurfaceCouplingTests" begin include("SurfaceCouplingTests.jl") end
+  @time @testset "StokesHdivDGTests"    begin include("StokesHdivDGTests.jl") end
+  @time @testset "StokesOpenBoundary"   begin include("StokesOpenBoundaryTests.jl") end
 end
 
 if TESTCASE ∈ ("all", "seq-transient")

From 60a46552a8e316f39647aea86295b7f247a487f0 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Sat, 16 May 2026 11:40:32 +1000
Subject: [PATCH 42/48] Minor changes in tests

---
 test/mpi/drivers/AutodiffTests.jl                   |  5 ++---
 test/mpi/drivers/BlockPartitionedArraysTests.jl     |  5 ++---
 .../mpi/drivers/BlockSparseMatrixAssemblersTests.jl |  5 ++---
 test/mpi/drivers/CellDataTests.jl                   |  5 ++---
 test/mpi/drivers/FESpacesTests.jl                   |  5 ++---
 test/mpi/drivers/GeometryTests.jl                   |  5 ++---
 test/mpi/drivers/HcurlProjectionTests.jl            |  5 ++---
 test/mpi/drivers/HeatEquationTests.jl               |  5 ++---
 test/mpi/drivers/MultiFieldTests.jl                 |  5 ++---
 test/mpi/drivers/PLaplacianTests.jl                 |  5 ++---
 test/mpi/drivers/PeriodicBCsTests.jl                |  5 ++---
 test/mpi/drivers/PoissonTests.jl                    |  5 ++---
 test/mpi/drivers/StokesHdivDGTests.jl               |  5 ++---
 test/mpi/drivers/StokesOpenBoundaryTests.jl         |  5 ++---
 test/mpi/drivers/SurfaceCouplingTests.jl            |  5 ++---
 .../drivers/TransientDistributedCellFieldTests.jl   |  5 ++---
 .../TransientMultiFieldDistributedCellFieldTests.jl |  5 ++---
 test/mpi/drivers/ZeroMeanFESpacesTests.jl           |  5 ++---
 test/mpi/run_mpi_driver.jl                          | 13 ++++++-------
 19 files changed, 42 insertions(+), 61 deletions(-)

diff --git a/test/mpi/drivers/AutodiffTests.jl b/test/mpi/drivers/AutodiffTests.jl
index 9dc994b9..81404068 100644
--- a/test/mpi/drivers/AutodiffTests.jl
+++ b/test/mpi/drivers/AutodiffTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "AutodiffTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   AutodiffTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/BlockPartitionedArraysTests.jl b/test/mpi/drivers/BlockPartitionedArraysTests.jl
index 587b7a97..5cccec53 100644
--- a/test/mpi/drivers/BlockPartitionedArraysTests.jl
+++ b/test/mpi/drivers/BlockPartitionedArraysTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "BlockPartitionedArraysTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   BlockPartitionedArraysTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl b/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl
index 944e159a..ba27fa3b 100644
--- a/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl
+++ b/test/mpi/drivers/BlockSparseMatrixAssemblersTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "BlockSparseMatrixAssemblersTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   BlockSparseMatrixAssemblersTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/CellDataTests.jl b/test/mpi/drivers/CellDataTests.jl
index 5eb0f8c9..d01e7c24 100644
--- a/test/mpi/drivers/CellDataTests.jl
+++ b/test/mpi/drivers/CellDataTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "CellDataTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   CellDataTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/FESpacesTests.jl b/test/mpi/drivers/FESpacesTests.jl
index 9c34b287..e3caf521 100644
--- a/test/mpi/drivers/FESpacesTests.jl
+++ b/test/mpi/drivers/FESpacesTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "FESpacesTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   FESpacesTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/GeometryTests.jl b/test/mpi/drivers/GeometryTests.jl
index b5a5a926..2cac1822 100644
--- a/test/mpi/drivers/GeometryTests.jl
+++ b/test/mpi/drivers/GeometryTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "GeometryTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   GeometryTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/HcurlProjectionTests.jl b/test/mpi/drivers/HcurlProjectionTests.jl
index 7bec0458..d580f1f1 100644
--- a/test/mpi/drivers/HcurlProjectionTests.jl
+++ b/test/mpi/drivers/HcurlProjectionTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "HcurlProjectionTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   HcurlProjectionTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/HeatEquationTests.jl b/test/mpi/drivers/HeatEquationTests.jl
index f3d2f600..dfb31c8c 100644
--- a/test/mpi/drivers/HeatEquationTests.jl
+++ b/test/mpi/drivers/HeatEquationTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "HeatEquationTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   HeatEquationTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/MultiFieldTests.jl b/test/mpi/drivers/MultiFieldTests.jl
index ed048e57..12a49fb3 100644
--- a/test/mpi/drivers/MultiFieldTests.jl
+++ b/test/mpi/drivers/MultiFieldTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "MultiFieldTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   MultiFieldTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/PLaplacianTests.jl b/test/mpi/drivers/PLaplacianTests.jl
index 7278317c..bf5ec3f0 100644
--- a/test/mpi/drivers/PLaplacianTests.jl
+++ b/test/mpi/drivers/PLaplacianTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "PLaplacianTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   PLaplacianTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/PeriodicBCsTests.jl b/test/mpi/drivers/PeriodicBCsTests.jl
index 7e8e280d..8221b3df 100644
--- a/test/mpi/drivers/PeriodicBCsTests.jl
+++ b/test/mpi/drivers/PeriodicBCsTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "PeriodicBCsTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   PeriodicBCsTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/PoissonTests.jl b/test/mpi/drivers/PoissonTests.jl
index b291670a..f644a4d4 100644
--- a/test/mpi/drivers/PoissonTests.jl
+++ b/test/mpi/drivers/PoissonTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "PoissonTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   PoissonTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/StokesHdivDGTests.jl b/test/mpi/drivers/StokesHdivDGTests.jl
index 7c4c3532..687e16af 100644
--- a/test/mpi/drivers/StokesHdivDGTests.jl
+++ b/test/mpi/drivers/StokesHdivDGTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "StokesHdivDGTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   StokesHdivDGTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/StokesOpenBoundaryTests.jl b/test/mpi/drivers/StokesOpenBoundaryTests.jl
index 9c88d28d..d1c9d707 100644
--- a/test/mpi/drivers/StokesOpenBoundaryTests.jl
+++ b/test/mpi/drivers/StokesOpenBoundaryTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "StokesOpenBoundaryTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   StokesOpenBoundaryTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/SurfaceCouplingTests.jl b/test/mpi/drivers/SurfaceCouplingTests.jl
index 8ab27451..b79ce487 100644
--- a/test/mpi/drivers/SurfaceCouplingTests.jl
+++ b/test/mpi/drivers/SurfaceCouplingTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "SurfaceCouplingTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   SurfaceCouplingTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/TransientDistributedCellFieldTests.jl b/test/mpi/drivers/TransientDistributedCellFieldTests.jl
index bfd42001..e28fa5fe 100644
--- a/test/mpi/drivers/TransientDistributedCellFieldTests.jl
+++ b/test/mpi/drivers/TransientDistributedCellFieldTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "TransientDistributedCellFieldTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   TransientDistributedCellFieldTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl b/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl
index 4c4328be..ffa2f2bd 100644
--- a/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl
+++ b/test/mpi/drivers/TransientMultiFieldDistributedCellFieldTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "TransientMultiFieldDistributedCellFieldTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   TransientMultiFieldDistributedCellFieldTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/drivers/ZeroMeanFESpacesTests.jl b/test/mpi/drivers/ZeroMeanFESpacesTests.jl
index 918ba416..81ccf552 100644
--- a/test/mpi/drivers/ZeroMeanFESpacesTests.jl
+++ b/test/mpi/drivers/ZeroMeanFESpacesTests.jl
@@ -7,10 +7,9 @@ using MPI
 
 include(joinpath(@__DIR__, "..", "..", "ZeroMeanFESpacesTests.jl"))
 
-nprocs = MPI.Comm_size(MPI.COMM_WORLD)
-parts = nprocs == 4 ? (2, 2) : (1, 1)
-
 with_mpi() do distribute
+  nprocs = MPI.Comm_size(MPI.COMM_WORLD)
+  parts = nprocs == 4 ? (2, 2) : (1, 1)
   ZeroMeanFESpacesTests.main(distribute, parts)
 end
 
diff --git a/test/mpi/run_mpi_driver.jl b/test/mpi/run_mpi_driver.jl
index d84a629c..34ba6ff9 100644
--- a/test/mpi/run_mpi_driver.jl
+++ b/test/mpi/run_mpi_driver.jl
@@ -2,12 +2,11 @@ using MPI
 using Test
 function run_mpi_driver(file; procs)
   repodir = normpath(joinpath(@__DIR__, "..", ".."))
-  mpiexec() do cmd
-    if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
-      run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --project=$repodir $(file)`)
-    else
-      run(`$cmd -n $procs $(Base.julia_cmd()) --project=$repodir $(file)`)
-    end
-    @test true
+  cmd = mpiexec()
+  if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
+    run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --project=$repodir $(file)`)
+  else
+    run(`$cmd -n $procs $(Base.julia_cmd()) --project=$repodir $(file)`)
   end
+  @test true
 end

From ed55df4eca140d78658ca28c2cbabeb5f795a794 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Sat, 16 May 2026 12:08:36 +1000
Subject: [PATCH 43/48] Added copy for MPIArray

---
 src/PArraysExtras.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index 7711d612..a1edc54e 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -211,6 +211,8 @@ end
 
 const empty_async_task = PartitionedArrays.FakeTask(() -> nothing)
 
+Base.copy(a::MPIArray) = map(copy, a)
+
 # Linear algebra
 
 function LinearAlgebra.axpy!(α,x::PVector,y::PVector)

From 73ad9c93e3a8a5c8266b7b5cd7b6c8cce3bc3cbc Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Sun, 17 May 2026 11:49:08 +1000
Subject: [PATCH 44/48] Added more docstrings

---
 src/Adaptivity.jl    | 10 ++++++++
 src/Algebra.jl       | 23 +++++++++++++++---
 src/Assembly.jl      | 39 ++++++++++++++++++++++++++++++
 src/CellData.jl      | 12 ++++++++++
 src/FESpaces.jl      | 14 +++++++++++
 src/Geometry.jl      | 57 ++++++++++++++++++++++++++++++++++++++++++++
 src/MultiField.jl    |  8 +++++++
 src/PArraysExtras.jl | 27 +++++++++++++++++----
 src/Visualization.jl |  8 +++++++
 9 files changed, 191 insertions(+), 7 deletions(-)

diff --git a/src/Adaptivity.jl b/src/Adaptivity.jl
index cb49dd85..572c898e 100644
--- a/src/Adaptivity.jl
+++ b/src/Adaptivity.jl
@@ -1,6 +1,16 @@
 
 # DistributedAdaptedDiscreteModels
 
+"""
+    DistributedAdaptedDiscreteModel{Dc,Dp}
+
+Type alias for a [`GenericDistributedDiscreteModel`](@ref) whose local models are
+`AdaptedDiscreteModel` objects from Gridap. Represents a distributed mesh obtained by
+refining a coarser mesh, together with the parent-child cell relationship (adaptivity glue).
+
+Constructed by `DistributedAdaptedDiscreteModel(fine_model, coarse_model, glue_array)`.
+Use `get_model`, `get_parent`, and `get_adaptivity_glue` to retrieve components.
+"""
 const DistributedAdaptedDiscreteModel{Dc,Dp} = GenericDistributedDiscreteModel{Dc,Dp,<:AbstractArray{<:AdaptedDiscreteModel{Dc,Dp}}}
 
 struct DistributedAdaptedDiscreteModelCache{A,B,C}
diff --git a/src/Algebra.jl b/src/Algebra.jl
index 5b7b11e3..0ee01ed7 100644
--- a/src/Algebra.jl
+++ b/src/Algebra.jl
@@ -31,10 +31,30 @@ end
 
 # local_views
 
+"""
+    local_views(a)
+
+Return the distributed array of local (per-rank) objects wrapped by `a`.
+
+This is the central accessor for all distributed objects in `GridapDistributed.jl`.
+Use it together with `map` to operate on each rank's local data independently:
+
+```julia
+map(local_views(model)) do local_model
+  println(num_cells(local_model))
+end
+```
+"""
 function local_views(a)
   @abstractmethod
 end
 
+"""
+    get_parts(a)
+
+Return a distributed array of rank indices for the partition of `a`.
+Equivalent to `linear_indices(local_views(a))`.
+"""
 function get_parts(a)
   return linear_indices(local_views(a))
 end
@@ -99,9 +119,6 @@ function change_ghost(a::BlockPVector,ids::BlockPRange;is_consistent=false,make_
   return BlockPVector(vals,ids)
 end
 
-# This type is required in order to be able to access the local portion 
-# of distributed sparse matrices and vectors using local indices from the 
-# distributed test and trial spaces
 struct LocalView{T,N,A} <:AbstractArray{T,N}
   plids_to_value::A
   d_to_lid_to_plid::NTuple{N,Vector{Int32}}
diff --git a/src/Assembly.jl b/src/Assembly.jl
index 0feeb692..27b25d8c 100644
--- a/src/Assembly.jl
+++ b/src/Assembly.jl
@@ -1,8 +1,37 @@
 
 # Parallel assembly strategies
 
+"""
+    Assembled <: AssemblyStrategy
+
+Standard parallel assembly strategy. Local contributions from ghost cells are
+communicated to their owning ranks and summed. The result is a fully consistent
+global `PSparseMatrix` / `PVector` with each row owned by exactly one rank.
+
+This is the DEFAULT strategy for all standard FE problems.
+"""
 struct Assembled <: FESpaces.AssemblyStrategy end
+
+"""
+    SubAssembled <: AssemblyStrategy
+
+Sub-assembled strategy. Each rank assembles only over its owned cells; no
+communication of ghost-cell contributions is performed. Ghost-DOF rows are left
+incomplete. Intended for subdomain-based methods (e.g. domain decomposition) that
+handle the interface coupling themselves.
+"""
 struct SubAssembled <: FESpaces.AssemblyStrategy end
+
+"""
+    LocallyAssembled <: AssemblyStrategy
+
+Local-only assembly strategy. Each rank assembles over both owned and ghost cells,
+but only owned-DOF rows are kept; ghost-DOF contributions are discarded. This ASSUMES
+that all local contributions for owned rows can be computed locally, without communication.
+
+WARNING: This is NOT true in general and might yield incorrect assembled systems. 
+Use with caution and only if you know what you are doing.
+"""
 struct LocallyAssembled <: FESpaces.AssemblyStrategy end
 
 function local_assembly_strategy(::Union{Assembled,SubAssembled},rows,cols)
@@ -22,6 +51,16 @@ end
 
 # PSparseMatrix and PVector builders
 
+"""
+    DistributedArrayBuilder{T,N,B}
+
+Builder object used by Gridap's assembly machinery to allocate distributed arrays
+(`PVector` or `PSparseMatrix`). `T` is the local array type, `N` its dimensionality
+(1 for vectors, 2 for matrices), and `B` the assembly strategy.
+
+Constructed automatically by `SparseMatrixAssembler` — users rarely need to build
+this directly.
+"""
 struct DistributedArrayBuilder{T,N,B}
   local_array_type::Type{T}
   strategy::B
diff --git a/src/CellData.jl b/src/CellData.jl
index 1115d806..07b0fe96 100644
--- a/src/CellData.jl
+++ b/src/CellData.jl
@@ -16,6 +16,10 @@ end
 
 # DistributedCellField
 """
+    DistributedCellField{A,B,C} <: CellField
+
+Distributed cell field. Wraps a distributed array of local `CellField` objects
+together with the underlying [`DistributedTriangulation`](@ref). 
 """
 struct DistributedCellField{A,B,C} <: CellField
   fields::A
@@ -189,6 +193,10 @@ end
 
 # Integration related
 """
+    DistributedMeasure{A,B} <: GridapType
+
+Distributed integration measure. Wraps a distributed array of local `Measure`
+objects and the underlying [`DistributedTriangulation`](@ref).
 """
 struct DistributedMeasure{A<:AbstractArray{<:Measure},B<:DistributedTriangulation} <: GridapType
   measures::A
@@ -216,6 +224,10 @@ function CellData.get_cell_points(a::DistributedMeasure)
 end
 
 """
+    DistributedDomainContribution{A} <: GridapType
+
+The result of a distributed integral `∫(f)dΩ`. Wraps a distributed array of local
+`DomainContribution` objects.
 """
 struct DistributedDomainContribution{A<:AbstractArray{<:DomainContribution}} <: GridapType
   contribs::A
diff --git a/src/FESpaces.jl b/src/FESpaces.jl
index 3895d6f8..79557eb9 100644
--- a/src/FESpaces.jl
+++ b/src/FESpaces.jl
@@ -1,6 +1,16 @@
 #
 # Generic FE space related methods
 
+"""
+    DistributedFESpace <: FESpace
+
+Abstract supertype for all distributed finite element spaces. Subtypes wrap a
+distributed array of local `FESpace` objects (one per rank) and a `PRange` of global
+DOF indices.
+
+The standard concrete implementations are [`DistributedSingleFieldFESpace`](@ref) and
+[`DistributedMultiFieldFESpace`](@ref).
+"""
 abstract type DistributedFESpace <: FESpace end
 
 function FESpaces.get_vector_type(fs::DistributedFESpace)
@@ -86,6 +96,10 @@ end
 
 # Single field related
 """
+    DistributedSingleFieldFESpace{A,B,C,D,E} <: DistributedFESpace
+
+Distributed single-field FE space. Stores a distributed array of local
+`SingleFieldFESpace` objects (`spaces`) and the global DOF `PRange` (`gids`).
 """
 struct DistributedSingleFieldFESpace{A,B,C,D,E} <: DistributedFESpace
   spaces::A
diff --git a/src/Geometry.jl b/src/Geometry.jl
index 1d6ec322..13b1185f 100644
--- a/src/Geometry.jl
+++ b/src/Geometry.jl
@@ -2,12 +2,33 @@
 struct WithGhost end
 struct NoGhost end
 
+"""
+    with_ghost
+
+Sentinel value passed to triangulation and FE-function accessors to request that
+ghost (off-processor) entries be included in the returned local array.
+
+See also [`no_ghost`](@ref).
+"""
 const with_ghost = WithGhost()
+
+"""
+    no_ghost
+
+Sentinel value passed to triangulation and FE-function accessors to request that
+only owned (on-processor) entries be included in the returned local array.
+
+See also [`with_ghost`](@ref).
+"""
 const no_ghost = NoGhost()
 
 # We do not inherit from Grid on purpose.
 # This object cannot implement the Grid interface in a strict sense
 """
+    DistributedGrid{Dc,Dp,A} <: GridapType
+
+Distributed counterpart of Gridap's `Grid`. Wraps a distributed array of local
+`Grid` objects.
 """
 struct DistributedGrid{Dc,Dp,A} <: GridapType
   grids::A
@@ -37,6 +58,11 @@ Geometry.num_point_dims(::Type{<:DistributedGrid{Dc,Dp}}) where {Dc,Dp} = Dp
 # We do not inherit from GridTopology on purpose.
 # This object cannot implement the GridTopology interface in a strict sense
 """
+    DistributedGridTopology{Dc,Dp,A,B} <: GridapType
+
+Distributed counterpart of Gridap's `GridTopology`. Stores a distributed array of
+local topologies together with distributed index ranges for each
+face dimension.
 """
 struct DistributedGridTopology{Dc,Dp,A,B} <: GridapType
   topos::A
@@ -169,6 +195,11 @@ function Geometry.get_isboundary_face(topo::DistributedGridTopology, d::Integer)
 end
 
 """
+    DistributedFaceLabeling{A} <: GridapType
+
+Distributed counterpart of Gridap's `FaceLabeling`. Wraps a distributed array of
+local `FaceLabeling` objects. Tags are applied locally on each rank and do NOT need to be 
+consistent; consistency across must be ensured explicitly by the user.
 """
 struct DistributedFaceLabeling{A<:AbstractArray{<:FaceLabeling}}
   labels::A
@@ -215,13 +246,27 @@ end
 # This object cannot implement the DiscreteModel interface in a strict sense
 
 """
+    DistributedDiscreteModel{Dc,Dp} <: GridapType
+
+Abstract type for distributed mesh models. Each MPI rank holds a local portion of the
+mesh (owned cells + ghost cells).
 """
 abstract type DistributedDiscreteModel{Dc,Dp} <: GridapType end
 
+"""
+    get_cell_gids(model::DistributedDiscreteModel) -> PRange
+
+Return the `PRange` of global cell indices for the distributed model.
+"""
 function get_cell_gids(model::DistributedDiscreteModel{Dc}) where Dc
   @abstractmethod
 end
 
+"""
+    get_face_gids(model::DistributedDiscreteModel, dim::Integer) -> PRange
+
+Return the `PRange` of global `dim`-face indices for the distributed model.
+"""
 function get_face_gids(model::DistributedDiscreteModel,dim::Integer)
   @abstractmethod
 end
@@ -268,6 +313,11 @@ function Geometry.get_face_labeling(model::DistributedDiscreteModel)
 end
 
 """
+    GenericDistributedDiscreteModel{Dc,Dp,A,B,C} <: DistributedDiscreteModel{Dc,Dp}
+
+Standard concrete implementation of [`DistributedDiscreteModel`](@ref). Stores a
+distributed array of local `DiscreteModel` objects (`models`) and a vector of
+`PRange` objects for each face dimension (`face_gids`).
 """
 struct GenericDistributedDiscreteModel{Dc,Dp,A,B,C} <: DistributedDiscreteModel{Dc,Dp}
   models::A
@@ -609,6 +659,13 @@ end
 # We do not inherit from Triangulation on purpose.
 # This object cannot implement the Triangulation interface in a strict sense
 """
+    DistributedTriangulation{Dc,Dp,A,B,C} <: GridapType
+
+Distributed counterpart of Gridap's `Triangulation`. Wraps a distributed array of
+local triangulations (one per rank).
+
+Constructed from a `DistributedDiscreteModel` using the standard Gridap constructors:
+`Triangulation(model)`, `Boundary(model, tags=...)`, etc.
 """
 struct DistributedTriangulation{Dc,Dp,A,B,C} <: GridapType
   trians  ::A
diff --git a/src/MultiField.jl b/src/MultiField.jl
index 00400854..053c6ff1 100644
--- a/src/MultiField.jl
+++ b/src/MultiField.jl
@@ -62,6 +62,14 @@ end
 # DistributedMultiFieldFESpace
 
 """
+    DistributedMultiFieldFESpace{MS,A,B,C,D} <: DistributedFESpace
+
+Distributed multi-field FE space coupling several [`DistributedSingleFieldFESpace`](@ref)
+objects into a block system. The global DOF indices are stored as either a `PRange`
+(monolithic) or a `BlockPRange` (block-structured, for block preconditioners).
+
+Constructed via `MultiFieldFESpace([V, Q, ...])` from a vector of distributed
+single-field spaces.
 """
 struct DistributedMultiFieldFESpace{MS,A,B,C,D} <: DistributedFESpace
   multi_field_style::MS
diff --git a/src/PArraysExtras.jl b/src/PArraysExtras.jl
index a1edc54e..4453e22e 100644
--- a/src/PArraysExtras.jl
+++ b/src/PArraysExtras.jl
@@ -320,10 +320,19 @@ end
 @inline get_part_id(comm::MPIVoidVector) = get_part_id(comm.comm)
 
 """
-    i_am_in(comm::MPIArray)
-    i_am_in(comm::DebugArray)
-  
-  Returns `true` if the processor is part of the subcommunicator `comm`.
+    i_am_in(comm) -> Bool
+
+Return `true` if the current MPI rank belongs to the (sub-)communicator `comm`.
+In `DebugArray` mode always returns `true`.
+
+Use this to guard code that should only run on a subset of ranks:
+
+```julia
+coarse_ranks = generate_subparts(ranks, 2)
+if i_am_in(coarse_ranks)
+  # only executed on ranks 1–2
+end
+```
 """
 function i_am_in(comm::MPI.Comm)
   get_part_id(comm) >=0
@@ -345,6 +354,16 @@ function change_parts(x::Union{MPIArray,DebugArray,Nothing,MPIVoidVector}, new_p
   return x_new
 end
 
+"""
+    generate_subparts(parts, new_comm_size) -> AbstractArray
+
+Return a new distributed array of rank indices with `new_comm_size` entries,
+formed from the first `new_comm_size` ranks of `parts`. Ranks outside the new
+sub-communicator receive an `MPIVoidVector` (a null communicator placeholder).
+
+Use [`i_am_in`](@ref) to check whether the current rank belongs to the returned
+sub-communicator before executing sub-communicator-specific code.
+"""
 function generate_subparts(parts::MPIArray,new_comm_size)
   root_comm = parts.comm
   root_size = MPI.Comm_size(root_comm)
diff --git a/src/Visualization.jl b/src/Visualization.jl
index cfaafb34..71f2b527 100644
--- a/src/Visualization.jl
+++ b/src/Visualization.jl
@@ -216,6 +216,14 @@ function Visualization.createvtk(
   )
 end
 
+"""
+    DistributedPvd{T<:AbstractArray}
+
+Handle for a distributed ParaView Data (PVD) collection used to write time-series
+VTK output from a parallel simulation. The actual file is only written by rank 0.
+
+Created by `createpvd(parts, filename)` and closed by `savepvd(pvd)`.
+"""
 struct DistributedPvd{T<:AbstractArray}
   pvds::T
 end

From d94f7447979a41dffaab13707105589e833705f0 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Mon, 18 May 2026 12:07:17 +1000
Subject: [PATCH 45/48] Revamp docs

---
 docs/make.jl                  |  22 +++---
 docs/src/Adaptivity.md        |   6 --
 docs/src/Algebra.md           |   6 --
 docs/src/CellData.md          |   6 --
 docs/src/FESpaces.md          |   6 --
 docs/src/Geometry.md          |  15 ----
 docs/src/GridapDistributed.md |   6 --
 docs/src/MultiField.md        |   6 --
 docs/src/Visualization.md     |   6 --
 docs/src/adaptivity.md        |  97 ++++++++++++++++++++++++++
 docs/src/algebra.md           |  65 ++++++++++++++++++
 docs/src/assembly.md          |  80 ++++++++++++++++++++++
 docs/src/backends.md          | 102 ++++++++++++++++++++++++++++
 docs/src/fespaces.md          |  81 ++++++++++++++++++++++
 docs/src/geometry.md          | 124 ++++++++++++++++++++++++++++++++++
 docs/src/getting-started.md   |  45 ------------
 docs/src/index.md             |  61 ++++++++---------
 docs/src/visualization.md     |  39 +++++++++++
 18 files changed, 625 insertions(+), 148 deletions(-)
 delete mode 100644 docs/src/Adaptivity.md
 delete mode 100644 docs/src/Algebra.md
 delete mode 100644 docs/src/CellData.md
 delete mode 100644 docs/src/FESpaces.md
 delete mode 100644 docs/src/Geometry.md
 delete mode 100644 docs/src/GridapDistributed.md
 delete mode 100644 docs/src/MultiField.md
 delete mode 100644 docs/src/Visualization.md
 create mode 100644 docs/src/adaptivity.md
 create mode 100644 docs/src/algebra.md
 create mode 100644 docs/src/assembly.md
 create mode 100644 docs/src/backends.md
 create mode 100644 docs/src/fespaces.md
 create mode 100644 docs/src/geometry.md
 delete mode 100644 docs/src/getting-started.md
 create mode 100644 docs/src/visualization.md

diff --git a/docs/make.jl b/docs/make.jl
index 73d36b0a..0a13c924 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,17 +1,15 @@
 using Documenter, GridapDistributed
 
 pages = [
-  "Home" => "index.md",
-  "Getting Started" => "getting-started.md",
-  "GridapDistributed" => "GridapDistributed.md",
-  "Algebra" => "Algebra.md",
-  "CellData" => "CellData.md",
-  "FESpaces" => "FESpaces.md",
-  "Geometry" => "Geometry.md",
-  "MultiField" => "MultiField.md",
-  "Visualization" => "Visualization.md",
-  "Adaptivity" => "Adaptivity.md",
- ]
+  "Home"          => "index.md",
+  "Backends"      => "backends.md",
+  "Algebra"       => "algebra.md",
+  "Geometry"      => "geometry.md",
+  "FESpaces"      => "fespaces.md",
+  "Assembly"      => "assembly.md",
+  "Adaptivity"    => "adaptivity.md",
+  "Visualization" => "visualization.md",
+]
 
 makedocs(;
     modules=[GridapDistributed],
@@ -20,7 +18,7 @@ makedocs(;
     repo="https://github.com/gridap/GridapDistributed.jl/blob/{commit}{path}#L{line}",
     sitename="GridapDistributed.jl",
     authors="S. Badia <santiago.badia@monash.edu>, A. F. Martin <alberto.f.martin@anu.edu.au>, F. Verdugo <fverdugo@cimne.upc.edu>",
-    # warnonly=true, # for debugging
+    warnonly=true,
 )
 
 deploydocs(;
diff --git a/docs/src/Adaptivity.md b/docs/src/Adaptivity.md
deleted file mode 100644
index 3bf14320..00000000
--- a/docs/src/Adaptivity.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Adaptivity
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["Adaptivity.jl","Redistribution.jl"]
-```
diff --git a/docs/src/Algebra.md b/docs/src/Algebra.md
deleted file mode 100644
index 462ac8d4..00000000
--- a/docs/src/Algebra.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Algebra
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["Algebra.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/CellData.md b/docs/src/CellData.md
deleted file mode 100644
index 7e6a2c1e..00000000
--- a/docs/src/CellData.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# CellData
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["CellData.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/FESpaces.md b/docs/src/FESpaces.md
deleted file mode 100644
index 37f10693..00000000
--- a/docs/src/FESpaces.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# FESpaces
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["FESpaces.jl","DivConformingFESpaces.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/Geometry.md b/docs/src/Geometry.md
deleted file mode 100644
index 11d2c37c..00000000
--- a/docs/src/Geometry.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Geometry
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["Geometry.jl"]
-```
-
-## MacroDiscreteModels
-
-These are functionalities to select and globally number the interfaces between processors:
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["MacroDiscreteModels.jl"]
-```
diff --git a/docs/src/GridapDistributed.md b/docs/src/GridapDistributed.md
deleted file mode 100644
index 022222d1..00000000
--- a/docs/src/GridapDistributed.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# GridapDistributed
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["GridapDistributed.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/MultiField.md b/docs/src/MultiField.md
deleted file mode 100644
index f1472582..00000000
--- a/docs/src/MultiField.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# MultiField
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["MultiField.jl","BlockPartitionedArrays.jl"]
-```
diff --git a/docs/src/Visualization.md b/docs/src/Visualization.md
deleted file mode 100644
index 8eda502c..00000000
--- a/docs/src/Visualization.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Visualization
-
-```@autodocs
-Modules = [GridapDistributed]
-Pages   = ["Visualization.jl"]
-```
\ No newline at end of file
diff --git a/docs/src/adaptivity.md b/docs/src/adaptivity.md
new file mode 100644
index 00000000..a55695bd
--- /dev/null
+++ b/docs/src/adaptivity.md
@@ -0,0 +1,97 @@
+# [Adaptivity](@id adaptivity)
+
+`GridapDistributed.jl` provides an interface for the adaptive mesh refinement (AMR) abstractions defined in `Gridap.jl`. Moreover, we also provide redistribution of meshes.
+The framework here works out of the box for Gridap's built-in refinement routines, but also interfaces with [`GridapP4est.jl`](https://github.com/gridap/GridapP4est.jl) for scalable AMR on unstructured meshes.
+
+## Adapted discrete models
+
+`DistributedAdaptedDiscreteModel` represents a distributed mesh obtained by refining a
+parent mesh. It wraps both the fine (child) mesh, the coarse (parent) mesh, and a
+distributed array of `AdaptivityGlue` objects that record the parent-child cell
+relationship.
+
+### Uniform Cartesian refinement
+
+```julia
+using Gridap, GridapDistributed, PartitionedArrays
+
+function main(distribute)
+  ranks  = distribute(LinearIndices((4,)))
+  coarse = CartesianDiscreteModel(ranks, (2,2), (0,1,0,1), (4,4))
+
+  # Refine uniformly by a factor of 2 in every direction
+  fine = refine(coarse, 2)   # returns a DistributedAdaptedDiscreteModel
+
+  parent     = get_parent(fine)
+  glue_array = get_adaptivity_glue(fine)
+
+  # The fine mesh is used like any other DistributedDiscreteModel
+  reffe = ReferenceFE(lagrangian, Float64, 1)
+  V     = TestFESpace(fine, reffe)
+  uh    = interpolate(x -> sum(x), V)
+end
+
+with_debug(main)
+```
+
+`refine(model, r)` accepts an integer or a tuple refinement factor; `refine(model, 2)`
+halves the cell size in every direction.
+
+## Sub-communicators
+
+AMR workflows sometimes involve different sets of ranks at different stages (e.g. a coarse mesh on 2 ranks that is refined onto 8). Use `generate_subparts` to carve out a sub-communicator of `n` ranks from the full set:
+
+```julia
+fine_ranks   = distribute(LinearIndices((8,)))
+coarse_ranks = generate_subparts(fine_ranks, 2)  # ranks 1–2 only
+
+if i_am_in(coarse_ranks)
+  # only executed on ranks 1 and 2
+end
+```
+
+Ranks outside the sub-communicator receive an inert placeholder and skip the guarded block.
+
+## Redistribution
+
+After refinement the load may become unbalanced. `redistribute` moves a distributed mesh from one partition layout to another and returns a `RedistributeGlue` that records the data movement:
+
+```julia
+# Move from a 2×2 to a 4×1 partition
+new_desc       = DistributedCartesianDescriptor(ranks, (4,1), (0,1,0,1), (8,8))
+new_model, glue = redistribute(old_model, new_desc)
+```
+
+The `RedistributeGlue` can then migrate free-DOF values from the old to the new partition:
+
+```julia
+redistribute_free_values(
+  new_free_values, new_fespace,
+  old_free_values, old_dir_values, old_fespace,
+  new_model, glue
+)
+```
+
+Redistribution for unstructured meshes is handled inside `GridapP4est.jl`.
+
+## API
+
+### Adapted models
+
+```@docs
+GridapDistributed.DistributedAdaptedDiscreteModel
+```
+
+### Redistribution
+
+```@docs
+redistribute
+GridapDistributed.RedistributeGlue
+```
+
+### Sub-communicators
+
+```@docs
+GridapDistributed.generate_subparts
+GridapDistributed.i_am_in
+```
diff --git a/docs/src/algebra.md b/docs/src/algebra.md
new file mode 100644
index 00000000..5203e029
--- /dev/null
+++ b/docs/src/algebra.md
@@ -0,0 +1,65 @@
+# [Algebra](@id algebra)
+
+`GridapDistributed.jl` relies on [`PartitionedArrays.jl`](https://github.com/fverdugo/PartitionedArrays.jl) for all distributed linear algebra. Users interacting with the assembled system — for example, to inspect residuals or feed vectors into a custom solver — will work directly with PartitionedArrays types. This page gives a brief conceptual overview; the [PartitionedArrays documentation](https://fverdugo.github.io/PartitionedArrays.jl/stable/) is the authoritative reference.
+
+## Distributed index ranges: `PRange`
+
+A `PRange` is a distributed range of integers. It is the backbone of every distributed array: it records, for each local index on each rank, its global ID and which rank owns it.
+
+Each rank's slice of a `PRange` is an `AbstractLocalIndices` object that exposes the local index layout:
+
+| Function | Returns |
+|---|---|
+| `local_to_global(idx)` | Global ID for each local index (owned + ghost) |
+| `own_to_global(idx)` | Global IDs of owned (non-ghost) indices |
+| `ghost_to_global(idx)` | Global IDs of ghost indices |
+| `own_to_local(idx)` | Local positions of owned indices |
+| `ghost_to_local(idx)` | Local positions of ghost indices |
+| `local_to_owner(idx)` | Owner rank for each local index |
+
+Access the local indices on each rank via `partition`:
+
+```julia
+map(partition(prange)) do indices
+  lid_to_gid = local_to_global(indices)
+end
+```
+
+`PRanges` will be used as axes for distributed vectors and matrices, and to define the communication pattern for synchronizing ghost values.
+
+## Distributed vectors: `PVector`
+
+A `PVector` is a distributed dense vector. Each rank stores a local segment that includes both **owned** entries (for which this rank is authoritative) and **ghost** entries (copies of entries owned by neighboring ranks).
+
+Two operations synchronize the distributed state:
+
+- `consistent!(x)` — broadcasts owned values to ghost copies on neighboring ranks. Call
+  this after a solve to ensure ghost entries reflect the current solution.
+- `assemble!(x)` — accumulates ghost contributions into the owning rank's entry. Used
+  during assembly to collect off-rank contributions.
+
+Both return a task; wait for completion with `|> wait` or `fetch`.
+
+A normal workflow would be:
+
+```julia
+  map(partition(x)) do x
+    # Do something locally with the values of x
+    # NO communication is allowed here!
+  end
+  consistent!(x) |> wait # Synchronize ghost values after local updates
+```
+
+## Distributed sparse matrices: `PSparseMatrix`
+
+A `PSparseMatrix` is a distributed sparse matrix stored in compressed form. Each rank owns a block of rows. Non-zero entries in off-rank columns are stored locally and communicated as needed during matrix-vector products (`mul!`).
+
+## GridapDistributed additions
+
+`GridapDistributed.jl` adds a small set of utilities on top of PartitionedArrays to support the FE assembly pipeline.
+
+### API
+
+```@docs
+GridapDistributed.permuted_variable_partition
+```
diff --git a/docs/src/assembly.md b/docs/src/assembly.md
new file mode 100644
index 00000000..eabec8e5
--- /dev/null
+++ b/docs/src/assembly.md
@@ -0,0 +1,80 @@
+# [Assembly](@id assembly)
+
+`GridapDistributed.jl` provides three assembly strategies that control how local
+contributions from different ranks are combined into the global system. These are implemented as Gridap `AssemblyStrategy` instances, and can be specified via the `strategy` argument of `SparseMatrixAssembler`.
+
+```julia
+assem = SparseMatrixAssembler(U, V, strategy)
+op = AffineFEOperator(a, l, U, V, assem)
+```
+
+## `Assembled` (default)
+
+Standard parallel assembly:
+
+- Each rank integrates over its **owned** cells. Contributions to owned rows coming from other ranks are communicated and assembled.
+- The result is a fully assembled system, where each processor holds the correct portions of the global matrix and vector for its owned rows.
+- Extra rows corresponding to ghost DOFs are allocated as cache for in-place re-assembly (e.g. for nonlinear or transient problems), but may contain garbage at all times.
+
+```julia
+assem = SparseMatrixAssembler(U, V, Assembled())
+```
+
+Unless you know what you are doing, this is the assembly strategy you should use.
+
+## `SubAssembled`
+
+- Each rank integrates over its **owned** cells, and assembles the local contributions for both owned and ghost rows. No communication of ghost contributions is performed.
+- The result is a sub-assembled system, where local matrices are incomplete (sub-assembled). As a consequence, matrix-vector products require assembly of the resulting vector (which makes it a bit more expensive than the `Assembled` case).
+- The advantage: The row and column layout of the resulting system is closer to the DOF layout of the FESpace. This is ideal for Domain Decomposition methods (or other geometrical solvers).
+
+```julia
+assem = SparseMatrixAssembler(U, V, SubAssembled())
+```
+
+## `LocallyAssembled`
+
+- Each rank integrates over its **owned and ghost** cells. Contributions to ghost rows are discarded, and no communication is performed. **This assumes that all local contributions to owned rows can be computed without communication**. This is in general **NOT** true.
+- If the invariant holds, the result is a fully assembled system like in the `Assembled` case, but without the communication cost.
+- No extra rows corresponding to ghost DOFs are required for in-place re-assembly.
+
+```julia
+assem = SparseMatrixAssembler(U, V, LocallyAssembled())
+```
+
+!!! warning
+    `LocallyAssembled` produces incorrect results if the assumption above is violated.
+    Use only when you are sure that owned-row contributions are purely local.
+
+## Changing the sparse matrix format
+
+The default format is `SparseMatrixCSC`. To use a different format (e.g. CSR for
+better SpMV performance), pass a custom assembler:
+
+```julia
+using SparseMatricesCSR
+assem = SparseMatrixAssembler(
+  SparseMatrixCSR{0,Float64,Int},
+  Vector{Float64},
+  U, V, strategy
+)
+op = AffineFEOperator(a, l, U, V, assem)
+```
+
+## API
+
+### Strategies
+
+```@docs
+Assembled
+SubAssembled
+LocallyAssembled
+```
+
+### Builder types
+
+```@docs
+GridapDistributed.DistributedArrayBuilder
+GridapDistributed.DistributedCounter
+GridapDistributed.DistributedAllocation
+```
diff --git a/docs/src/backends.md b/docs/src/backends.md
new file mode 100644
index 00000000..cae27f48
--- /dev/null
+++ b/docs/src/backends.md
@@ -0,0 +1,102 @@
+# [Backends and Philosophy](@id backends)
+
+`GridapDistributed.jl` is a parallel extension of [`Gridap.jl`](https://github.com/gridap/Gridap.jl) for distributed-memory environments. The central design goal is **minimal API change**: a simulation written for a single process can be made parallel with only a handful of modifications.
+
+## A sequential Poisson problem
+
+Consider the Poisson equation
+
+```math
+-\Delta u = f \quad \text{in } \Omega = (0,1)^2, \qquad u = 0 \text{ on } \partial\Omega,
+```
+
+solved with Gridap on a single process:
+
+```julia
+using Gridap
+
+model = CartesianDiscreteModel((0,1,0,1), (8,8))
+
+reffe = ReferenceFE(lagrangian, Float64, 1)
+V  = TestFESpace(model, reffe, dirichlet_tags="boundary")
+U  = TrialFESpace(V, x -> 0.0)
+
+Ω  = Triangulation(model)
+dΩ = Measure(Ω, 2)
+
+a(u,v) = ∫( ∇(v)⋅∇(u) )dΩ
+l(v)   = ∫( v*1 )dΩ
+
+op = AffineFEOperator(a, l, U, V)
+uh = solve(op)
+```
+
+## Converting to distributed
+
+To run the same problem across multiple MPI ranks, two changes are needed:
+
+1. Wrap the driver into a function that accepts a `distribute` function argument and the number of processors `parts`.
+2. Distribute the model. In this case, we can use `CartesianDiscreteModel(ranks, parts, domain, cells)`.
+
+Everything else will dispatch on the distributed model, and thus the rest of the driver can be left unchanged:
+
+```julia
+using Gridap, GridapDistributed, PartitionedArrays
+
+function poisson(distribute, parts)
+  ranks = distribute(LinearIndices((prod(parts),)))
+
+  model = CartesianDiscreteModel(ranks, parts, (0,1,0,1), (8,8))
+
+  reffe = ReferenceFE(lagrangian, Float64, 1)
+  V  = TestFESpace(model, reffe, dirichlet_tags="boundary")
+  U  = TrialFESpace(V, x -> 0.0)
+
+  Ω  = Triangulation(model)
+  dΩ = Measure(Ω, 2)
+
+  a(u,v) = ∫( ∇(v)⋅∇(u) )dΩ
+  l(v)   = ∫( v*1 )dΩ
+
+  op = AffineFEOperator(a, l, U, V)
+  solve(op)
+end
+```
+
+`distribute` is a function that maps a `LinearIndices` range onto a
+set of MPI ranks and returns an array of rank IDs.
+
+## The two backends
+
+### `with_debug` — single-process simulation
+
+Unfortunately, good MPI debugging tools are scarce. Moreover, they never integrate well within Julia's development workflow (that is using `Revise.jl` to edit code and re-run it interactively).
+
+To facilitate development and debugging, `PartitionedArrays.jl` provides a `DebugArray` backend that **simulates multiple MPI ranks inside a single Julia process**. This allows you to print, inspect, and debug distributed objects within the REPL using your ordinary Julia workflow. To use it, wrap your driver function with `with_debug`:
+
+```julia
+with_debug() do distribute
+  poisson(distribute, (2,2))
+end
+```
+
+Use this during development and for running tests in CI without MPI.
+
+!!!note
+    The only thing `DebugArray` does is wrap serial arrays with `DebugArray`, which is a serial array that does not allow scalar indexing (e.g. `a[i]`). If you know what you are doing, you can directly use arrays and call `poisson(collect, (2,2))`.
+
+### `with_mpi` — true MPI execution
+
+Once the code is working with `with_debug`, you can switch to the real MPI backend by replacing `with_debug` with `with_mpi`. This will run the code across multiple processes using MPI.
+
+Place the following at the bottom of `poisson.jl`:
+
+```julia
+with_mpi() do distribute
+  poisson(distribute, (2,2))
+end
+```
+
+```bash
+mpiexecjl --project=. -n 4 julia poisson.jl
+```
diff --git a/docs/src/fespaces.md b/docs/src/fespaces.md
new file mode 100644
index 00000000..c2654e41
--- /dev/null
+++ b/docs/src/fespaces.md
@@ -0,0 +1,81 @@
+# [FE Spaces and Cell Data](@id fespaces)
+
+## Distributed FESpaces
+
+`DistributedSingleFieldFESpace` is the parallel counterpart of Gridap's
+`SingleFieldFESpace`. It is constructed with the same syntax:
+
+```julia
+reffe = ReferenceFE(lagrangian, Float64, 1)
+V = TestFESpace(model, reffe, dirichlet_tags="boundary")
+U = TrialFESpace(V, u_dirichlet)
+```
+
+where `model` is a `DistributedDiscreteModel`.
+
+Internally, each rank holds a local `FESpace` on its local portion of the mesh
+(owned + ghost cells). These are accessible via `local_views`:
+
+```julia
+map(local_views(V)) do local_V
+  println("Local ndofs = ", num_free_dofs(local_V))
+end
+```
+
+This API also extends to multi-field spaces.
+
+## Global DOF numbering and ownership
+
+Global DOF IDs are stored as a `PRange` (see [Algebra](@ref algebra)), which can be accessed via `get_free_dof_gids(V)`. Some general considerations:
+
+- DOFs are **owned by the rank that owns the cell(s) they are attached to**. DOFs on the interface between two partitions are owned by the rank **with the highest rank ID**. This convention ensures that each DOF is owned by exactly one rank and appears as a ghost DOF on all neighboring ranks.
+- The global DOF numbering is such that DOFs owned by rank 1 come first, followed by those owned by rank 2, and so on. This means that the global DOF IDs are contiguous within each rank's owned portion, which is convenient for assembly and solver interfaces.
+- We preserve the local DOF numbering of the serial `FESpace` on each rank. That means that locally, owned DOFs are not necessarily (locally) numbered before ghost DOFs.
+
+## Ghost DOF consistency
+
+Within the library, we internally maintain the consistency of ghost DOF values as needed. When implementing your own low-level operations, it is important to do the same. In particular, before using an `FEFunction` for assembly it is important to ensure that ghost DOF values are consistent with their owned counterparts. See for instance the following example:
+
+```julia
+uh = zero(U)
+r(v) = ∫( v*uh )dΩ
+
+x = get_free_dof_values(uh)
+map(partition(x)) do x
+  # Do something locally with the values of x
+  # After this, ghost values may be inconsistent!
+end
+
+b = assemble_vector(r, V) # May be WRONG!
+
+consistent!(x) |> wait
+b = assemble_vector(r, V) # OK
+```
+
+## API
+
+### Abstract interface
+
+```@docs
+GridapDistributed.DistributedFESpace
+```
+
+### Single-field spaces
+
+```@docs
+GridapDistributed.DistributedSingleFieldFESpace
+```
+
+### Multi-field spaces
+
+```@docs
+GridapDistributed.DistributedMultiFieldFESpace
+```
+
+### Cell data
+
+```@docs
+GridapDistributed.DistributedCellField
+GridapDistributed.DistributedMeasure
+GridapDistributed.DistributedDomainContribution
+```
diff --git a/docs/src/geometry.md b/docs/src/geometry.md
new file mode 100644
index 00000000..7af262e9
--- /dev/null
+++ b/docs/src/geometry.md
@@ -0,0 +1,124 @@
+# [Geometry](@id geometry)
+
+## The concept of a distributed mesh
+
+Each MPI rank holds a local portion of the global mesh divided into two parts:
+
+- **Owned cells** — cells that this rank is responsible for. Each cell is owned by exactly one rank.
+- **Ghost cells** — cells owned by neighboring ranks, but needed locally to ensure consistency during local operations.
+
+This overlapping layout means every rank has (in most cases) enough context to perform local operations to its owned cells without communication. Ghost cells are synchronized as needed to ensure consistency across ranks.
+
+## `DistributedDiscreteModel`
+
+The parallel counterpart of Gridap's `DiscreteModel`. Internally it wraps a distributed array of local `DiscreteModel` objects, one per rank.
+
+### Cartesian meshes
+
+The simplest way to build a distributed mesh is `CartesianDiscreteModel`, which partitions a structured Cartesian grid according to a `parts` tuple:
+
+```julia
+parts = (2, 2) # 2×2 rank layout
+ranks = distribute(LinearIndices((prod(parts),)))
+
+# 2D Cartesian mesh on [0,1]² with 8×8 cells
+model = CartesianDiscreteModel(ranks, parts, (0,1,0,1), (8,8))
+```
+
+In this case, each rank holds a `3x3` block of cells, of which only `2x2` are owned and the remaining are ghost cells from neighboring ranks.
+
+### Distributing a serial mesh
+
+An existing serial `DiscreteModel` can be distributed by providing a cell-to-rank assignment vector:
+
+```julia
+serial_model  = DiscreteModelFromFile("mesh.msh")
+cell_to_part  = ...  # integer vector of length num_cells(serial_model)
+
+model = DiscreteModel(ranks, serial_model, cell_to_part)
+```
+
+The `cell_to_part` vector assigns each cell in the serial model to a rank. This array can be generated by a graph partitioner like METIS or Scotch, or by a custom geometric criterion.
+
+This is the mechanism used internally by [`GridapGmsh.jl`](https://github.com/gridap/GridapGmsh.jl): it reads the Gmsh file on rank 0, partitions it with METIS, and distributes it via this function.
+
+## Accessing local data: `local_views` and `map`
+
+The central idiom in `GridapDistributed.jl` is `map` over `local_views`:
+
+```julia
+map(local_views(model)) do local_model
+  # This block runs independently on every rank
+  n = num_cells(local_model)   # includes ghost cells
+end
+```
+
+`local_views(x)` returns the distributed array of per-rank local objects. `map` applies a function to each local piece simultaneously (sequentially in `DebugArray` mode; one MPI process per rank in `MPIArray` mode). Most distributed objects support `local_views`: models, triangulations, FE spaces, cell fields, vectors, matrices.
+
+`get_parts(x)` returns the rank array associated with `x`, useful when you need to build new distributed objects with the same communicator.
+
+## Global cell and face IDs
+
+```julia
+cell_gids = get_cell_gids(model)       # PRange of global cell IDs
+face_gids = get_face_gids(model, d)    # PRange of global d-face IDs
+```
+
+These `PRange` objects (see [Algebra](@ref algebra)) record, for each local cell/face, its global ID and which rank owns it. They are the foundation for assembling global sparse matrices and vectors.
+
+## Ghost vs. no-ghost triangulations
+
+Triangulations can include or exclude ghost cells:
+
+```julia
+Ωg  = Triangulation(model, with_ghost) # owned + ghost cells (default)
+Ωng = Triangulation(model, no_ghost)   # owned cells only
+```
+
+Generally speaking, triangulations used for assembly should exclude ghost cells to avoid double-counting contributions, while triangulations used to create other objects (e.g. FE spaces) should include ghost cells to ensure overlapping consistency.
+
+The default is `no_ghost`, but no special care is needed when building FE spaces.
+
+## API
+
+### Discrete models
+
+```@docs
+GridapDistributed.DistributedDiscreteModel
+GridapDistributed.GenericDistributedDiscreteModel
+```
+
+### Global index ranges
+
+```@docs
+get_cell_gids
+get_face_gids
+```
+
+### Local data access
+
+```@docs
+local_views
+get_parts
+```
+
+### Ghost control
+
+```@docs
+with_ghost
+no_ghost
+```
+
+### Triangulations
+
+```@docs
+GridapDistributed.DistributedTriangulation
+```
+
+### Supporting types
+
+```@docs
+GridapDistributed.DistributedGrid
+GridapDistributed.DistributedGridTopology
+GridapDistributed.DistributedFaceLabeling
+```
diff --git a/docs/src/getting-started.md b/docs/src/getting-started.md
deleted file mode 100644
index c580ebe2..00000000
--- a/docs/src/getting-started.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Getting Started
-
-## Installation requirements
-
-GridapDistributed is tested on Linux, but it should be also possible to use it on Mac OS and Windows since it is written exclusively in Julia and it only depends on registered Julia packages.
-
-## Installation
-
-GridapDistributed is a registered package. Thus, the installation should be straight forward using the Julia's package manager [Pkg](https://julialang.github.io/Pkg.jl/v1/). To this end, open the Julia REPL (i.e., execute the `julia` binary), type `]` to enter package mode, and install GridapDistributed as follows
-
-```julia
-pkg> add GridapDistributed
-```
-
-You will also need the `PartitionedArrays` package. 
-
-```julia
-pkg> add PartitionedArrays
-```
-
-If you want to leverage the satellite packages of `GridapDistributed.jl`, i.e., `GridapPETSc.jl`, `GridapP4est.jl`, and/or `GridapGmsh.jl`, you have to install them separately as 
-
-```julia
-pkg> add GridapPETSc
-```
-
-```julia
-pkg> add GridapGmsh
-```
-
-```julia
-pkg> add GridapP4est
-```
-
-Please note that these three packages depend on binary builds of the corresponding libraries they wrap (i.e., PETSc, Gmsh, and P4est). By default, they will leverage binary builds available at the Julia package registry. However, one may also use custom installations of these libraries. See the documentation of the corresponding package for more details
-
-
-For further information about how to install and manage Julia packages, see the
-[Pkg documentation](https://julialang.github.io/Pkg.jl/v1/).
-
-## Further steps
-
-If you are new to the `Gridap` ecosystem of packages, we recommend that you first follow the [Gridap Tutorials](https://gridap.github.io/Tutorials/dev/) step by step in order to get familiar with the `Gridap.jl` library. `GridapDistributed.jl` and `Gridap.jl` share almost the same high-level API. Therefore, some familiarity with `Gridap.jl` is highly recommended (if not essential) before starting with `GridapDistributed.jl`. 
-
-If you are already familiarized with `Gridap.jl`, we recommend you to start straight away with the following [tutorial](https://gridap.github.io/Tutorials/dev/pages/t016_poisson_distributed).
\ No newline at end of file
diff --git a/docs/src/index.md b/docs/src/index.md
index 9fcca624..e386e588 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,46 +1,39 @@
 # GridapDistributed.jl
 
-Documentation of the `GridapDistributed.jl` library.
+**Parallel finite element computations on distributed-memory machines.**
 
-!!! note
+`GridapDistributed.jl` extends [`Gridap.jl`](https://github.com/gridap/Gridap.jl) to distributed-memory parallel environments using MPI. It mirrors the Gridap sequential API almost exactly: a simulation script written for a single process can be made parallel with minimal changes.
 
-     These documentation pages are under construction.
+## Installation
 
-## Introduction
+`GridapDistributed.jl` is a registered Julia package. We recommend a minimum environment containing `Gridap` (our Finite Elements backend) and `PartitionedArrays` (our distributed arrays backend). To install it, simply run:
 
-The ever-increasing demand for resolution and accuracy in mathematical models of physical processes governed by systems of Partial Differential Equations (PDEs) 
-can only be addressed using fully-parallel advanced numerical discretization methods and scalable solution methods, thus able to exploit the vast amount of computational resources in state-of-the-art supercomputers. To this end, `GridapDistributed.jl` is a registered software package which provides 
-fully-parallel distributed memory data structures and associated methods
-for the Finite Element (FE) numerical solution of PDEs on parallel computers. Thus, it can be run on multi-core CPU desktop computers at small scales, as well as on HPC clusters and supercomputers at medium/large scales. The data structures in `GridapDistributed.jl` are designed to mirror as far as possible their counterparts in the `Gridap.jl` Julia software package, while implementing/leveraging most of their abstract interfaces. As a result, sequential Julia scripts written in the high-level Application Programming Interface (API) of `Gridap.jl` can be used verbatim up to minor adjustments in a parallel distributed memory context using `GridapDistributed.jl`.
-This equips end-users with a tool for the development of simulation codes able to solve real-world application problems on massively parallel supercomputers while using a highly expressive, compact syntax, that resembles mathematical notation. This is indeed one of the main advantages of `GridapDistributed.jl` and a major design goal that we pursue.
+```julia
+pkg> add Gridap, GridapDistributed, PartitionedArrays
+```
 
-In order to scale FE simulations to large core counts, the mesh used to discretize the computational domain on which the PDE is posed must be partitioned (distributed) among the parallel tasks such that each of these only holds a local portion of the global mesh. The same requirement applies to the rest of data structures in the FE simulation pipeline, i.e., FE space, linear system, solvers, data output, etc. The local portion of each task is composed by a set of cells that it owns, i.e., the **local cells** of the task, and a set of off-processor cells (owned by remote processors) which are in touch with its local cells, i.e., the **ghost cells** of the task.
-This overlapped mesh partition is used by `GridapDistributed.jl`, among others, to exchange data among nearest neighbors, and to glue together global Degrees of Freedom (DoFs) which are sitting on the interface among subdomains. Following this design principle, `GridapDistributed.jl` provides scalable parallel data structures and associated methods for simple grid handling (in particular, Cartesian-like meshes of arbitrary-dimensional, topologically n-cube domains), FE spaces setup, and distributed linear system assembly. It is in our future plans to provide highly scalable linear and nonlinear solvers tailored for the FE discretization of PDEs (e.g., linear and nonlinear matrix-free geometric multigrid and domain decomposition preconditioners). In the meantime, however, `GridapDistributed.jl` can be combined with other Julia packages in order to realize the full potential required in real-world applications. These packages and their relation with `GridapDistributed.jl` are overviewed in the next section. 
+## Documentation overview
 
-## Building blocks and composability
+| Section | What you will find |
+|---|---|
+| [Backends](@ref backends) | Design goals, serial-to-distributed conversion, debug vs MPI backends |
+| [Algebra](@ref algebra) | PartitionedArrays overview: `PRange`, `PVector`, `PSparseMatrix` |
+| [Geometry](@ref geometry) | Distributed meshes, ghost cells, Cartesian and unstructured models |
+| [FESpaces](@ref fespaces) | DOF numbering, ghost DOFs, multi-field spaces, cell fields |
+| [Assembly](@ref assembly) | `Assembled`, `SubAssembled`, `LocallyAssembled` strategies |
+| [Adaptivity](@ref adaptivity) | AMR interface, uniform refinement, redistribution, GridapP4est |
+| [Visualization](@ref visualization) | Parallel VTK output for ParaView |
 
-The figure below depicts the relation among `GridapDistributed.jl` and other packages in the Julia package ecosystem. The interaction of `GridapDistributed.jl` and its dependencies is mainly designed with separation of concerns in mind towards high composability and modularity. On the one hand, `Gridap.jl` provides a rich set of abstract types/interfaces suitable for the FE solution of PDEs. It also provides realizations (implementations) of these abstractions tailored to serial/multi-threaded computing environments. `GridapDistributed.jl` **implements** these abstractions for parallel distributed-memory computing environments. To this end, `GridapDistributed.jl` also leverages (**uses**) the serial realizations in `Gridap.jl` and associated methods to handle the local portion on each parallel task. (See arrow labels in the figure below.)  On the other hand, `GridapDistributed.jl` relies on `PartitionedArrays.jl` in order to handle the parallel execution model (e.g., message-passing via the Message Passing Interface (MPI)), global data distribution layout, and communication among tasks. `PartitionedArrays.jl` also provides a parallel implementation of partitioned global linear systems (i.e., linear algebra vectors and sparse matrices) as needed in grid-based numerical simulations. While `PartitionedArrays.jl` is an stand-alone package, segregated from `GridapDistributed.jl`, it was designed with parallel FE packages such as `GridapDistributed.jl` in mind. In any case, `GridapDistributed.jl` is designed so that a different distributed linear algebra library from `PartitionedArrays.jl` might be used as well, as far as it is able to provide the same functionality. 
+## Package ecosystem
 
+`GridapDistributed.jl` is integrated into the Gridap ecosystem. In particular, it can be used together with most packages within the ecosystem, such as:
 
-| ![fig:packages](./packages_sketchy.png) |
-|:--:|
-|`GridapDistributed.jl` and its relation to other packages in the Julia package ecosystem. In this diagram, each rectangle represents  a Julia package, while the (directed) arrows represent relations (dependencies) among packages. Both the direction of the arrow and the label attached to the arrows are used to denote the nature of the relation. Thus, e.g., `GridapDistributed.jl` depends on `Gridap.jl` and `PartitionedArrays.jl` , and GridapPETSc depends on `Gridap.jl`  and `PartitionedArrays.jl` . Note that, in the diagram, the arrow direction is relevant, e.g., GridapP4est depends on `GridapDistributed.jl` but not conversely.|
-
-## How to use this documentation
-
-* The first step for new users is to visit the [Getting Started](@ref) page.
-
-* A set of tutorials written as Jupyter notebooks and html pages are available [here](https://github.com/gridap/Tutorials).
-
-* The detailed documentation is in the rest of sections of the left hand side menu. (WIP)
+- [`GridapGmsh.jl`](https://github.com/gridap/GridapGmsh.jl) for mesh generation and import.
+- [`GridapSolvers.jl`](https://github.com/gridap/GridapSolvers.jl) and [`GridapPETSc.jl`](https://github.com/gridap/GridapPETSc.jl) for distributed solvers and preconditioners.
+- [`GridapP4est.jl`](https://github.com/gridap/GridapP4est.jl) for parallel adaptive mesh refinement.
+- [`GridapEmbedded.jl`](https://github.com/gridap/GridapEmbedded.jl) for embedded boundary methods.
+- [`GridapTopOpt.jl`](https://github.com/zjwegert/GridapTopOpt.jl) for parallel topology optimization.
 
-* Guidelines for developers of the Gridap project is found in the [Gridap wiki](https://github.com/gridap/Gridap.jl/wiki) page.
-
-## Julia educational resources
-
-A basic knowledge of the Julia programming language is needed to use the GridapDistributed package.
-Here, one can find a list of resources to get started with this programming language.
-
-* First steps to learn Julia form the [Gridap wiki](https://github.com/gridap/Gridap.jl/wiki/Start-learning-Julia) page.
-* Official webpage [docs.julialang.org](https://docs.julialang.org/)
-* Official list of learning resources [julialang.org/learning](https://julialang.org/learning/)
\ No newline at end of file
+| ![packages](./packages_sketchy.png) |
+|:--:|
+| `GridapDistributed.jl` and its relation to other packages. `Gridap.jl` provides the sequential FE abstractions; `PartitionedArrays.jl` provides the distributed arrays and MPI backend; `GridapDistributed.jl` implements the Gridap abstractions on top of PartitionedArrays. |
diff --git a/docs/src/visualization.md b/docs/src/visualization.md
new file mode 100644
index 00000000..3f2f76c8
--- /dev/null
+++ b/docs/src/visualization.md
@@ -0,0 +1,39 @@
+# [Visualization](@id visualization)
+
+`GridapDistributed.jl` supports parallel VTK output via `writevtk`, using the same
+interface as sequential Gridap. Each MPI rank writes its local portion as a `.vtu` file;
+the VTK parallel format (`.pvtu`) is written by rank 0 and references all pieces.
+Open the `.pvtu` file in [ParaView](https://www.paraview.org/) to visualise the global
+solution.
+
+## Static output
+
+```julia
+writevtk(Ω, "solution", cellfields=["uh" => uh, "eh" => eh])
+```
+
+This generates `solution.pvtu` (the parallel descriptor) and one `solution_n.vtu` per
+rank. Ghost cells are omitted by default; each cell appears in exactly one `.vtu` file.
+
+## Time-dependent output
+
+Use `createpvd` / `savepvd` to collect snapshots from a time loop into a single `.pvd`
+collection that ParaView can animate:
+
+```julia
+pvd = createpvd(get_parts(model), "simulation")
+
+for (t, uh) in time_steps
+  pvd[t] = createvtk(Ω, "solution_$(t)", cellfields=["uh" => uh])
+end
+
+savepvd(pvd)
+```
+
+`get_parts(model)` provides the rank array needed to coordinate parallel file I/O.
+
+## API
+
+```@docs
+GridapDistributed.DistributedPvd
+```

From e1d6e4cbdb3d50e8d4f5cf318a18259f3d6690be Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Mon, 18 May 2026 12:27:21 +1000
Subject: [PATCH 46/48] More docs

---
 docs/make.jl              |  2 +-
 docs/src/adaptivity.md    | 54 +++++++++++++++++----------------------
 docs/src/algebra.md       | 10 ++++++++
 docs/src/assembly.md      | 10 +++++++-
 docs/src/fespaces.md      | 24 +++++++++++++++++
 docs/src/geometry.md      | 11 ++++++++
 docs/src/index.md         | 26 ++++++++++---------
 docs/src/visualization.md |  1 +
 8 files changed, 94 insertions(+), 44 deletions(-)

diff --git a/docs/make.jl b/docs/make.jl
index 0a13c924..1bd2438d 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,4 +1,4 @@
-using Documenter, GridapDistributed
+using Documenter, GridapDistributed, Gridap
 
 pages = [
   "Home"          => "index.md",
diff --git a/docs/src/adaptivity.md b/docs/src/adaptivity.md
index a55695bd..ed86167d 100644
--- a/docs/src/adaptivity.md
+++ b/docs/src/adaptivity.md
@@ -1,7 +1,10 @@
 # [Adaptivity](@id adaptivity)
 
-`GridapDistributed.jl` provides an interface for the adaptive mesh refinement (AMR) abstractions defined in `Gridap.jl`. Moreover, we also provide redistribution of meshes.
-The framework here works out of the box for Gridap's built-in refinement routines, but also interfaces with [`GridapP4est.jl`](https://github.com/gridap/GridapP4est.jl) for scalable AMR on unstructured meshes.
+`GridapDistributed.jl` provides an interface for the adaptive mesh refinement (AMR)
+abstractions defined in `Gridap.jl`. The framework works out of the box for Gridap's
+built-in refinement routines. For scalable AMR on unstructured meshes with full MPI
+performance, see [`GridapP4est.jl`](https://github.com/gridap/GridapP4est.jl), which
+integrates the p4est library with `GridapDistributed.jl`.
 
 ## Adapted discrete models
 
@@ -39,7 +42,9 @@ halves the cell size in every direction.
 
 ## Sub-communicators
 
-AMR workflows sometimes involve different sets of ranks at different stages (e.g. a coarse mesh on 2 ranks that is refined onto 8). Use `generate_subparts` to carve out a sub-communicator of `n` ranks from the full set:
+AMR workflows sometimes involve different sets of ranks at different stages (e.g. a coarse
+mesh on 2 ranks that is refined onto 8). Use `generate_subparts` to carve out a
+sub-communicator of `n` ranks from the full set:
 
 ```julia
 fine_ranks   = distribute(LinearIndices((8,)))
@@ -52,46 +57,35 @@ end
 
 Ranks outside the sub-communicator receive an inert placeholder and skip the guarded block.
 
-## Redistribution
+## API
 
-After refinement the load may become unbalanced. `redistribute` moves a distributed mesh from one partition layout to another and returns a `RedistributeGlue` that records the data movement:
+### Adapted models
 
-```julia
-# Move from a 2×2 to a 4×1 partition
-new_desc       = DistributedCartesianDescriptor(ranks, (4,1), (0,1,0,1), (8,8))
-new_model, glue = redistribute(old_model, new_desc)
+```@docs
+GridapDistributed.DistributedAdaptedDiscreteModel
 ```
 
-The `RedistributeGlue` can then migrate free-DOF values from the old to the new partition:
+### Sub-communicators
 
-```julia
-redistribute_free_values(
-  new_free_values, new_fespace,
-  old_free_values, old_dir_values, old_fespace,
-  new_model, glue
-)
+```@docs
+GridapDistributed.generate_subparts
+GridapDistributed.i_am_in
 ```
 
-Redistribution for unstructured meshes is handled inside `GridapP4est.jl`.
-
-## API
-
-### Adapted models
+### Refinement internals
 
 ```@docs
-GridapDistributed.DistributedAdaptedDiscreteModel
+GridapDistributed.refine_local_models
+GridapDistributed.refine_cell_gids
 ```
 
 ### Redistribution
 
 ```@docs
-redistribute
 GridapDistributed.RedistributeGlue
-```
-
-### Sub-communicators
-
-```@docs
-GridapDistributed.generate_subparts
-GridapDistributed.i_am_in
+GridapDistributed.redistribute
+GridapDistributed.redistribute_cartesian
+GridapDistributed.redistribute_array_by_cells
+GridapDistributed.redistribution_local_indices
+GridapDistributed.redistribute_indices
 ```
diff --git a/docs/src/algebra.md b/docs/src/algebra.md
index 5203e029..47ba97ea 100644
--- a/docs/src/algebra.md
+++ b/docs/src/algebra.md
@@ -62,4 +62,14 @@ A `PSparseMatrix` is a distributed sparse matrix stored in compressed form. Each
 
 ```@docs
 GridapDistributed.permuted_variable_partition
+GridapDistributed.unpermute
+GridapDistributed.locally_repartition
+GridapDistributed.filter_and_replace_ghost
+```
+
+### Block arrays
+
+```@docs
+GridapDistributed.BlockPRange
+GridapDistributed.BlockPArray
 ```
diff --git a/docs/src/assembly.md b/docs/src/assembly.md
index eabec8e5..a2e789e6 100644
--- a/docs/src/assembly.md
+++ b/docs/src/assembly.md
@@ -20,7 +20,8 @@ Standard parallel assembly:
 assem = SparseMatrixAssembler(U, V, Assembled())
 ```
 
-Unless you know what you are doing, this is the assembly strategy you should use.
+!!! note
+    Unless you know what you are doing, this is the assembly strategy you should use.
 
 ## `SubAssembled`
 
@@ -71,10 +72,17 @@ SubAssembled
 LocallyAssembled
 ```
 
+### Assembler type
+
+```@docs
+GridapDistributed.DistributedSparseMatrixAssembler
+```
+
 ### Builder types
 
 ```@docs
 GridapDistributed.DistributedArrayBuilder
 GridapDistributed.DistributedCounter
 GridapDistributed.DistributedAllocation
+GridapDistributed.TrackedArrayAllocation
 ```
diff --git a/docs/src/fespaces.md b/docs/src/fespaces.md
index c2654e41..b7dc4c15 100644
--- a/docs/src/fespaces.md
+++ b/docs/src/fespaces.md
@@ -78,4 +78,28 @@ GridapDistributed.DistributedMultiFieldFESpace
 GridapDistributed.DistributedCellField
 GridapDistributed.DistributedMeasure
 GridapDistributed.DistributedDomainContribution
+GridapDistributed.DistributedCellPoint
+```
+
+### FE functions
+
+```@docs
+GridapDistributed.DistributedFEFunctionData
+GridapDistributed.DistributedSingleFieldFEFunction
+```
+
+### Constant FE space
+
+```@docs
+Gridap.FESpaces.ConstantFESpace(::GridapDistributed.DistributedDiscreteModel; constraint_type, kwargs...)
+```
+
+### DOF numbering utilities
+
+```@docs
+GridapDistributed.generate_gids
+GridapDistributed.generate_posneg_gids
+GridapDistributed.generate_gids_by_color
+GridapDistributed.split_gids_by_color
+GridapDistributed.vcat_gids
 ```
diff --git a/docs/src/geometry.md b/docs/src/geometry.md
index 7af262e9..35c16ecb 100644
--- a/docs/src/geometry.md
+++ b/docs/src/geometry.md
@@ -122,3 +122,14 @@ GridapDistributed.DistributedGrid
 GridapDistributed.DistributedGridTopology
 GridapDistributed.DistributedFaceLabeling
 ```
+
+### Macro-element models
+
+```@docs
+GridapDistributed.MacroDiscreteModel
+GridapDistributed.classify_interfaces
+GridapDistributed.generate_nbors_and_keys
+GridapDistributed.generate_interface_gids
+GridapDistributed.get_local_face_labeling
+GridapDistributed.get_global_face_labeling
+```
diff --git a/docs/src/index.md b/docs/src/index.md
index e386e588..01398ee6 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -12,6 +12,16 @@
 pkg> add Gridap, GridapDistributed, PartitionedArrays
 ```
 
+## Package ecosystem
+
+`GridapDistributed.jl` is integrated into the Gridap ecosystem. In particular, it can be used together with most packages within the ecosystem, such as:
+
+- [`GridapGmsh.jl`](https://github.com/gridap/GridapGmsh.jl) for mesh generation and import.
+- [`GridapSolvers.jl`](https://github.com/gridap/GridapSolvers.jl) and [`GridapPETSc.jl`](https://github.com/gridap/GridapPETSc.jl) for distributed solvers and preconditioners.
+- [`GridapP4est.jl`](https://github.com/gridap/GridapP4est.jl) for parallel adaptive mesh refinement.
+- [`GridapEmbedded.jl`](https://github.com/gridap/GridapEmbedded.jl) for embedded boundary methods.
+- [`GridapTopOpt.jl`](https://github.com/zjwegert/GridapTopOpt.jl) for parallel topology optimization.
+
 ## Documentation overview
 
 | Section | What you will find |
@@ -21,19 +31,11 @@ pkg> add Gridap, GridapDistributed, PartitionedArrays
 | [Geometry](@ref geometry) | Distributed meshes, ghost cells, Cartesian and unstructured models |
 | [FESpaces](@ref fespaces) | DOF numbering, ghost DOFs, multi-field spaces, cell fields |
 | [Assembly](@ref assembly) | `Assembled`, `SubAssembled`, `LocallyAssembled` strategies |
-| [Adaptivity](@ref adaptivity) | AMR interface, uniform refinement, redistribution, GridapP4est |
+| [Adaptivity](@ref adaptivity) | AMR interface, uniform refinement, GridapP4est |
 | [Visualization](@ref visualization) | Parallel VTK output for ParaView |
 
-## Package ecosystem
+## Tutorials
 
-`GridapDistributed.jl` is integrated into the Gridap ecosystem. In particular, it can be used together with most packages within the ecosystem, such as:
-
-- [`GridapGmsh.jl`](https://github.com/gridap/GridapGmsh.jl) for mesh generation and import.
-- [`GridapSolvers.jl`](https://github.com/gridap/GridapSolvers.jl) and [`GridapPETSc.jl`](https://github.com/gridap/GridapPETSc.jl) for distributed solvers and preconditioners.
-- [`GridapP4est.jl`](https://github.com/gridap/GridapP4est.jl) for parallel adaptive mesh refinement.
-- [`GridapEmbedded.jl`](https://github.com/gridap/GridapEmbedded.jl) for embedded boundary methods.
-- [`GridapTopOpt.jl`](https://github.com/zjwegert/GridapTopOpt.jl) for parallel topology optimization.
+If you are new to the `Gridap` ecosystem of packages, we recommend that you first follow the [Gridap Tutorials](https://gridap.github.io/Tutorials/dev/) step by step in order to get familiar with the `Gridap.jl` library. `GridapDistributed.jl` and `Gridap.jl` share almost the same high-level API. Therefore, some familiarity with `Gridap.jl` is highly recommended (if not essential) before starting with `GridapDistributed.jl`
 
-| ![packages](./packages_sketchy.png) |
-|:--:|
-| `GridapDistributed.jl` and its relation to other packages. `Gridap.jl` provides the sequential FE abstractions; `PartitionedArrays.jl` provides the distributed arrays and MPI backend; `GridapDistributed.jl` implements the Gridap abstractions on top of PartitionedArrays. |
+We also provide some distributed-specific tutorials in [Gridap Tutorials](https://gridap.github.io/Tutorials/dev/). Examples with distributed solvers can also be found within the documentation of [`GridapSolvers.jl`](https://github.com/gridap/GridapSolvers.jl).
diff --git a/docs/src/visualization.md b/docs/src/visualization.md
index 3f2f76c8..d294e7c9 100644
--- a/docs/src/visualization.md
+++ b/docs/src/visualization.md
@@ -36,4 +36,5 @@ savepvd(pvd)
 
 ```@docs
 GridapDistributed.DistributedPvd
+GridapDistributed.DistributedVisualizationData
 ```

From dc61f5a2283f7b39860e56fe942176056b3d6eb7 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Mon, 18 May 2026 12:39:22 +1000
Subject: [PATCH 47/48] Minor

---
 docs/src/changelog.md |  3 +++
 docs/src/index.md     | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 docs/src/changelog.md

diff --git a/docs/src/changelog.md b/docs/src/changelog.md
new file mode 100644
index 00000000..a7b10c54
--- /dev/null
+++ b/docs/src/changelog.md
@@ -0,0 +1,3 @@
+# Changelog
+
+
diff --git a/docs/src/index.md b/docs/src/index.md
index 01398ee6..3cc2de9e 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -12,6 +12,12 @@
 pkg> add Gridap, GridapDistributed, PartitionedArrays
 ```
 
+Running `GridapDistributed.jl` requires an MPI implementation (e.g., MPICH, OpenMPI) and the [`MPI.jl` Julia wrapper](https://juliaparallel.org/MPI.jl/stable).
+
+By default, `MPI.jl` will download and link against an MPI artifact (Microsoft MPI on Windows and MPICH on all other platforms). Then drivers should be run using [Julia's wrapper for `mpiexec`](https://juliaparallel.org/MPI.jl/stable/usage/#Julia-wrapper-for-mpiexec).
+
+Within clusters, however, you might want to use the (often custom) MPI implementation provided by your system administrators. To do so, you should use [`MPIPreferences.jl`](https://juliaparallel.org/MPI.jl/stable/configuration/#using_system_mpi) and follow the instructions provided in [the `MPI.jl` documentation](https://juliaparallel.org/MPI.jl/stable/configuration/).
+
 ## Package ecosystem
 
 `GridapDistributed.jl` is integrated into the Gridap ecosystem. In particular, it can be used together with most packages within the ecosystem, such as:
@@ -39,3 +45,22 @@ pkg> add Gridap, GridapDistributed, PartitionedArrays
 If you are new to the `Gridap` ecosystem of packages, we recommend that you first follow the [Gridap Tutorials](https://gridap.github.io/Tutorials/dev/) step by step in order to get familiar with the `Gridap.jl` library. `GridapDistributed.jl` and `Gridap.jl` share almost the same high-level API. Therefore, some familiarity with `Gridap.jl` is highly recommended (if not essential) before starting with `GridapDistributed.jl`
 
 We also provide some distributed-specific tutorials in [Gridap Tutorials](https://gridap.github.io/Tutorials/dev/). Examples with distributed solvers can also be found within the documentation of [`GridapSolvers.jl`](https://github.com/gridap/GridapSolvers.jl).
+
+## Citation
+
+In order to give credit to the `Gridap` and `GridapDistributed` contributors, we simply ask you to cite the `Gridap` main project as indicated [here](https://github.com/gridap/Gridap.jl#how-to-cite-gridap) and the sub-packages you use as indicated in the corresponding repositories. Please, use the reference below in any publication in which you have made use of `GridapDistributed`:
+
+```latex
+@article{Badia2022,
+  doi = {10.21105/joss.04157},
+  url = {https://doi.org/10.21105/joss.04157},
+  year = {2022},
+  publisher = {The Open Journal},
+  volume = {7},
+  number = {74},
+  pages = {4157},
+  author = {Santiago Badia and Alberto F. Martín and Francesc Verdugo},
+  title = {GridapDistributed: a massively parallel finite element toolbox in Julia},
+  journal = {Journal of Open Source Software}
+}
+```

From e491eae8eb1e70023c1149300a230ea779ee34d5 Mon Sep 17 00:00:00 2001
From: Jordi Manyer <jordi.manyer@monash.edu>
Date: Mon, 18 May 2026 16:27:30 +1000
Subject: [PATCH 48/48] Fixed tests

---
 docs/Project.toml     |  1 +
 test/AutodiffTests.jl | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index db33639a..a0838543 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,3 +1,4 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Gridap = "56d4f2e9-7ea1-5844-9cf6-b9c51ca7ce8e"
 GridapDistributed = "f9701e48-63b3-45aa-9a63-9bc6c271f355"
diff --git a/test/AutodiffTests.jl b/test/AutodiffTests.jl
index f86a79d8..ab61fe6c 100644
--- a/test/AutodiffTests.jl
+++ b/test/AutodiffTests.jl
@@ -7,6 +7,32 @@ using PartitionedArrays
 using SparseArrays
 using ForwardDiff
 
+function same_matrix(A,B;debug=false)
+  same_own = map(≈, own_own_values(A), own_own_values(B))
+  same_ghost = map(≈, own_ghost_values(A), own_ghost_values(B))
+
+  if debug
+    so = reduce(&, same_own)
+    println("Same own values: ", so)
+    if !so 
+      display(same_own)
+      map(own_own_values(A),own_own_values(B)) do a, b
+        display(a - b)
+      end
+    end
+    sg = reduce(&, same_ghost)
+    println("Same ghost values: ", sg)
+    if !sg
+      display(same_ghost)
+      map(own_ghost_values(A),own_ghost_values(B)) do a, b
+        display(a - b)
+      end
+    end
+  end
+
+  return reduce(&, same_own) && reduce(&, same_ghost)
+end
+
 function main_sf(distribute,parts)
   ranks = distribute(LinearIndices((prod(parts),)))
 
@@ -35,7 +61,7 @@ function main_sf(distribute,parts)
   uh = interpolate(1.0,U)
   A = jacobian(op,uh)
   A_AD = jacobian(op_AD,uh)
-  @test reduce(&,map(≈,partition(A),partition(A_AD)))
+  @test same_matrix(A,A_AD)
 
   g(v) = ∫(0.5*v⋅v)dΩ
   dg(v) = ∫(uh⋅v)dΩ
@@ -95,7 +121,7 @@ function main_mf(distribute,parts)
   uh, ph = xh
   A = jacobian(op,xh)
   A_AD = jacobian(op_AD,xh)
-  @test reduce(&,map(≈,partition(A),partition(A_AD)))
+  @test same_matrix(A,A_AD)
 
   g((v,q)) = ∫(0.5*v⋅v + 0.5*q*q)dΩ
   dg((v,q)) = ∫(uh⋅v + ph*q)dΩ
@@ -166,7 +192,7 @@ function mf_different_fespace_trians(distribute,parts)
     op = FEOperator(f2,f2_jac,X,X)
     J_fwd = jacobian(op,uh)
 
-    @test reduce(&,map(≈,partition(J),partition(J_fwd)))
+    @test same_matrix(J,J_fwd)
   end
 end
 
@@ -202,7 +228,7 @@ function skeleton_mf_different_fespace_trians(distribute,parts)
     op = FEOperator(f2,f2_jac,X,X)
     J_fwd = jacobian(op,uh)
 
-    @test reduce(&,map(≈,partition(J),partition(J_fwd)))
+    @test same_matrix(J,J_fwd)
   end
 end