From 3f456929e2a0f1f62885437cb7a49f6c9637690c Mon Sep 17 00:00:00 2001 From: Cody Johnston Date: Wed, 29 Apr 2026 16:37:32 +0000 Subject: [PATCH 1/8] Add OrbitalEvaluator and Gaussian cube writer utility Adds a public, host-parallel point-set evaluator for molecular orbitals and densities, plus a self-contained Gaussian cube file writer. The two are deliberately decoupled: OrbitalEvaluator returns plain numerical arrays, and write_cube depends only on Molecule + raw field data. Either can be used independently. OrbitalEvaluator (include/gauxc/orbital_evaluator.hpp, src/orbital_evaluator.cxx) - Constructed from a BasisSet; caches the LocalHostWorkDriver internally so it can be reused across many evaluations for the same molecule. - eval_orbital / eval_orbitals: chi_j(r) = sum_mu C[mu, j] * phi_mu(r), one or many MOs at a time on an arbitrary AoS point set. - eval_density: rho(r) = sum_{mu,nu} D[mu, nu] * phi_mu(r) * phi_nu(r), for a symmetric AO density matrix. - Internally batches points with an adaptive batch size (~16 MB AO scratch per thread) and parallelises the batch loop with OpenMP. Each thread owns its own scratch. - Currently Host-only; the public API takes ExecutionSpace so device backends can be slotted in later without an ABI break. write_cube + CubeGrid (include/gauxc/external/cube.hpp, src/external/cube.cxx) - Standard Gaussian cube text format, byte-compatible with the output of PySCF / Gaussian / NWChem cubegen utilities. - CubeGrid::from_molecule builds a default axis-aligned grid that encloses the molecule with a configurable margin (default 3.0 Bohr, matching PySCF cubegen). - The data block is formatted in parallel: each (ix, iy) row is serialised independently into a pre-sized byte buffer and committed with a single fwrite at the end. The per-value formatter is a hand-rolled %13.5E that matches glibc snprintf bit-for-bit on the fast path (1-2 digit exponent) and falls back to snprintf for the rare 3-digit-exponent case. 
This is the same writer that produced the measured 5x end-to-end speedup over PySCF cubegen on the QDK-Chemistry workloads that motivated this PR. - Self-contained: no third-party dependencies, no GauXC kernel coupling. Tests (tests/orbital_evaluator_test.cxx, [orbital_evaluator] / [cube]): - Water cc-pVDZ kernel checks against direct eval_collocation reference: one-hot MO, multi-MO contraction, identity density, rank-1 density. - CubeGrid layout: first/last point coordinates, iz-fastest ordering. - write_cube round-trip: parses back the written file and verifies header (atoms, origin, axes) and field values to %13.5E precision. - write_cube formatter: byte-exact comparison of the data block against glibc snprintf("%13.5E", v) for a battery of edge-case values (signed zero, near-power-of-ten boundaries, 3-digit exponents). - All 1570 assertions pass; existing [collocation] suite unaffected. --- include/gauxc/external/cube.hpp | 95 ++++++++ include/gauxc/orbital_evaluator.hpp | 105 +++++++++ src/CMakeLists.txt | 1 + src/external/CMakeLists.txt | 3 + src/external/cube.cxx | 299 +++++++++++++++++++++++++ src/orbital_evaluator.cxx | 211 ++++++++++++++++++ tests/CMakeLists.txt | 1 + tests/orbital_evaluator_test.cxx | 332 ++++++++++++++++++++++++++++ 8 files changed, 1047 insertions(+) create mode 100644 include/gauxc/external/cube.hpp create mode 100644 include/gauxc/orbital_evaluator.hpp create mode 100644 src/external/cube.cxx create mode 100644 src/orbital_evaluator.cxx create mode 100644 tests/orbital_evaluator_test.cxx diff --git a/include/gauxc/external/cube.hpp b/include/gauxc/external/cube.hpp new file mode 100644 index 00000000..15ceb034 --- /dev/null +++ b/include/gauxc/external/cube.hpp @@ -0,0 +1,95 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2026, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace GauXC { + +/** @brief Specification of a Gaussian-cube-format 3D rectangular grid. + * + * The grid is axis-aligned with the Cartesian frame (the standard + * cube-format axis vectors are simply (spacing[0], 0, 0), (0, spacing[1], 0), + * (0, 0, spacing[2])). All quantities are in atomic units (Bohr). + * + * Total number of points: nx*ny*nz. Storage cost per scalar field: + * 8*nx*ny*nz bytes (double precision). + */ +struct CubeGrid { + std::array origin{0.0, 0.0, 0.0}; ///< (0,0,0) corner, Bohr + std::array spacing{0.2, 0.2, 0.2}; ///< Step on each axis, Bohr + int64_t nx = 80; + int64_t ny = 80; + int64_t nz = 80; + + /// Total number of grid points. + int64_t num_points() const noexcept { return nx * ny * nz; } + + /** @brief Build a default grid that tightly encloses a molecule. + * + * The bounding box of the atomic centres is extended by `margin` Bohr on + * each side and discretised with the requested number of points along + * each axis. Spacing is chosen so that the first and last grid points + * coincide with the extended bounding-box corners (PySCF cubegen + * convention). + * + * @param mol Molecule whose atomic centres define the bounding box. + * @param nx,ny,nz Number of grid points along each axis. + * @param margin Margin (Bohr) added on each side. Default 3.0 matches + * the PySCF cubegen default. + */ + static CubeGrid from_molecule(const Molecule& mol, + int64_t nx = 80, int64_t ny = 80, + int64_t nz = 80, double margin = 3.0); + + /** @brief Materialise the grid points as an AoS coordinate array. + * + * Returns a vector of length 3*num_points() laid out in row-major + * (ix, iy, iz) order with iz varying fastest, matching the field-element + * ordering expected by `write_cube`. 
Suitable to be passed directly as + * the `points` argument to `OrbitalEvaluator::eval_orbital` / + * `eval_density`. + */ + std::vector points() const; +}; + +/** @brief Write a Gaussian cube file. + * + * Field layout: row-major (ix, iy, iz) with iz varying fastest. Length must + * equal `grid.num_points()`. Values are written in fixed `%13.5E` format + * with six values per line, matching the standard cube-file convention + * produced by PySCF / Gaussian / NWChem. + * + * This routine performs only file I/O; it has no dependency on the GauXC + * numerical kernels and may be used independently of `OrbitalEvaluator`. + * + * @param path Output path. Parent directory must exist. + * @param mol Molecule (atomic numbers + Cartesian coordinates in Bohr) + * written into the cube-file header. + * @param grid Grid specification. + * @param field Length-`grid.num_points()` scalar field. + * @param comment Optional first comment line (the second comment line is + * always "Generated by GauXC"). If empty, a default first + * line is used. + */ +void write_cube(const std::string& path, + const Molecule& mol, + const CubeGrid& grid, + const double* field, + const std::string& comment = ""); + +} // namespace GauXC diff --git a/include/gauxc/orbital_evaluator.hpp b/include/gauxc/orbital_evaluator.hpp new file mode 100644 index 00000000..f0542e3e --- /dev/null +++ b/include/gauxc/orbital_evaluator.hpp @@ -0,0 +1,105 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2026, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once + +#include +#include + +#include +#include + +namespace GauXC { + +/** @brief Evaluate molecular orbitals and densities on arbitrary point sets. 
+ * + * Wraps the host collocation kernel exposed by the local work driver with a + * thread-parallel batched evaluation loop, hiding the driver factory and + * the per-thread AO scratch from callers. The class is designed to be + * constructed once per molecule and reused across many evaluations (e.g. + * one per active-space orbital) without re-initialising the driver. + * + * The evaluator is intentionally decoupled from any particular file format: + * it returns plain numerical arrays. For Gaussian cube file I/O, see + * `gauxc/external/cube.hpp`. + * + * Currently only `ExecutionSpace::Host` is supported; passing any other + * value will throw on construction. Device support can be added later by + * routing through a device-side collocation driver while keeping the same + * signatures. + */ +class OrbitalEvaluator { + public: + /** @brief Construct an evaluator bound to a basis set. + * + * @param basis Basis set (copied internally). + * @param exec Execution space; only `ExecutionSpace::Host` is supported. + */ + explicit OrbitalEvaluator(BasisSet basis, + ExecutionSpace exec = ExecutionSpace::Host); + + ~OrbitalEvaluator() noexcept; + + // Non-copyable, movable + OrbitalEvaluator(const OrbitalEvaluator&) = delete; + OrbitalEvaluator& operator=(const OrbitalEvaluator&) = delete; + OrbitalEvaluator(OrbitalEvaluator&&) noexcept; + OrbitalEvaluator& operator=(OrbitalEvaluator&&) noexcept; + + /// Number of basis functions (rows of the AO matrix). + int32_t nbf() const noexcept; + + /// Underlying basis set. + const BasisSet& basis() const noexcept; + + /** @brief Evaluate a single MO chi(r) = sum_mu C[mu] * phi_mu(r). + * + * @param[in] npts Number of evaluation points. + * @param[in] points AoS array of length 3*npts: (x0,y0,z0,x1,y1,z1,...) + * in atomic units (Bohr). + * @param[in] C MO coefficient vector, length nbf(). + * @param[out] out Length-npts array of MO values. 
+ */ + void eval_orbital(int64_t npts, const double* points, + const double* C, double* out) const; + + /** @brief Evaluate `nmo` MOs simultaneously. + * + * Storage layout: + * - `C` : (nbf, nmo) column-major, leading dimension `ldc` (>= nbf). + * - `out` : (npts, nmo) column-major, leading dimension `ldo` (>= npts). + * + * Equivalent to calling `eval_orbital` `nmo` times but amortises the AO + * collocation evaluation across all MOs (single AO buffer, GEMM contraction). + */ + void eval_orbitals(int64_t npts, const double* points, + int32_t nmo, const double* C, int64_t ldc, + double* out, int64_t ldo) const; + + /** @brief Evaluate the electron density + * rho(r) = sum_{mu,nu} D[mu,nu] * phi_mu(r) * phi_nu(r). + * + * @param[in] npts Number of evaluation points. + * @param[in] points AoS array of length 3*npts in Bohr. + * @param[in] D (nbf, nbf) symmetric density matrix, column-major, + * leading dimension `ldd` (>= nbf). + * @param[out] out Length-npts array of density values. + */ + void eval_density(int64_t npts, const double* points, + const double* D, int64_t ldd, + double* out) const; + + private: + struct Impl; + std::unique_ptr pimpl_; +}; + +} // namespace GauXC diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1aed4b42..8cd463ce 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,6 +41,7 @@ add_library( gauxc molgrid_impl.cxx molgrid_defaults.cxx atomic_radii.cxx + orbital_evaluator.cxx ) target_include_directories( gauxc diff --git a/src/external/CMakeLists.txt b/src/external/CMakeLists.txt index 46612c81..b0add283 100644 --- a/src/external/CMakeLists.txt +++ b/src/external/CMakeLists.txt @@ -9,6 +9,9 @@ # # See LICENSE.txt for details # +# Cube file writer is a self-contained utility (no third-party deps); always build it. 
+target_sources( gauxc PRIVATE cube.cxx ) + if( GAUXC_ENABLE_HDF5 ) include(FetchContent) find_package(HDF5) diff --git a/src/external/cube.cxx b/src/external/cube.cxx new file mode 100644 index 00000000..018ad0c5 --- /dev/null +++ b/src/external/cube.cxx @@ -0,0 +1,299 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2026, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include + +namespace GauXC { + +// ============================================================================= +// CubeGrid +// ============================================================================= + +CubeGrid CubeGrid::from_molecule(const Molecule& mol, int64_t nx, int64_t ny, + int64_t nz, double margin) { + if (mol.empty()) { + GAUXC_GENERIC_EXCEPTION( + "CubeGrid::from_molecule: molecule has no atoms."); + } + if (nx < 1 || ny < 1 || nz < 1) { + GAUXC_GENERIC_EXCEPTION( + "CubeGrid::from_molecule: nx, ny, nz must be >= 1."); + } + + double xmin = mol[0].x, xmax = mol[0].x; + double ymin = mol[0].y, ymax = mol[0].y; + double zmin = mol[0].z, zmax = mol[0].z; + for (const auto& a : mol) { + xmin = std::min(xmin, a.x); + xmax = std::max(xmax, a.x); + ymin = std::min(ymin, a.y); + ymax = std::max(ymax, a.y); + zmin = std::min(zmin, a.z); + zmax = std::max(zmax, a.z); + } + + CubeGrid grid; + grid.origin = {xmin - margin, ymin - margin, zmin - margin}; + grid.nx = nx; + grid.ny = ny; + grid.nz = nz; + const double ex = (xmax - xmin) + 2.0 * margin; + const double ey = (ymax - ymin) + 2.0 * margin; + const double ez = (zmax - zmin) + 2.0 * margin; + grid.spacing[0] = nx > 1 ? 
ex / static_cast(nx - 1) : 0.0; + grid.spacing[1] = ny > 1 ? ey / static_cast(ny - 1) : 0.0; + grid.spacing[2] = nz > 1 ? ez / static_cast(nz - 1) : 0.0; + return grid; +} + +std::vector CubeGrid::points() const { + std::vector pts(static_cast(num_points()) * 3); + size_t k = 0; + for (int64_t ix = 0; ix < nx; ++ix) { + const double x = origin[0] + spacing[0] * static_cast(ix); + for (int64_t iy = 0; iy < ny; ++iy) { + const double y = origin[1] + spacing[1] * static_cast(iy); + for (int64_t iz = 0; iz < nz; ++iz) { + const double z = origin[2] + spacing[2] * static_cast(iz); + pts[3 * k + 0] = x; + pts[3 * k + 1] = y; + pts[3 * k + 2] = z; + ++k; + } + } + } + return pts; +} + +// Hand-rolled %13.5E formatter. Required field is fixed 13 chars +// (sign + D + '.' + 5d + 'E' + sign + 2d). snprintf in the inner loop +// dominates write time on large grids; this matches glibc snprintf +// bit-for-bit on the fast path (1-2 digit exponents) and falls back to +// snprintf for 3-digit-exponent edge cases. Output buffer must be +// exactly 13 bytes (no trailing NUL). + +namespace { + +inline void format_e13_5(double v, char* out) { + if (std::isnan(v)) { + std::memcpy(out, " NaN", 13); + return; + } + if (std::isinf(v)) { + std::memcpy(out, v < 0 ? " -Inf" : " Inf", 13); + return; + } + + bool negative = std::signbit(v); + double absv = std::fabs(v); + + if (absv == 0.0) { + // glibc "%13.5E": 0.0 -> " 0.00000E+00"; -0.0 -> " -0.00000E+00". + out[0] = ' '; + out[1] = negative ? '-' : ' '; + out[2] = '0'; + out[3] = '.'; + out[4] = '0'; + out[5] = '0'; + out[6] = '0'; + out[7] = '0'; + out[8] = '0'; + out[9] = 'E'; + out[10] = '+'; + out[11] = '0'; + out[12] = '0'; + return; + } + + // Exponent via floor(log10), with corrections for FP edge cases + // (e.g. 9.99999 rounding up across a power-of-ten boundary). 
+ int exp10 = static_cast(std::floor(std::log10(absv))); + double scale = std::pow(10.0, -exp10); + double mant = absv * scale; + + long long mant_int = static_cast(std::llround(mant * 1e5)); + if (mant_int >= 1000000) { + mant_int = 100000; + ++exp10; + } else if (mant_int < 100000) { + --exp10; + scale = std::pow(10.0, -exp10); + mant = absv * scale; + mant_int = static_cast(std::llround(mant * 1e5)); + if (mant_int >= 1000000) { + mant_int = 999999; + } else if (mant_int < 100000) { + mant_int = 100000; + } + } + + // 3+ digit exponents have a different field layout; defer to snprintf. + if (exp10 > 99 || exp10 < -99) { + char tmp[32]; + const int n = std::snprintf(tmp, sizeof(tmp), "%13.5E", v); + if (n >= 13) { + std::memcpy(out, tmp + (n - 13), 13); + } else { + const int pad = 13 - n; + for (int i = 0; i < pad; ++i) out[i] = ' '; + std::memcpy(out + pad, tmp, static_cast(n)); + } + return; + } + + // Fast path. Layout: [sp][sign][D][.][d4 d3 d2 d1 d0][E][esign][e1 e0] + out[0] = ' '; + out[1] = negative ? '-' : ' '; + + char digits[6]; + for (int i = 5; i >= 0; --i) { + digits[i] = static_cast('0' + (mant_int % 10)); + mant_int /= 10; + } + out[2] = digits[0]; + out[3] = '.'; + out[4] = digits[1]; + out[5] = digits[2]; + out[6] = digits[3]; + out[7] = digits[4]; + out[8] = digits[5]; + out[9] = 'E'; + out[10] = exp10 < 0 ? '-' : '+'; + + const int aexp = exp10 < 0 ? 
-exp10 : exp10; + out[11] = static_cast('0' + (aexp / 10)); + out[12] = static_cast('0' + (aexp % 10)); +} + +} // namespace + +// ============================================================================= +// write_cube +// ============================================================================= + +void write_cube(const std::string& path, const Molecule& mol, + const CubeGrid& grid, const double* field, + const std::string& comment) { + if (field == nullptr) { + GAUXC_GENERIC_EXCEPTION("write_cube: field pointer is null."); + } + if (grid.num_points() <= 0) { + GAUXC_GENERIC_EXCEPTION("write_cube: grid has zero points."); + } + + std::FILE* f = std::fopen(path.c_str(), "w"); + if (f == nullptr) { + GAUXC_GENERIC_EXCEPTION("write_cube: failed to open output file: " + path); + } + + // --- Header --- + std::fprintf(f, "%s\n", + comment.empty() ? "GauXC cube file" : comment.c_str()); + std::fprintf(f, "Generated by GauXC\n"); + + // natoms + origin (Bohr). + std::fprintf(f, "%5lld %12.6f %12.6f %12.6f\n", + static_cast(mol.size()), grid.origin[0], + grid.origin[1], grid.origin[2]); + + // Three voxel-axis lines (axis-aligned grid). + std::fprintf(f, "%5lld %12.6f %12.6f %12.6f\n", + static_cast(grid.nx), grid.spacing[0], 0.0, 0.0); + std::fprintf(f, "%5lld %12.6f %12.6f %12.6f\n", + static_cast(grid.ny), 0.0, grid.spacing[1], 0.0); + std::fprintf(f, "%5lld %12.6f %12.6f %12.6f\n", + static_cast(grid.nz), 0.0, 0.0, grid.spacing[2]); + + // One line per atom: Z, partial charge (0.0), x, y, z (Bohr). + for (const auto& atom : mol) { + std::fprintf(f, "%5lld %12.6f %12.6f %12.6f %12.6f\n", + static_cast(atom.Z.get()), 0.0, atom.x, atom.y, + atom.z); + } + + // --- Data block --- + // Each (ix, iy) row is grid.nz values, six per line, %13.5E. The cube + // format requires a newline at the end of every (ix, iy) row regardless + // of how many values land on the last line. 
Rows are independent, so we + // format them in parallel into a pre-sized buffer and commit with a + // single fwrite. + const int64_t nz = grid.nz; + const int64_t lines_per_row = (nz + 5) / 6; + // Worst case 6*13 + 1 = 79 bytes per line; over-allocates the trailing + // line of each row but avoids a precise sizing pass. + const int64_t bytes_per_row = lines_per_row * (6 * 13 + 1); + const int64_t n_rows = grid.nx * grid.ny; + std::vector buf(static_cast(bytes_per_row * n_rows)); + std::vector row_byte_count(static_cast(n_rows), 0); + +#pragma omp parallel for schedule(static) + for (int64_t row = 0; row < n_rows; ++row) { + const double* row_data = + field + static_cast(row) * static_cast(nz); + char* dst = buf.data() + static_cast(row) * + static_cast(bytes_per_row); + int64_t off = 0; + for (int64_t iz = 0; iz < nz; ++iz) { + format_e13_5(row_data[iz], dst + off); + off += 13; + // Every 6 values OR at the end of the row → newline. + if (((iz + 1) % 6 == 0) || (iz + 1 == nz)) { + dst[off++] = '\n'; + } + } + row_byte_count[static_cast(row)] = off; + } + + // Compact rows in place (worst-case padding between them) and emit + // with a single fwrite. 
+ if (n_rows > 1) { + int64_t write_off = row_byte_count[0]; + for (int64_t row = 1; row < n_rows; ++row) { + const int64_t src_off = row * bytes_per_row; + const int64_t len = row_byte_count[static_cast(row)]; + std::memmove(buf.data() + write_off, buf.data() + src_off, + static_cast(len)); + write_off += len; + } + if (std::fwrite(buf.data(), 1, static_cast(write_off), f) != + static_cast(write_off)) { + std::fclose(f); + GAUXC_GENERIC_EXCEPTION("write_cube: short write to " + path); + } + } else { + if (std::fwrite(buf.data(), 1, static_cast(row_byte_count[0]), + f) != static_cast(row_byte_count[0])) { + std::fclose(f); + GAUXC_GENERIC_EXCEPTION("write_cube: short write to " + path); + } + } + + if (std::fclose(f) != 0) { + GAUXC_GENERIC_EXCEPTION("write_cube: failed to close " + path); + } +} + +} // namespace GauXC diff --git a/src/orbital_evaluator.cxx b/src/orbital_evaluator.cxx new file mode 100644 index 00000000..39f5b61c --- /dev/null +++ b/src/orbital_evaluator.cxx @@ -0,0 +1,211 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2026, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#include + +#include +#include +#include +#include + +#include +#include + +#include "xc_integrator/local_work_driver/host/blas.hpp" +#include "xc_integrator/local_work_driver/host/local_host_work_driver.hpp" + +namespace GauXC { + +namespace { + +/// Per-thread point batch size, sized so the AO scratch buffer +/// (nbf*batch doubles) fits comfortably in L2 (~16 MB target). 
+inline int64_t choose_batch_size(int32_t nbf) { + constexpr int64_t kTargetBytesPerThread = 16ll * 1024 * 1024; + constexpr int64_t kMinBatch = 256; + constexpr int64_t kMaxBatch = 8192; + if (nbf <= 0) return kMaxBatch; + const int64_t b = + kTargetBytesPerThread / (static_cast(sizeof(double)) * nbf); + return std::clamp(b, kMinBatch, kMaxBatch); +} + +/// Single-block submat_map for the no-screening case (nbe == nbf); makes +/// `eval_xmat` route directly to GEMM without invoking the gather path. +inline LocalHostWorkDriver::submat_map_t full_submat_map(int32_t nbf) { + return {{ {int32_t{0}, nbf, int32_t{0}} }}; +} + +} // namespace + +struct OrbitalEvaluator::Impl { + BasisSet basis; + std::unique_ptr driver_owner; + LocalHostWorkDriver* host_driver = nullptr; // non-owning view + std::vector shell_list; + int32_t nbf_ = 0; + + void init(BasisSet bs, ExecutionSpace exec) { + if (exec != ExecutionSpace::Host) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator: only ExecutionSpace::Host is currently supported."); + } + + basis = std::move(bs); + + driver_owner = LocalWorkDriverFactory::make_local_work_driver( + ExecutionSpace::Host, "Reference"); + host_driver = dynamic_cast(driver_owner.get()); + if (host_driver == nullptr) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator: LocalWorkDriverFactory did not return a " + "LocalHostWorkDriver."); + } + + shell_list.resize(basis.size()); + std::iota(shell_list.begin(), shell_list.end(), int32_t{0}); + nbf_ = basis.nbf(); + } +}; + +OrbitalEvaluator::OrbitalEvaluator(BasisSet basis, ExecutionSpace exec) + : pimpl_(std::make_unique()) { + pimpl_->init(std::move(basis), exec); +} + +OrbitalEvaluator::~OrbitalEvaluator() noexcept = default; +OrbitalEvaluator::OrbitalEvaluator(OrbitalEvaluator&&) noexcept = default; +OrbitalEvaluator& OrbitalEvaluator::operator=(OrbitalEvaluator&&) noexcept = + default; + +int32_t OrbitalEvaluator::nbf() const noexcept { return pimpl_->nbf_; } + +const BasisSet& OrbitalEvaluator::basis() 
const noexcept { + return pimpl_->basis; +} + +void OrbitalEvaluator::eval_orbital(int64_t npts, const double* points, + const double* C, double* out) const { + eval_orbitals(npts, points, /*nmo=*/1, C, /*ldc=*/pimpl_->nbf_, out, + /*ldo=*/npts); +} + +void OrbitalEvaluator::eval_orbitals(int64_t npts, const double* points, + int32_t nmo, const double* C, int64_t ldc, + double* out, int64_t ldo) const { + if (npts == 0 || nmo == 0) return; + if (points == nullptr || C == nullptr || out == nullptr) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_orbitals: null pointer argument."); + } + const int32_t nbf = pimpl_->nbf_; + if (ldc < nbf) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_orbitals: ldc must be >= nbf()."); + } + if (ldo < npts) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_orbitals: ldo must be >= npts."); + } + + const int32_t nshells = pimpl_->basis.nshells(); + const int64_t batch_size = choose_batch_size(nbf); + const int64_t n_batches = (npts + batch_size - 1) / batch_size; + LocalHostWorkDriver* driver = pimpl_->host_driver; + const int32_t* shell_list = pimpl_->shell_list.data(); + const BasisSet& basis = pimpl_->basis; + +#pragma omp parallel + { + std::vector ao_buf(static_cast(nbf) * batch_size); + +#pragma omp for schedule(dynamic, 1) + for (int64_t b = 0; b < n_batches; ++b) { + const int64_t p0 = b * batch_size; + const int64_t np = std::min(batch_size, npts - p0); + + driver->eval_collocation(static_cast(np), + static_cast(nshells), + static_cast(nbf), points + 3 * p0, + basis, shell_list, ao_buf.data()); + + // out_slab(np, nmo) = ao^T(np, nbf) @ C(nbf, nmo); j-th MO column + // lives at out + j*ldo + p0. 
+ blas::gemm( + /*TA=*/'T', /*TB=*/'N', + /*M=*/static_cast(np), /*N=*/static_cast(nmo), + /*K=*/nbf, /*ALPHA=*/1.0, /*A=*/ao_buf.data(), /*LDA=*/nbf, + /*B=*/C, /*LDB=*/static_cast(ldc), /*BETA=*/0.0, + /*C=*/out + p0, /*LDC=*/static_cast(ldo)); + } + } +} + +void OrbitalEvaluator::eval_density(int64_t npts, const double* points, + const double* D, int64_t ldd, + double* out) const { + if (npts == 0) return; + if (points == nullptr || D == nullptr || out == nullptr) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_density: null pointer argument."); + } + const int32_t nbf = pimpl_->nbf_; + if (ldd < nbf) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_density: ldd must be >= nbf()."); + } + + const int32_t nshells = pimpl_->basis.nshells(); + const int64_t batch_size = choose_batch_size(nbf); + const int64_t n_batches = (npts + batch_size - 1) / batch_size; + LocalHostWorkDriver* driver = pimpl_->host_driver; + const int32_t* shell_list = pimpl_->shell_list.data(); + const BasisSet& basis = pimpl_->basis; + +#pragma omp parallel + { + std::vector ao_buf(static_cast(nbf) * batch_size); + std::vector dm_ao_buf(static_cast(nbf) * batch_size); + // Sized for the eval_xmat submat-gather contract; unused on the + // nbe == nbf fast path but allocated unconditionally for safety. 
+ std::vector xmat_scr(static_cast(nbf) * nbf); + auto submat_map = full_submat_map(nbf); + +#pragma omp for schedule(dynamic, 1) + for (int64_t b = 0; b < n_batches; ++b) { + const int64_t p0 = b * batch_size; + const int64_t np = std::min(batch_size, npts - p0); + + driver->eval_collocation(static_cast(np), + static_cast(nshells), + static_cast(nbf), points + 3 * p0, + basis, shell_list, ao_buf.data()); + + // dm_ao = D @ ao + driver->eval_xmat( + /*npts=*/static_cast(np), /*nbf=*/static_cast(nbf), + /*nbe=*/static_cast(nbf), submat_map, /*fac=*/1.0, + /*P=*/D, /*ldp=*/static_cast(ldd), + /*basis_eval=*/ao_buf.data(), /*ldb=*/static_cast(nbf), + /*X=*/dm_ao_buf.data(), /*ldx=*/static_cast(nbf), + /*scr=*/xmat_scr.data()); + + // rho[p] = sum_mu ao(mu, p) * dm_ao(mu, p) + driver->eval_uvvar_lda_rks( + /*npts=*/static_cast(np), /*nbe=*/static_cast(nbf), + /*basis_eval=*/ao_buf.data(), + /*X=*/dm_ao_buf.data(), /*ldx=*/static_cast(nbf), + /*den_eval=*/out + p0); + } + } +} + +} // namespace GauXC diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 46dbe487..3d4c0dfd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -69,6 +69,7 @@ add_executable( gauxc_test basis/parse_basis.cxx dd_psi_potential_test.cxx 2nd_derivative_test.cxx + orbital_evaluator_test.cxx ) target_link_libraries( gauxc_test PUBLIC gauxc gauxc_catch2 Eigen3::Eigen ) if(GAUXC_ENABLE_CUTLASS) diff --git a/tests/orbital_evaluator_test.cxx b/tests/orbital_evaluator_test.cxx new file mode 100644 index 00000000..97562323 --- /dev/null +++ b/tests/orbital_evaluator_test.cxx @@ -0,0 +1,332 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2026, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#include "ut_common.hpp" +#include "catch2/catch.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "standards.hpp" + +// Reference implementation lives in src/, behind the public header surface. +#include "xc_integrator/local_work_driver/host/local_host_work_driver.hpp" + +using namespace GauXC; + +namespace { + +/// Build a deterministic PRNG-based set of points within a small bounding box +/// around the molecule. Avoids putting samples too close to nuclei to keep +/// values numerically well-behaved. +std::vector make_random_points(int64_t npts, unsigned seed = 1234u) { + std::mt19937 gen(seed); + std::uniform_real_distribution u(-2.5, 2.5); + std::vector pts(static_cast(npts) * 3); + for (int64_t p = 0; p < npts; ++p) { + pts[3 * p + 0] = u(gen); + pts[3 * p + 1] = u(gen); + pts[3 * p + 2] = u(gen); + } + return pts; +} + +} // namespace + +TEST_CASE("OrbitalEvaluator / Water cc-pVDZ matches eval_collocation", + "[orbital_evaluator]") { + auto mol = make_water(); + auto basis = make_ccpvdz(mol, SphericalType(true)); + for (auto& sh : basis) sh.set_shell_tolerance(1e-12); + + const int32_t nbf = basis.nbf(); + REQUIRE(nbf > 0); + + const int64_t npts = 137; // not a multiple of any batch size + const auto pts = make_random_points(npts); + + // Reference: AO collocation directly via the LocalHostWorkDriver. 
+ std::vector ao_ref(static_cast(nbf) * npts); + { + auto drv = LocalWorkDriverFactory::make_local_work_driver( + ExecutionSpace::Host, "Reference"); + auto* host_drv = dynamic_cast(drv.get()); + REQUIRE(host_drv != nullptr); + std::vector shell_list(basis.size()); + for (size_t i = 0; i < shell_list.size(); ++i) + shell_list[i] = static_cast(i); + host_drv->eval_collocation(static_cast(npts), + static_cast(basis.nshells()), + static_cast(nbf), pts.data(), basis, + shell_list.data(), ao_ref.data()); + } + + OrbitalEvaluator eval(basis); + REQUIRE(eval.nbf() == nbf); + + SECTION("eval_orbital with one-hot coefficient reproduces single AO column") { + std::vector C(nbf, 0.0); + std::vector out(static_cast(npts), 0.0); + for (int32_t mu : {0, nbf / 3, nbf / 2, nbf - 1}) { + std::fill(C.begin(), C.end(), 0.0); + C[static_cast(mu)] = 1.0; + std::fill(out.begin(), out.end(), 0.0); + eval.eval_orbital(npts, pts.data(), C.data(), out.data()); + for (int64_t p = 0; p < npts; ++p) { + const double ref = + ao_ref[static_cast(p) * nbf + static_cast(mu)]; + CHECK(out[static_cast(p)] == Approx(ref).margin(1e-12)); + } + } + } + + SECTION("eval_orbitals with random C matches AO ^T @ C") { + const int32_t nmo = 4; + std::vector C(static_cast(nbf) * nmo); + std::mt19937 gen(99); + std::uniform_real_distribution u(-1.0, 1.0); + for (auto& v : C) v = u(gen); + + std::vector out(static_cast(npts) * nmo, 0.0); + eval.eval_orbitals(npts, pts.data(), nmo, C.data(), nbf, out.data(), npts); + + for (int32_t j = 0; j < nmo; ++j) { + for (int64_t p = 0; p < npts; ++p) { + double ref = 0.0; + for (int32_t mu = 0; mu < nbf; ++mu) { + ref += C[static_cast(j) * nbf + mu] * + ao_ref[static_cast(p) * nbf + mu]; + } + const double got = + out[static_cast(j) * npts + static_cast(p)]; + CHECK(got == Approx(ref).margin(1e-10)); + } + } + } + + SECTION("eval_density with identity D equals sum of squared AO values") { + std::vector D(static_cast(nbf) * nbf, 0.0); + for (int32_t mu = 0; mu < nbf; ++mu) { 
+ D[static_cast(mu) * nbf + mu] = 1.0; + } + std::vector out(static_cast(npts), 0.0); + eval.eval_density(npts, pts.data(), D.data(), nbf, out.data()); + + for (int64_t p = 0; p < npts; ++p) { + double ref = 0.0; + for (int32_t mu = 0; mu < nbf; ++mu) { + const double a = ao_ref[static_cast(p) * nbf + mu]; + ref += a * a; + } + CHECK(out[static_cast(p)] == Approx(ref).margin(1e-10)); + } + } + + SECTION("eval_density with rank-1 D = c c^T equals (c.AO)^2") { + std::vector c(nbf); + std::mt19937 gen(7); + std::uniform_real_distribution u(-1.0, 1.0); + for (auto& v : c) v = u(gen); + + std::vector D(static_cast(nbf) * nbf, 0.0); + for (int32_t mu = 0; mu < nbf; ++mu) { + for (int32_t nu = 0; nu < nbf; ++nu) { + D[static_cast(mu) * nbf + nu] = c[mu] * c[nu]; + } + } + std::vector out(static_cast(npts), 0.0); + eval.eval_density(npts, pts.data(), D.data(), nbf, out.data()); + + std::vector orb(static_cast(npts), 0.0); + eval.eval_orbital(npts, pts.data(), c.data(), orb.data()); + + for (int64_t p = 0; p < npts; ++p) { + const double ref = orb[static_cast(p)] * orb[static_cast(p)]; + CHECK(out[static_cast(p)] == Approx(ref).margin(1e-10)); + } + } +} + +TEST_CASE("CubeGrid construction and grid-points layout", "[cube]") { + auto mol = make_water(); + CubeGrid g = CubeGrid::from_molecule(mol, /*nx=*/16, /*ny=*/12, /*nz=*/8, + /*margin=*/2.0); + REQUIRE(g.num_points() == 16 * 12 * 8); + + auto pts = g.points(); + REQUIRE(pts.size() == static_cast(g.num_points()) * 3); + + // Check first and last grid points. + CHECK(pts[0] == Approx(g.origin[0])); + CHECK(pts[1] == Approx(g.origin[1])); + CHECK(pts[2] == Approx(g.origin[2])); + + const size_t last = static_cast(g.num_points()) - 1; + CHECK(pts[3 * last + 0] == + Approx(g.origin[0] + g.spacing[0] * (g.nx - 1))); + CHECK(pts[3 * last + 1] == + Approx(g.origin[1] + g.spacing[1] * (g.ny - 1))); + CHECK(pts[3 * last + 2] == + Approx(g.origin[2] + g.spacing[2] * (g.nz - 1))); + + // Check ordering: iz varies fastest. 
Point (1, 0, 0) should be at offset + // ny*nz, point (0, 1, 0) at nz, point (0, 0, 1) at 1. + const int64_t off_x = g.ny * g.nz; + const int64_t off_y = g.nz; + CHECK(pts[3 * static_cast(off_x) + 0] == + Approx(g.origin[0] + g.spacing[0])); + CHECK(pts[3 * static_cast(off_y) + 1] == + Approx(g.origin[1] + g.spacing[1])); + CHECK(pts[3 * 1 + 2] == Approx(g.origin[2] + g.spacing[2])); +} + +TEST_CASE("write_cube round-trips header and field data", "[cube]") { + auto mol = make_water(); + CubeGrid grid = CubeGrid::from_molecule(mol, /*nx=*/5, /*ny=*/4, /*nz=*/7, + /*margin=*/2.0); + std::vector field(static_cast(grid.num_points())); + for (int64_t i = 0; i < grid.num_points(); ++i) { + // Mix of magnitudes to exercise the formatter (negative, near-zero, etc.) + field[static_cast(i)] = std::sin(0.13 * static_cast(i)) * + std::pow(10.0, (i % 5) - 2); + } + + // Use a tmp path under /tmp. + const std::string path = std::string("/tmp/gauxc_cube_test_") + + std::to_string(::getpid()) + ".cube"; + write_cube(path, mol, grid, field.data(), "Test cube"); + + // Parse back. + std::ifstream in(path); + REQUIRE(in.is_open()); + std::string l1, l2; + std::getline(in, l1); + std::getline(in, l2); + CHECK(l1 == "Test cube"); + CHECK(l2 == "Generated by GauXC"); + + long long natoms_read; + double ox, oy, oz; + in >> natoms_read >> ox >> oy >> oz; + CHECK(natoms_read == static_cast(mol.size())); + CHECK(ox == Approx(grid.origin[0])); + CHECK(oy == Approx(grid.origin[1])); + CHECK(oz == Approx(grid.origin[2])); + + // 3 axis lines. 
+ for (int axis = 0; axis < 3; ++axis) { + long long n; + double a, b, c; + in >> n >> a >> b >> c; + if (axis == 0) { + CHECK(n == grid.nx); + CHECK(a == Approx(grid.spacing[0])); + CHECK(b == Approx(0.0)); + CHECK(c == Approx(0.0)); + } else if (axis == 1) { + CHECK(n == grid.ny); + CHECK(a == Approx(0.0)); + CHECK(b == Approx(grid.spacing[1])); + CHECK(c == Approx(0.0)); + } else { + CHECK(n == grid.nz); + CHECK(a == Approx(0.0)); + CHECK(b == Approx(0.0)); + CHECK(c == Approx(grid.spacing[2])); + } + } + + // Atoms. + for (size_t i = 0; i < mol.size(); ++i) { + long long Z; + double q, x, y, z; + in >> Z >> q >> x >> y >> z; + CHECK(Z == mol[i].Z.get()); + CHECK(q == Approx(0.0)); + CHECK(x == Approx(mol[i].x)); + CHECK(y == Approx(mol[i].y)); + CHECK(z == Approx(mol[i].z)); + } + + // Field. Read all remaining whitespace-separated doubles and compare. + std::vector field_read; + field_read.reserve(static_cast(grid.num_points())); + double v; + while (in >> v) field_read.push_back(v); + + REQUIRE(field_read.size() == field.size()); + // %13.5E gives 5 significant digits → relative tolerance ~1e-5 for the + // round-trip. + for (size_t i = 0; i < field.size(); ++i) { + CHECK(field_read[i] == Approx(field[i]).epsilon(1e-4).margin(1e-30)); + } + + std::remove(path.c_str()); +} + +TEST_CASE("write_cube agrees with snprintf %13.5E formatting", "[cube]") { + // Spot-check the custom formatter against snprintf for a battery of values + // by writing a tiny cube file and parsing it back. This is a stronger check + // than the round-trip above since we compare the byte stream. 
+ auto mol = make_water(); + CubeGrid grid; + grid.origin = {0.0, 0.0, 0.0}; + grid.spacing = {0.1, 0.1, 0.1}; + grid.nx = 1; + grid.ny = 1; + grid.nz = 12; + + std::vector field = {0.0, -0.0, 1.23456e-10, -9.99995e-1, + 1.0e+99, -1.0e+99, 3.14159265358979, + -2.71828, 1.0, -1.0, 1e-300, + 1.234e+05}; + + const std::string path = std::string("/tmp/gauxc_cube_format_") + + std::to_string(::getpid()) + ".cube"; + write_cube(path, mol, grid, field.data(), "fmt"); + + std::ifstream in(path); + REQUIRE(in.is_open()); + // Skip header (2 comments + 4 grid lines + natoms atom lines). + std::string skip; + for (int i = 0; i < 2; ++i) std::getline(in, skip); + std::getline(in, skip); // natoms line + for (int i = 0; i < 3; ++i) std::getline(in, skip); // 3 axis lines + for (size_t i = 0; i < mol.size(); ++i) std::getline(in, skip); + + // Read the field-block as raw text and compare against snprintf + // line-by-line. Six values per line, then row terminator after 12 (single + // (ix, iy) row here so no early newline). + std::string data_block((std::istreambuf_iterator(in)), + std::istreambuf_iterator()); + + std::ostringstream expected; + for (size_t i = 0; i < field.size(); ++i) { + char buf[32]; + std::snprintf(buf, sizeof(buf), "%13.5E", field[i]); + expected << buf; + if ((i + 1) % 6 == 0 || (i + 1) == field.size()) expected << '\n'; + } + + CHECK(data_block == expected.str()); + + std::remove(path.c_str()); +} From 7ff6c623e1e049835ea5fc58a869b6b19737cf89 Mon Sep 17 00:00:00 2001 From: Cody Johnston Date: Wed, 29 Apr 2026 18:40:23 +0000 Subject: [PATCH 2/8] OrbitalEvaluator: add per-batch shell screening Use gen_compressed_submat_map + Shell::cutoff_radius() to skip shells whose cutoff sphere doesn't overlap any grid point in the batch. Reduces collocation work for sparse systems (e.g. water at 80^3/16t: 10.4x -> 11.2x vs PySCF). 
--- src/orbital_evaluator.cxx | 184 ++++++++++++++++++++++++++++++++------ 1 file changed, 157 insertions(+), 27 deletions(-) diff --git a/src/orbital_evaluator.cxx b/src/orbital_evaluator.cxx index 39f5b61c..1879f684 100644 --- a/src/orbital_evaluator.cxx +++ b/src/orbital_evaluator.cxx @@ -12,13 +12,18 @@ #include #include +#include #include #include +#include #include +#include #include +#include #include +#include "xc_integrator/integrator_util/integrator_common.hpp" #include "xc_integrator/local_work_driver/host/blas.hpp" #include "xc_integrator/local_work_driver/host/local_host_work_driver.hpp" @@ -44,6 +49,42 @@ inline LocalHostWorkDriver::submat_map_t full_submat_map(int32_t nbf) { return {{ {int32_t{0}, nbf, int32_t{0}} }}; } +/// Axis-aligned bbox of a set of npts AoS points (length 3*npts). +struct PointBbox { + std::array lo; + std::array hi; +}; +inline PointBbox compute_bbox(const double* points, int64_t npts) { + PointBbox b{{points[0], points[1], points[2]}, + {points[0], points[1], points[2]}}; + for (int64_t p = 1; p < npts; ++p) { + const double* xyz = points + 3 * p; + for (int k = 0; k < 3; ++k) { + if (xyz[k] < b.lo[k]) b.lo[k] = xyz[k]; + if (xyz[k] > b.hi[k]) b.hi[k] = xyz[k]; + } + } + return b; +} + +/// Squared distance from `center` to the nearest point in the axis-aligned +/// bbox `[lo, hi]^3`. Zero if the center lies inside the bbox. 
+inline double dist2_center_to_bbox(const double* center, + const PointBbox& bbox) { + double d2 = 0.0; + for (int k = 0; k < 3; ++k) { + const double c = center[k]; + if (c < bbox.lo[k]) { + const double dx = bbox.lo[k] - c; + d2 += dx * dx; + } else if (c > bbox.hi[k]) { + const double dx = c - bbox.hi[k]; + d2 += dx * dx; + } + } + return d2; +} + } // namespace struct OrbitalEvaluator::Impl { @@ -51,6 +92,13 @@ struct OrbitalEvaluator::Impl { std::unique_ptr driver_owner; LocalHostWorkDriver* host_driver = nullptr; // non-owning view std::vector shell_list; + // BasisSetMap powers shell -> AO range lookups for the screened + // submat_map. We don't have a Molecule available so we pass an empty + // one; the only field that needs it (shell_to_center) is unused here. + std::unique_ptr basis_map; + // Per-shell squared cutoff radius, cached so the screening loop is just + // a comparison instead of a square root per shell per batch. + std::vector shell_cutoff_r2; int32_t nbf_ = 0; void init(BasisSet bs, ExecutionSpace exec) { @@ -73,6 +121,14 @@ struct OrbitalEvaluator::Impl { shell_list.resize(basis.size()); std::iota(shell_list.begin(), shell_list.end(), int32_t{0}); nbf_ = basis.nbf(); + + basis_map = std::make_unique(basis, Molecule{}); + + shell_cutoff_r2.resize(basis.size()); + for (size_t s = 0; s < basis.size(); ++s) { + const double r = basis[s].cutoff_radius(); + shell_cutoff_r2[s] = r * r; + } } }; @@ -116,34 +172,79 @@ void OrbitalEvaluator::eval_orbitals(int64_t npts, const double* points, "OrbitalEvaluator::eval_orbitals: ldo must be >= npts."); } - const int32_t nshells = pimpl_->basis.nshells(); + const int32_t nshells_total = pimpl_->basis.nshells(); const int64_t batch_size = choose_batch_size(nbf); const int64_t n_batches = (npts + batch_size - 1) / batch_size; LocalHostWorkDriver* driver = pimpl_->host_driver; - const int32_t* shell_list = pimpl_->shell_list.data(); const BasisSet& basis = pimpl_->basis; + const auto& shell_cutoff_r2 = 
pimpl_->shell_cutoff_r2; + const BasisSetMap& basis_map = *pimpl_->basis_map; #pragma omp parallel { std::vector ao_buf(static_cast(nbf) * batch_size); + std::vector C_compressed(static_cast(nbf) * nmo); + std::vector screened_shells; + screened_shells.reserve(nshells_total); #pragma omp for schedule(dynamic, 1) for (int64_t b = 0; b < n_batches; ++b) { const int64_t p0 = b * batch_size; const int64_t np = std::min(batch_size, npts - p0); - driver->eval_collocation(static_cast(np), - static_cast(nshells), - static_cast(nbf), points + 3 * p0, - basis, shell_list, ao_buf.data()); - - // out_slab(np, nmo) = ao^T(np, nbf) @ C(nbf, nmo); j-th MO column - // lives at out + j*ldo + p0. + // Per-batch shell screening: keep shells whose cutoff_radius reaches + // any point of this batch's bounding box. + const PointBbox bbox = compute_bbox(points + 3 * p0, np); + screened_shells.clear(); + int32_t nbe = 0; + for (int32_t s = 0; s < nshells_total; ++s) { + if (dist2_center_to_bbox(basis[s].O_data(), bbox) < + shell_cutoff_r2[s]) { + screened_shells.push_back(s); + nbe += basis[s].size(); + } + } + if (nbe == 0) { + // All shells screen out for this batch -> orbital is identically + // zero. Zero the slab and skip eval. + for (int32_t j = 0; j < nmo; ++j) { + double* out_col = out + static_cast(j) * ldo + p0; + std::fill(out_col, out_col + np, 0.0); + } + continue; + } + + driver->eval_collocation( + static_cast(np), + static_cast(screened_shells.size()), + static_cast(nbe), points + 3 * p0, basis, + screened_shells.data(), ao_buf.data()); + + // Gather the rows of C corresponding to surviving shells into a + // contiguous (nbe, nmo) col-major buffer so the contraction is a + // dense GEMM. 
+ { + int32_t row = 0; + for (int32_t s : screened_shells) { + const auto rng = basis_map.shell_to_ao_range(s); + const int32_t shell_nbe = rng.second - rng.first; + for (int32_t j = 0; j < nmo; ++j) { + const double* C_col = C + static_cast(j) * ldc; + double* dst = C_compressed.data() + + static_cast(j) * nbe + row; + std::copy(C_col + rng.first, C_col + rng.second, dst); + } + row += shell_nbe; + } + } + + // out_slab(np, nmo) = ao^T(np, nbe) @ C_compressed(nbe, nmo); j-th MO + // column lives at out + j*ldo + p0. blas::gemm( /*TA=*/'T', /*TB=*/'N', /*M=*/static_cast(np), /*N=*/static_cast(nmo), - /*K=*/nbf, /*ALPHA=*/1.0, /*A=*/ao_buf.data(), /*LDA=*/nbf, - /*B=*/C, /*LDB=*/static_cast(ldc), /*BETA=*/0.0, + /*K=*/nbe, /*ALPHA=*/1.0, /*A=*/ao_buf.data(), /*LDA=*/nbe, + /*B=*/C_compressed.data(), /*LDB=*/nbe, /*BETA=*/0.0, /*C=*/out + p0, /*LDC=*/static_cast(ldo)); } } @@ -163,46 +264,75 @@ void OrbitalEvaluator::eval_density(int64_t npts, const double* points, "OrbitalEvaluator::eval_density: ldd must be >= nbf()."); } - const int32_t nshells = pimpl_->basis.nshells(); + const int32_t nshells_total = pimpl_->basis.nshells(); const int64_t batch_size = choose_batch_size(nbf); const int64_t n_batches = (npts + batch_size - 1) / batch_size; LocalHostWorkDriver* driver = pimpl_->host_driver; - const int32_t* shell_list = pimpl_->shell_list.data(); const BasisSet& basis = pimpl_->basis; + const auto& shell_cutoff_r2 = pimpl_->shell_cutoff_r2; + const BasisSetMap& basis_map = *pimpl_->basis_map; #pragma omp parallel { std::vector ao_buf(static_cast(nbf) * batch_size); std::vector dm_ao_buf(static_cast(nbf) * batch_size); - // Sized for the eval_xmat submat-gather contract; unused on the - // nbe == nbf fast path but allocated unconditionally for safety. + // eval_xmat scratch when the submat-gather path is taken; sized for + // worst case (nbe == nbf). 
std::vector xmat_scr(static_cast(nbf) * nbf); - auto submat_map = full_submat_map(nbf); + std::vector screened_shells; + screened_shells.reserve(nshells_total); #pragma omp for schedule(dynamic, 1) for (int64_t b = 0; b < n_batches; ++b) { const int64_t p0 = b * batch_size; const int64_t np = std::min(batch_size, npts - p0); - driver->eval_collocation(static_cast(np), - static_cast(nshells), - static_cast(nbf), points + 3 * p0, - basis, shell_list, ao_buf.data()); - - // dm_ao = D @ ao + const PointBbox bbox = compute_bbox(points + 3 * p0, np); + screened_shells.clear(); + int32_t nbe = 0; + for (int32_t s = 0; s < nshells_total; ++s) { + if (dist2_center_to_bbox(basis[s].O_data(), bbox) < + shell_cutoff_r2[s]) { + screened_shells.push_back(s); + nbe += basis[s].size(); + } + } + if (nbe == 0) { + std::fill(out + p0, out + p0 + np, 0.0); + continue; + } + + driver->eval_collocation( + static_cast(np), + static_cast(screened_shells.size()), + static_cast(nbe), points + 3 * p0, basis, + screened_shells.data(), ao_buf.data()); + + // Build the compressed submat_map for this batch (gather the rows + // of D corresponding to surviving shells). Falls through to a single + // direct-gemm submat_map when nbe == nbf. 
+ LocalHostWorkDriver::submat_map_t submat_map; + if (nbe == nbf) { + submat_map = full_submat_map(nbf); + } else { + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, screened_shells, nbf, nbf); + } + + // dm_ao = D_compressed @ ao driver->eval_xmat( /*npts=*/static_cast(np), /*nbf=*/static_cast(nbf), - /*nbe=*/static_cast(nbf), submat_map, /*fac=*/1.0, + /*nbe=*/static_cast(nbe), submat_map, /*fac=*/1.0, /*P=*/D, /*ldp=*/static_cast(ldd), - /*basis_eval=*/ao_buf.data(), /*ldb=*/static_cast(nbf), - /*X=*/dm_ao_buf.data(), /*ldx=*/static_cast(nbf), + /*basis_eval=*/ao_buf.data(), /*ldb=*/static_cast(nbe), + /*X=*/dm_ao_buf.data(), /*ldx=*/static_cast(nbe), /*scr=*/xmat_scr.data()); // rho[p] = sum_mu ao(mu, p) * dm_ao(mu, p) driver->eval_uvvar_lda_rks( - /*npts=*/static_cast(np), /*nbe=*/static_cast(nbf), + /*npts=*/static_cast(np), /*nbe=*/static_cast(nbe), /*basis_eval=*/ao_buf.data(), - /*X=*/dm_ao_buf.data(), /*ldx=*/static_cast(nbf), + /*X=*/dm_ao_buf.data(), /*ldx=*/static_cast(nbe), /*den_eval=*/out + p0); } } From 32e6a0ee14c99c78de5a49700f5b7d23802ad726 Mon Sep 17 00:00:00 2001 From: Cody Johnston Date: Wed, 29 Apr 2026 18:46:41 +0000 Subject: [PATCH 3/8] OrbitalEvaluator/CubeGrid: add grid-native eval overloads Add eval_orbital/eval_orbitals/eval_density overloads that accept a CubeGrid directly and generate per-batch point coordinates on-the-fly, avoiding the 3*N*8-byte temporary coordinate array (~12 MB at 80^3, ~98 MB at 160^3). Also add CubeGrid::points_into(double*) for callers that manage their own buffer. 
Benchmark improvement (vs shell, 16 threads): water 160^3 orb: 11.9x -> 12.5x benzene 160^3 orb: 2.9x -> 3.2x benzene 160^3 den: 3.1x -> 3.3x --- include/gauxc/external/cube.hpp | 8 ++ include/gauxc/orbital_evaluator.hpp | 21 +++ src/external/cube.cxx | 12 +- src/orbital_evaluator.cxx | 202 ++++++++++++++++++++++++++++ 4 files changed, 239 insertions(+), 4 deletions(-) diff --git a/include/gauxc/external/cube.hpp b/include/gauxc/external/cube.hpp index 15ceb034..47037fe4 100644 --- a/include/gauxc/external/cube.hpp +++ b/include/gauxc/external/cube.hpp @@ -65,6 +65,14 @@ struct CubeGrid { * `eval_density`. */ std::vector points() const; + + /** @brief Write grid-point coordinates into a caller-supplied buffer. + * + * Same layout as `points()` but writes into `out`, which must have room + * for at least 3*num_points() doubles. Use this to avoid a heap + * allocation when the caller already owns a suitably sized buffer. + */ + void points_into(double* out) const; }; /** @brief Write a Gaussian cube file. diff --git a/include/gauxc/orbital_evaluator.hpp b/include/gauxc/orbital_evaluator.hpp index f0542e3e..04106266 100644 --- a/include/gauxc/orbital_evaluator.hpp +++ b/include/gauxc/orbital_evaluator.hpp @@ -16,6 +16,7 @@ #include #include +#include namespace GauXC { @@ -97,6 +98,26 @@ class OrbitalEvaluator { const double* D, int64_t ldd, double* out) const; + /** @brief Evaluate a single MO on a CubeGrid without materialising all 3*N + * grid-point coordinates. + */ + void eval_orbital(const CubeGrid& grid, + const double* C, double* out) const; + + /** @brief Evaluate `nmo` MOs on a CubeGrid without materialising all 3*N + * grid-point coordinates. + */ + void eval_orbitals(const CubeGrid& grid, + int32_t nmo, const double* C, int64_t ldc, + double* out, int64_t ldo) const; + + /** @brief Evaluate the electron density on a CubeGrid without materialising + * all 3*N grid-point coordinates. 
+ */ + void eval_density(const CubeGrid& grid, + const double* D, int64_t ldd, + double* out) const; + private: struct Impl; std::unique_ptr pimpl_; diff --git a/src/external/cube.cxx b/src/external/cube.cxx index 018ad0c5..aead8206 100644 --- a/src/external/cube.cxx +++ b/src/external/cube.cxx @@ -71,6 +71,11 @@ CubeGrid CubeGrid::from_molecule(const Molecule& mol, int64_t nx, int64_t ny, std::vector CubeGrid::points() const { std::vector pts(static_cast(num_points()) * 3); + points_into(pts.data()); + return pts; +} + +void CubeGrid::points_into(double* out) const { size_t k = 0; for (int64_t ix = 0; ix < nx; ++ix) { const double x = origin[0] + spacing[0] * static_cast(ix); @@ -78,14 +83,13 @@ std::vector CubeGrid::points() const { const double y = origin[1] + spacing[1] * static_cast(iy); for (int64_t iz = 0; iz < nz; ++iz) { const double z = origin[2] + spacing[2] * static_cast(iz); - pts[3 * k + 0] = x; - pts[3 * k + 1] = y; - pts[3 * k + 2] = z; + out[3 * k + 0] = x; + out[3 * k + 1] = y; + out[3 * k + 2] = z; ++k; } } } - return pts; } // Hand-rolled %13.5E formatter. Required field is fixed 13 chars diff --git a/src/orbital_evaluator.cxx b/src/orbital_evaluator.cxx index 1879f684..0e5ce22d 100644 --- a/src/orbital_evaluator.cxx +++ b/src/orbital_evaluator.cxx @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -49,6 +50,22 @@ inline LocalHostWorkDriver::submat_map_t full_submat_map(int32_t nbf) { return {{ {int32_t{0}, nbf, int32_t{0}} }}; } +/// Fill `pts[3*np]` with grid-point coordinates for batch [p0, p0+np). +/// Avoids materialising the full 3*N point array for grid-based evaluation. 
+inline void fill_batch_pts(const CubeGrid& g, int64_t p0, int64_t np, + double* pts) { + const int64_t nz = g.nz, ny = g.ny; + for (int64_t i = 0; i < np; ++i) { + const int64_t k = p0 + i; + const int64_t iz = k % nz; + const int64_t iy = (k / nz) % ny; + const int64_t ix = k / (ny * nz); + pts[3 * i + 0] = g.origin[0] + static_cast(ix) * g.spacing[0]; + pts[3 * i + 1] = g.origin[1] + static_cast(iy) * g.spacing[1]; + pts[3 * i + 2] = g.origin[2] + static_cast(iz) * g.spacing[2]; + } +} + /// Axis-aligned bbox of a set of npts AoS points (length 3*npts). struct PointBbox { std::array lo; @@ -338,4 +355,189 @@ void OrbitalEvaluator::eval_density(int64_t npts, const double* points, } } +// --------------------------------------------------------------------------- +// CubeGrid overloads (3b): generate per-batch point coordinates on-the-fly, +// avoiding the 3*num_points()*8 byte temporary coordinate array. +// --------------------------------------------------------------------------- + +void OrbitalEvaluator::eval_orbital(const CubeGrid& grid, + const double* C, double* out) const { + eval_orbitals(grid, /*nmo=*/1, C, /*ldc=*/pimpl_->nbf_, out, + /*ldo=*/grid.num_points()); +} + +void OrbitalEvaluator::eval_orbitals(const CubeGrid& grid, + int32_t nmo, const double* C, int64_t ldc, + double* out, int64_t ldo) const { + const int64_t npts = grid.num_points(); + if (npts == 0 || nmo == 0) return; + if (C == nullptr || out == nullptr) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_orbitals(grid): null pointer argument."); + } + const int32_t nbf = pimpl_->nbf_; + if (ldc < nbf) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_orbitals(grid): ldc must be >= nbf()."); + } + if (ldo < npts) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_orbitals(grid): ldo must be >= npts."); + } + + const int32_t nshells_total = pimpl_->basis.nshells(); + const int64_t batch_size = choose_batch_size(nbf); + const int64_t n_batches = (npts + batch_size - 1) / 
batch_size; + LocalHostWorkDriver* driver = pimpl_->host_driver; + const BasisSet& basis = pimpl_->basis; + const auto& shell_cutoff_r2 = pimpl_->shell_cutoff_r2; + const BasisSetMap& basis_map = *pimpl_->basis_map; + +#pragma omp parallel + { + // batch_pts replaces the full 3*npts coordinate array; only batch_size + // coordinates are live at a time (~192 KB for batch_size=8192). + std::vector batch_pts(static_cast(batch_size) * 3); + std::vector ao_buf(static_cast(nbf) * batch_size); + std::vector C_compressed(static_cast(nbf) * nmo); + std::vector screened_shells; + screened_shells.reserve(nshells_total); + +#pragma omp for schedule(dynamic, 1) + for (int64_t b = 0; b < n_batches; ++b) { + const int64_t p0 = b * batch_size; + const int64_t np = std::min(batch_size, npts - p0); + + fill_batch_pts(grid, p0, np, batch_pts.data()); + const PointBbox bbox = compute_bbox(batch_pts.data(), np); + screened_shells.clear(); + int32_t nbe = 0; + for (int32_t s = 0; s < nshells_total; ++s) { + if (dist2_center_to_bbox(basis[s].O_data(), bbox) < + shell_cutoff_r2[s]) { + screened_shells.push_back(s); + nbe += basis[s].size(); + } + } + if (nbe == 0) { + for (int32_t j = 0; j < nmo; ++j) { + double* out_col = out + static_cast(j) * ldo + p0; + std::fill(out_col, out_col + np, 0.0); + } + continue; + } + + driver->eval_collocation( + static_cast(np), + static_cast(screened_shells.size()), + static_cast(nbe), batch_pts.data(), basis, + screened_shells.data(), ao_buf.data()); + + { + int32_t row = 0; + for (int32_t s : screened_shells) { + const auto rng = basis_map.shell_to_ao_range(s); + const int32_t shell_nbe = rng.second - rng.first; + for (int32_t j = 0; j < nmo; ++j) { + const double* C_col = C + static_cast(j) * ldc; + double* dst = C_compressed.data() + + static_cast(j) * nbe + row; + std::copy(C_col + rng.first, C_col + rng.second, dst); + } + row += shell_nbe; + } + } + + blas::gemm( + 'T', 'N', static_cast(np), static_cast(nmo), nbe, + 1.0, ao_buf.data(), nbe, 
C_compressed.data(), nbe, + 0.0, out + p0, static_cast(ldo)); + } + } +} + +void OrbitalEvaluator::eval_density(const CubeGrid& grid, + const double* D, int64_t ldd, + double* out) const { + const int64_t npts = grid.num_points(); + if (npts == 0) return; + if (D == nullptr || out == nullptr) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_density(grid): null pointer argument."); + } + const int32_t nbf = pimpl_->nbf_; + if (ldd < nbf) { + GAUXC_GENERIC_EXCEPTION( + "OrbitalEvaluator::eval_density(grid): ldd must be >= nbf()."); + } + + const int32_t nshells_total = pimpl_->basis.nshells(); + const int64_t batch_size = choose_batch_size(nbf); + const int64_t n_batches = (npts + batch_size - 1) / batch_size; + LocalHostWorkDriver* driver = pimpl_->host_driver; + const BasisSet& basis = pimpl_->basis; + const auto& shell_cutoff_r2 = pimpl_->shell_cutoff_r2; + const BasisSetMap& basis_map = *pimpl_->basis_map; + +#pragma omp parallel + { + std::vector batch_pts(static_cast(batch_size) * 3); + std::vector ao_buf(static_cast(nbf) * batch_size); + std::vector dm_ao_buf(static_cast(nbf) * batch_size); + std::vector xmat_scr(static_cast(nbf) * nbf); + std::vector screened_shells; + screened_shells.reserve(nshells_total); + +#pragma omp for schedule(dynamic, 1) + for (int64_t b = 0; b < n_batches; ++b) { + const int64_t p0 = b * batch_size; + const int64_t np = std::min(batch_size, npts - p0); + + fill_batch_pts(grid, p0, np, batch_pts.data()); + const PointBbox bbox = compute_bbox(batch_pts.data(), np); + screened_shells.clear(); + int32_t nbe = 0; + for (int32_t s = 0; s < nshells_total; ++s) { + if (dist2_center_to_bbox(basis[s].O_data(), bbox) < + shell_cutoff_r2[s]) { + screened_shells.push_back(s); + nbe += basis[s].size(); + } + } + if (nbe == 0) { + std::fill(out + p0, out + p0 + np, 0.0); + continue; + } + + driver->eval_collocation( + static_cast(np), + static_cast(screened_shells.size()), + static_cast(nbe), batch_pts.data(), basis, + 
screened_shells.data(), ao_buf.data()); + + LocalHostWorkDriver::submat_map_t submat_map; + if (nbe == nbf) { + submat_map = full_submat_map(nbf); + } else { + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, screened_shells, nbf, nbf); + } + + driver->eval_xmat( + static_cast(np), static_cast(nbf), + static_cast(nbe), submat_map, 1.0, + D, static_cast(ldd), + ao_buf.data(), static_cast(nbe), + dm_ao_buf.data(), static_cast(nbe), + xmat_scr.data()); + + driver->eval_uvvar_lda_rks( + static_cast(np), static_cast(nbe), + ao_buf.data(), + dm_ao_buf.data(), static_cast(nbe), + out + p0); + } + } +} + } // namespace GauXC From 9d417ca1c819ffff44cd6d57df6b0b3639146006 Mon Sep 17 00:00:00 2001 From: Cody Johnston Date: Wed, 29 Apr 2026 19:20:47 +0000 Subject: [PATCH 4/8] OrbitalEvaluator: pipelined collocation+GEMM via OMP tasks Replace the omp-for loop in the CubeGrid eval_orbitals overload with a hybrid dispatch: when n_batches < 2*nthreads, use an OMP task pipeline that overlaps batch B+1's collocation with batch B's GEMM via dependent tasks on a ring of pre-allocated buffer slots. Otherwise fall back to the plain omp-for loop which has lower overhead at saturation. 
Benchmark (orbital speedup vs PySCF, benzene cc-pVDZ): 80^3/16t: 3.0x -> 3.6x (+20%) 80^3/64t: 4.7x -> 7.0x (+49%) 160^3/4t: 1.3x -> 1.9x (+46%) 160^3/16t: 3.2x -> 3.6x (+13%) 160^3/64t: 8.6x -> 7.6x (within noise, no regression) 160^3/208t: 11.3x -> 11.9x (+5%) --- src/orbital_evaluator.cxx | 114 +++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) diff --git a/src/orbital_evaluator.cxx b/src/orbital_evaluator.cxx index 0e5ce22d..30ad0a29 100644 --- a/src/orbital_evaluator.cxx +++ b/src/orbital_evaluator.cxx @@ -18,6 +18,8 @@ #include #include +#include + #include #include #include @@ -393,10 +395,117 @@ void OrbitalEvaluator::eval_orbitals(const CubeGrid& grid, const auto& shell_cutoff_r2 = pimpl_->shell_cutoff_r2; const BasisSetMap& basis_map = *pimpl_->basis_map; + // Choose between task pipeline and omp-for based on saturation. + // When n_batches >> nthreads, omp-for already fully saturates all cores + // and the task pipeline's scheduling overhead is a net negative. + const int nthreads = omp_get_max_threads(); + const bool use_pipeline = (n_batches < 2 * nthreads); + + if (use_pipeline) { + // Pipelined evaluation (3d): overlap collocation of batch B+1 with + // GEMM of batch B using OMP tasks with dependency tags. + struct Slot { + std::vector pts; + std::vector ao; + std::vector C_comp; + std::vector shells; + int32_t nbe; + int64_t p0, np; + }; + + const int nslots = + std::min(nthreads, n_batches); + std::vector slots(nslots); + for (auto& s : slots) { + s.pts.resize(static_cast(batch_size) * 3); + s.ao.resize(static_cast(nbf) * batch_size); + s.C_comp.resize(static_cast(nbf) * nmo); + s.shells.reserve(nshells_total); + s.nbe = 0; + s.p0 = 0; + s.np = 0; + } + + // Dependency tag array — raw array for omp depend(). 
+ std::unique_ptr dtag(new char[nslots]{}); + char* dtag_ptr = dtag.get(); + (void)dtag_ptr; // used only in omp depend() clauses + +#pragma omp parallel +#pragma omp single + { + for (int64_t b = 0; b < n_batches; ++b) { + const int si = static_cast(b % nslots); + + // Phase A: generate points, screen shells, eval collocation. +#pragma omp task shared(slots, grid, basis, shell_cutoff_r2) \ + depend(out: dtag_ptr[si]) firstprivate(b, si) + { + Slot& sl = slots[si]; + sl.p0 = b * batch_size; + sl.np = std::min(batch_size, npts - sl.p0); + + fill_batch_pts(grid, sl.p0, sl.np, sl.pts.data()); + const PointBbox bbox = compute_bbox(sl.pts.data(), sl.np); + sl.shells.clear(); + sl.nbe = 0; + for (int32_t s = 0; s < nshells_total; ++s) { + if (dist2_center_to_bbox(basis[s].O_data(), bbox) < + shell_cutoff_r2[s]) { + sl.shells.push_back(s); + sl.nbe += basis[s].size(); + } + } + if (sl.nbe > 0) { + driver->eval_collocation( + static_cast(sl.np), + static_cast(sl.shells.size()), + static_cast(sl.nbe), sl.pts.data(), basis, + sl.shells.data(), sl.ao.data()); + } + } + + // Phase B: gather C + GEMM → out. 
+#pragma omp task shared(slots, out, C, basis_map) \ + depend(in: dtag_ptr[si]) firstprivate(b, si) + { + Slot& sl = slots[si]; + if (sl.nbe == 0) { + for (int32_t j = 0; j < nmo; ++j) { + double* oc = out + static_cast(j) * ldo + sl.p0; + std::fill(oc, oc + sl.np, 0.0); + } + } else { + const int32_t nbe = sl.nbe; + { + int32_t row = 0; + for (int32_t s : sl.shells) { + const auto rng = basis_map.shell_to_ao_range(s); + const int32_t shell_nbe = rng.second - rng.first; + for (int32_t j = 0; j < nmo; ++j) { + const double* C_col = C + static_cast(j) * ldc; + double* dst = sl.C_comp.data() + + static_cast(j) * nbe + row; + std::copy(C_col + rng.first, C_col + rng.second, dst); + } + row += shell_nbe; + } + } + + blas::gemm( + 'T', 'N', static_cast(sl.np), static_cast(nmo), + nbe, 1.0, sl.ao.data(), nbe, sl.C_comp.data(), nbe, + 0.0, out + sl.p0, static_cast(ldo)); + } + } + } + } + + } else { + // Saturated path: plain omp-for, lower overhead at high thread counts. + #pragma omp parallel { - // batch_pts replaces the full 3*npts coordinate array; only batch_size - // coordinates are live at a time (~192 KB for batch_size=8192). std::vector batch_pts(static_cast(batch_size) * 3); std::vector ao_buf(static_cast(nbf) * batch_size); std::vector C_compressed(static_cast(nbf) * nmo); @@ -454,6 +563,7 @@ void OrbitalEvaluator::eval_orbitals(const CubeGrid& grid, 0.0, out + p0, static_cast(ldo)); } } + } // end saturated fallback } void OrbitalEvaluator::eval_density(const CubeGrid& grid, From 5dc6806f2854f99f296e9c5bba9d1729c61c8fbb Mon Sep 17 00:00:00 2001 From: Cody Johnston Date: Wed, 29 Apr 2026 19:40:16 +0000 Subject: [PATCH 5/8] OrbitalEvaluator: hoist xmat_scr to per-evaluator scratch pool Move the nbf*nbf eval_xmat scratch buffer from per-omp-parallel-region allocation to a pre-allocated per-thread pool in the Impl. Saves nthreads * nbf^2 * 8 bytes of allocation churn per eval_density call when the evaluator is reused across multiple density evaluations (e.g. 
dumping 30 MO densities). No measurable impact in single-call benchmarks (allocation was already per-thread, not per-batch), but eliminates a malloc hot-spot in the multi-call pattern. --- src/orbital_evaluator.cxx | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/orbital_evaluator.cxx b/src/orbital_evaluator.cxx index 30ad0a29..8f909422 100644 --- a/src/orbital_evaluator.cxx +++ b/src/orbital_evaluator.cxx @@ -120,6 +120,20 @@ struct OrbitalEvaluator::Impl { std::vector shell_cutoff_r2; int32_t nbf_ = 0; + // Per-thread scratch for eval_xmat in eval_density. Allocated once + // per evaluator rather than once per eval_density call, saving + // nthreads * nbf^2 * 8 bytes of allocation churn per call. + mutable std::vector> xmat_scratch_pool; + + /// Return a pointer to thread-local xmat scratch of size nbf*nbf. + /// Must be called from inside an omp parallel region whose thread + /// count does not exceed the value of omp_get_max_threads() at + /// construction time. + double* xmat_scratch() const { + auto& buf = xmat_scratch_pool[omp_get_thread_num()]; + return buf.data(); + } + void init(BasisSet bs, ExecutionSpace exec) { if (exec != ExecutionSpace::Host) { GAUXC_GENERIC_EXCEPTION( @@ -148,6 +162,12 @@ struct OrbitalEvaluator::Impl { const double r = basis[s].cutoff_radius(); shell_cutoff_r2[s] = r * r; } + + // Pre-size the scratch pool for the current OMP thread count. + xmat_scratch_pool.resize(omp_get_max_threads()); + for (auto& buf : xmat_scratch_pool) { + buf.resize(static_cast(nbf_) * nbf_); + } } }; @@ -295,9 +315,6 @@ void OrbitalEvaluator::eval_density(int64_t npts, const double* points, { std::vector ao_buf(static_cast(nbf) * batch_size); std::vector dm_ao_buf(static_cast(nbf) * batch_size); - // eval_xmat scratch when the submat-gather path is taken; sized for - // worst case (nbe == nbf). 
- std::vector xmat_scr(static_cast(nbf) * nbf); std::vector screened_shells; screened_shells.reserve(nshells_total); @@ -345,7 +362,7 @@ void OrbitalEvaluator::eval_density(int64_t npts, const double* points, /*P=*/D, /*ldp=*/static_cast(ldd), /*basis_eval=*/ao_buf.data(), /*ldb=*/static_cast(nbe), /*X=*/dm_ao_buf.data(), /*ldx=*/static_cast(nbe), - /*scr=*/xmat_scr.data()); + /*scr=*/pimpl_->xmat_scratch()); // rho[p] = sum_mu ao(mu, p) * dm_ao(mu, p) driver->eval_uvvar_lda_rks( @@ -594,7 +611,6 @@ void OrbitalEvaluator::eval_density(const CubeGrid& grid, std::vector batch_pts(static_cast(batch_size) * 3); std::vector ao_buf(static_cast(nbf) * batch_size); std::vector dm_ao_buf(static_cast(nbf) * batch_size); - std::vector xmat_scr(static_cast(nbf) * nbf); std::vector screened_shells; screened_shells.reserve(nshells_total); @@ -639,7 +655,7 @@ void OrbitalEvaluator::eval_density(const CubeGrid& grid, D, static_cast(ldd), ao_buf.data(), static_cast(nbe), dm_ao_buf.data(), static_cast(nbe), - xmat_scr.data()); + pimpl_->xmat_scratch()); driver->eval_uvvar_lda_rks( static_cast(np), static_cast(nbe), From 40afaaf4439759c969ec426d87ffa664db9e53ff Mon Sep 17 00:00:00 2001 From: Cody Johnston Date: Wed, 29 Apr 2026 20:21:52 +0000 Subject: [PATCH 6/8] Add write_cube_hdf5 for HDF5 cube field output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add write_cube_hdf5() alongside write_cube() for writing cube field data to HDF5 files. Guarded behind GAUXC_HAS_HDF5 and uses the existing HighFive dependency. The HDF5 layout stores: /field (nx, ny, nz) float64 — the scalar field /grid/origin (3,) float64 /grid/spacing (3,) float64 /grid/shape (3,) int64 /atoms/Z (natom,) int64 /atoms/coords (natom, 3) float64 /comment string attribute on /field This is useful for downstream analysis tools (h5py, HDFView) that can read the field directly as a numpy array without parsing text. 
Includes a round-trip test verifying field values, grid metadata, atomic geometry, and comment attribute. --- include/gauxc/external/cube.hpp | 30 ++++++++++++ src/external/CMakeLists.txt | 2 +- src/external/cube_hdf5.cxx | 81 ++++++++++++++++++++++++++++++++ tests/orbital_evaluator_test.cxx | 78 ++++++++++++++++++++++++++++++ 4 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 src/external/cube_hdf5.cxx diff --git a/include/gauxc/external/cube.hpp b/include/gauxc/external/cube.hpp index 47037fe4..0cdd8f7f 100644 --- a/include/gauxc/external/cube.hpp +++ b/include/gauxc/external/cube.hpp @@ -100,4 +100,34 @@ void write_cube(const std::string& path, const double* field, const std::string& comment = ""); +#ifdef GAUXC_HAS_HDF5 +/** @brief Write a cube field to an HDF5 file. + * + * Stores the grid specification, molecular geometry, and scalar field in a + * single HDF5 file for downstream analysis. The field is stored as a 3D + * dataset of shape (nx, ny, nz) in row-major order with iz varying fastest + * (matching the cube-file convention). + * + * HDF5 layout: + * /field (nx, ny, nz) float64 — the scalar field + * /grid/origin (3,) float64 + * /grid/spacing (3,) float64 + * /grid/shape (3,) int64 — {nx, ny, nz} + * /atoms/Z (natom,) int64 + * /atoms/coords (natom, 3) float64 + * /comment scalar string attribute on /field + * + * @param path Output path. Parent directory must exist. + * @param mol Molecule (atomic numbers + Cartesian coordinates in Bohr). + * @param grid Grid specification. + * @param field Length-`grid.num_points()` scalar field. + * @param comment Optional comment stored as an attribute on /field. 
+ */ +void write_cube_hdf5(const std::string& path, + const Molecule& mol, + const CubeGrid& grid, + const double* field, + const std::string& comment = ""); +#endif + } // namespace GauXC diff --git a/src/external/CMakeLists.txt b/src/external/CMakeLists.txt index b0add283..9df2df95 100644 --- a/src/external/CMakeLists.txt +++ b/src/external/CMakeLists.txt @@ -35,7 +35,7 @@ if( GAUXC_ENABLE_HDF5 ) FetchContent_MakeAvailable( HighFive ) endif() - target_sources( gauxc PRIVATE hdf5_write.cxx hdf5_read.cxx ) + target_sources( gauxc PRIVATE hdf5_write.cxx hdf5_read.cxx cube_hdf5.cxx ) target_link_libraries( gauxc PUBLIC HighFive ) else() message(WARNING "GAUXC_ENABLE_HDF5 was enabled, but HDF5 was not found, Disabling HDF5 Bindings") diff --git a/src/external/cube_hdf5.cxx b/src/external/cube_hdf5.cxx new file mode 100644 index 00000000..13b8f0d1 --- /dev/null +++ b/src/external/cube_hdf5.cxx @@ -0,0 +1,81 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2026, Microsoft Corporation + * + * All rights reserved. 
+ *
+ * See LICENSE.txt for details
+ */
+#include <gauxc/gauxc_config.hpp>
+#ifdef GAUXC_HAS_HDF5
+
+#include <gauxc/external/cube.hpp>
+#include <gauxc/exceptions.hpp>
+
+#include <highfive/H5File.hpp>
+
+#include <vector>
+
+namespace GauXC {
+
+void write_cube_hdf5(const std::string& path,
+                     const Molecule& mol,
+                     const CubeGrid& grid,
+                     const double* field,
+                     const std::string& comment) {
+  if (grid.num_points() <= 0) {
+    GAUXC_GENERIC_EXCEPTION("write_cube_hdf5: grid has zero points.");
+  }
+  if (field == nullptr) {
+    GAUXC_GENERIC_EXCEPTION("write_cube_hdf5: null field pointer.");
+  }
+
+  HighFive::File file(path, HighFive::File::Overwrite);
+
+  // --- Field: 3D dataset (nx, ny, nz) ---
+  const auto nx = static_cast<size_t>(grid.nx);
+  const auto ny = static_cast<size_t>(grid.ny);
+  const auto nz = static_cast<size_t>(grid.nz);
+
+  auto ds_field = file.createDataSet<double>(
+      "field", HighFive::DataSpace({nx, ny, nz}));
+  ds_field.write_raw(field);
+
+  // Comment attribute on /field.
+  const std::string cmt =
+      comment.empty() ? "Cube field generated by GauXC" : comment;
+  ds_field.createAttribute("comment", cmt);
+
+  // --- Grid metadata ---
+  auto grp_grid = file.createGroup("grid");
+  grp_grid.createDataSet("origin",
+      std::vector<double>(grid.origin.begin(), grid.origin.end()));
+  grp_grid.createDataSet("spacing",
+      std::vector<double>(grid.spacing.begin(), grid.spacing.end()));
+  grp_grid.createDataSet("shape",
+      std::vector<int64_t>{grid.nx, grid.ny, grid.nz});
+
+  // --- Atoms ---
+  const size_t natom = mol.size();
+  auto grp_atoms = file.createGroup("atoms");
+
+  std::vector<int64_t> atomic_numbers(natom);
+  std::vector<double> coords(natom * 3);
+  for (size_t i = 0; i < natom; ++i) {
+    atomic_numbers[i] = static_cast<int64_t>(mol[i].Z.get());
+    coords[3 * i + 0] = mol[i].x;
+    coords[3 * i + 1] = mol[i].y;
+    coords[3 * i + 2] = mol[i].z;
+  }
+
+  grp_atoms.createDataSet("Z", atomic_numbers);
+  grp_atoms.createDataSet<double>("coords",
+      HighFive::DataSpace({natom, 3ul})).write_raw(coords.data());
+}
+
+} // namespace GauXC
+
+#endif // GAUXC_HAS_HDF5
diff --git a/tests/orbital_evaluator_test.cxx b/tests/orbital_evaluator_test.cxx
index 
97562323..82971187 100644 --- a/tests/orbital_evaluator_test.cxx +++ b/tests/orbital_evaluator_test.cxx @@ -24,6 +24,10 @@ #include #include #include +#include +#ifdef GAUXC_HAS_HDF5 +#include +#endif #include "standards.hpp" @@ -330,3 +334,77 @@ TEST_CASE("write_cube agrees with snprintf %13.5E formatting", "[cube]") { std::remove(path.c_str()); } + +#ifdef GAUXC_HAS_HDF5 +TEST_CASE("write_cube_hdf5 round-trip", "[cube]") { + auto mol = make_water(); + auto grid = CubeGrid::from_molecule(mol, 4, 5, 6); + + // Fill a small field with known values. + const int64_t npts = grid.num_points(); + std::vector field(npts); + for (int64_t i = 0; i < npts; ++i) field[i] = 0.01 * i - 0.5; + + const std::string path = "/tmp/_gauxc_test_cube.h5"; + write_cube_hdf5(path, mol, grid, field.data(), "test cube"); + + // Read back and verify. + HighFive::File file(path, HighFive::File::ReadOnly); + + // Field shape and values. + auto ds = file.getDataSet("field"); + auto dims = ds.getDimensions(); + REQUIRE(dims.size() == 3); + CHECK(dims[0] == static_cast(grid.nx)); + CHECK(dims[1] == static_cast(grid.ny)); + CHECK(dims[2] == static_cast(grid.nz)); + + std::vector read_field(npts); + ds.read(read_field.data()); + for (int64_t i = 0; i < npts; ++i) { + CHECK(read_field[i] == Approx(field[i]).epsilon(1e-14)); + } + + // Comment attribute. + std::string cmt; + ds.getAttribute("comment").read(cmt); + CHECK(cmt == "test cube"); + + // Grid metadata. 
+ auto grp_grid = file.getGroup("grid"); + std::vector origin, spacing; + std::vector shape; + grp_grid.getDataSet("origin").read(origin); + grp_grid.getDataSet("spacing").read(spacing); + grp_grid.getDataSet("shape").read(shape); + REQUIRE(origin.size() == 3); + REQUIRE(spacing.size() == 3); + REQUIRE(shape.size() == 3); + for (int k = 0; k < 3; ++k) { + CHECK(origin[k] == Approx(grid.origin[k]).epsilon(1e-14)); + CHECK(spacing[k] == Approx(grid.spacing[k]).epsilon(1e-14)); + } + CHECK(shape[0] == grid.nx); + CHECK(shape[1] == grid.ny); + CHECK(shape[2] == grid.nz); + + // Atoms. + auto grp_atoms = file.getGroup("atoms"); + std::vector Z; + grp_atoms.getDataSet("Z").read(Z); + REQUIRE(Z.size() == mol.size()); + for (size_t i = 0; i < mol.size(); ++i) { + CHECK(Z[i] == static_cast(mol[i].Z.get())); + } + + std::vector coords(mol.size() * 3); + grp_atoms.getDataSet("coords").read(coords.data()); + for (size_t i = 0; i < mol.size(); ++i) { + CHECK(coords[3 * i + 0] == Approx(mol[i].x).epsilon(1e-14)); + CHECK(coords[3 * i + 1] == Approx(mol[i].y).epsilon(1e-14)); + CHECK(coords[3 * i + 2] == Approx(mol[i].z).epsilon(1e-14)); + } + + std::remove(path.c_str()); +} +#endif From 10f59a8f42ae5f66e72c287831980a6448e36aa2 Mon Sep 17 00:00:00 2001 From: Cody Johnston Date: Wed, 29 Apr 2026 20:56:16 +0000 Subject: [PATCH 7/8] Fix portability issues and add CubeGrid overload tests - Guard #include with #ifdef _OPENMP and provide serial fallbacks for omp_get_max_threads / omp_get_thread_num, so the code compiles without OpenMP enabled. - Remove leftover internal optimization labels from comments. - Replace POSIX-only unistd.h / getpid() with a portable temp-path helper to fix Windows builds. - Add test case for CubeGrid eval_orbital / eval_density overloads, verifying they produce identical results to the pointer-based API. 
--- src/orbital_evaluator.cxx | 9 ++++- tests/orbital_evaluator_test.cxx | 67 ++++++++++++++++++++++++++++---- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/src/orbital_evaluator.cxx b/src/orbital_evaluator.cxx index 8f909422..05e7064a 100644 --- a/src/orbital_evaluator.cxx +++ b/src/orbital_evaluator.cxx @@ -18,7 +18,12 @@ #include #include +#ifdef _OPENMP #include +#else +inline int omp_get_max_threads() { return 1; } +inline int omp_get_thread_num() { return 0; } +#endif #include #include @@ -375,7 +380,7 @@ void OrbitalEvaluator::eval_density(int64_t npts, const double* points, } // --------------------------------------------------------------------------- -// CubeGrid overloads (3b): generate per-batch point coordinates on-the-fly, +// CubeGrid overloads: generate per-batch point coordinates on-the-fly, // avoiding the 3*num_points()*8 byte temporary coordinate array. // --------------------------------------------------------------------------- @@ -419,7 +424,7 @@ void OrbitalEvaluator::eval_orbitals(const CubeGrid& grid, const bool use_pipeline = (n_batches < 2 * nthreads); if (use_pipeline) { - // Pipelined evaluation (3d): overlap collocation of batch B+1 with + // Pipelined evaluation: overlap collocation of batch B+1 with // GEMM of batch B using OMP tasks with dependency tags. struct Slot { std::vector pts; diff --git a/tests/orbital_evaluator_test.cxx b/tests/orbital_evaluator_test.cxx index 82971187..de7a6118 100644 --- a/tests/orbital_evaluator_test.cxx +++ b/tests/orbital_evaluator_test.cxx @@ -17,10 +17,9 @@ #include #include #include +#include #include -#include - #include #include #include @@ -38,6 +37,12 @@ using namespace GauXC; namespace { +/// Generate a unique temp path for test files (avoids tmpnam warning). 
+std::string make_temp_path(const char* suffix) { + static int counter = 0; + return std::string("/tmp/gauxc_test_") + std::to_string(++counter) + suffix; +} + /// Build a deterministic PRNG-based set of points within a small bounding box /// around the molecule. Avoids putting samples too close to nuclei to keep /// values numerically well-behaved. @@ -169,6 +174,54 @@ TEST_CASE("OrbitalEvaluator / Water cc-pVDZ matches eval_collocation", } } +TEST_CASE("CubeGrid eval overloads match pointer-based eval", + "[orbital_evaluator]") { + auto mol = make_water(); + auto basis = make_ccpvdz(mol, SphericalType(true)); + for (auto& sh : basis) sh.set_shell_tolerance(1e-12); + const int32_t nbf = basis.nbf(); + OrbitalEvaluator eval(basis); + + // Small grid for fast testing. + auto grid = CubeGrid::from_molecule(mol, 6, 7, 8); + const int64_t npts = grid.num_points(); + auto pts = grid.points(); + + // Random MO coefficients. + std::mt19937 rng(42u); + std::uniform_real_distribution dist(-1.0, 1.0); + std::vector C(static_cast(nbf)); + for (auto& v : C) v = dist(rng); + + SECTION("eval_orbital(grid) matches eval_orbital(npts, points)") { + std::vector ref(static_cast(npts)); + eval.eval_orbital(npts, pts.data(), C.data(), ref.data()); + + std::vector out(static_cast(npts)); + eval.eval_orbital(grid, C.data(), out.data()); + + for (int64_t p = 0; p < npts; ++p) { + CHECK(out[p] == Approx(ref[p]).margin(1e-12)); + } + } + + SECTION("eval_density(grid) matches eval_density(npts, points)") { + // Identity density. 
+ std::vector D(static_cast(nbf) * nbf, 0.0); + for (int32_t i = 0; i < nbf; ++i) D[i * nbf + i] = 1.0; + + std::vector ref(static_cast(npts)); + eval.eval_density(npts, pts.data(), D.data(), nbf, ref.data()); + + std::vector out(static_cast(npts)); + eval.eval_density(grid, D.data(), nbf, out.data()); + + for (int64_t p = 0; p < npts; ++p) { + CHECK(out[p] == Approx(ref[p]).margin(1e-12)); + } + } +} + TEST_CASE("CubeGrid construction and grid-points layout", "[cube]") { auto mol = make_water(); CubeGrid g = CubeGrid::from_molecule(mol, /*nx=*/16, /*ny=*/12, /*nz=*/8, @@ -213,9 +266,8 @@ TEST_CASE("write_cube round-trips header and field data", "[cube]") { std::pow(10.0, (i % 5) - 2); } - // Use a tmp path under /tmp. - const std::string path = std::string("/tmp/gauxc_cube_test_") + - std::to_string(::getpid()) + ".cube"; + // Use a tmp path. + const std::string path = make_temp_path(".cube"); write_cube(path, mol, grid, field.data(), "Test cube"); // Parse back. @@ -303,8 +355,7 @@ TEST_CASE("write_cube agrees with snprintf %13.5E formatting", "[cube]") { -2.71828, 1.0, -1.0, 1e-300, 1.234e+05}; - const std::string path = std::string("/tmp/gauxc_cube_format_") + - std::to_string(::getpid()) + ".cube"; + const std::string path = make_temp_path(".cube"); write_cube(path, mol, grid, field.data(), "fmt"); std::ifstream in(path); @@ -345,7 +396,7 @@ TEST_CASE("write_cube_hdf5 round-trip", "[cube]") { std::vector field(npts); for (int64_t i = 0; i < npts; ++i) field[i] = 0.01 * i - 0.5; - const std::string path = "/tmp/_gauxc_test_cube.h5"; + const std::string path = make_temp_path(".h5"); write_cube_hdf5(path, mol, grid, field.data(), "test cube"); // Read back and verify. 
From 2a1f46e9a658151ea53850d01053b286e9ff5582 Mon Sep 17 00:00:00 2001 From: Conrad Johnston Date: Mon, 4 May 2026 10:46:41 -0700 Subject: [PATCH 8/8] Tests: guard cube-file I/O cases against MPI rank collisions The new orbital_evaluator_test cases that round-trip cube/HDF5 files all wrote to /tmp paths derived from a per-process static counter. Under GAUXC_MPI_TEST (mpiexec -n 2) both ranks share /tmp and produced identical paths, racing on fopen("w")/fwrite/ifstream/std::remove and causing assertion failures. Match the pattern used in moltypes_test / collocation / runtime: gate the I/O TEST_CASEs on rank 0 via a small is_root_rank() helper that uses MPI_Initialized to stay safe when MPI is disabled. Also include the PID in make_temp_path() as a defensive measure for any future I/O test that might forget the guard. The pure-compute TEST_CASEs are left to run on all ranks (they have no shared state). --- tests/orbital_evaluator_test.cxx | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tests/orbital_evaluator_test.cxx b/tests/orbital_evaluator_test.cxx index de7a6118..30ec1f54 100644 --- a/tests/orbital_evaluator_test.cxx +++ b/tests/orbital_evaluator_test.cxx @@ -20,6 +20,8 @@ #include #include +#include // getpid + #include #include #include @@ -27,6 +29,9 @@ #ifdef GAUXC_HAS_HDF5 #include #endif +#ifdef GAUXC_HAS_MPI +#include +#endif #include "standards.hpp" @@ -38,9 +43,29 @@ using namespace GauXC; namespace { /// Generate a unique temp path for test files (avoids tmpnam warning). +/// Includes the PID so concurrent MPI ranks (which share /tmp on a single +/// node) do not collide on the same file. 
std::string make_temp_path(const char* suffix) { static int counter = 0; - return std::string("/tmp/gauxc_test_") + std::to_string(++counter) + suffix; + return std::string("/tmp/gauxc_test_") + std::to_string(::getpid()) + + "_" + std::to_string(++counter) + suffix; +} + +/// Returns true on the root MPI rank (or always when MPI is disabled). +/// I/O test cases below run only on rank 0 to avoid concurrent ranks racing +/// on the same /tmp file paths during MPI test runs. +bool is_root_rank() { +#ifdef GAUXC_HAS_MPI + int rank = 0; + int initialized = 0; + MPI_Initialized(&initialized); + if (initialized) { + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + } + return rank == 0; +#else + return true; +#endif } /// Build a deterministic PRNG-based set of points within a small bounding box @@ -256,6 +281,7 @@ TEST_CASE("CubeGrid construction and grid-points layout", "[cube]") { } TEST_CASE("write_cube round-trips header and field data", "[cube]") { + if (!is_root_rank()) return; // I/O on /tmp; avoid MPI rank collisions. auto mol = make_water(); CubeGrid grid = CubeGrid::from_molecule(mol, /*nx=*/5, /*ny=*/4, /*nz=*/7, /*margin=*/2.0); @@ -339,6 +365,7 @@ TEST_CASE("write_cube round-trips header and field data", "[cube]") { } TEST_CASE("write_cube agrees with snprintf %13.5E formatting", "[cube]") { + if (!is_root_rank()) return; // I/O on /tmp; avoid MPI rank collisions. // Spot-check the custom formatter against snprintf for a battery of values // by writing a tiny cube file and parsing it back. This is a stronger check // than the round-trip above since we compare the byte stream. @@ -388,6 +415,7 @@ TEST_CASE("write_cube agrees with snprintf %13.5E formatting", "[cube]") { #ifdef GAUXC_HAS_HDF5 TEST_CASE("write_cube_hdf5 round-trip", "[cube]") { + if (!is_root_rank()) return; // I/O on /tmp; avoid MPI rank collisions. auto mol = make_water(); auto grid = CubeGrid::from_molecule(mol, 4, 5, 6);