From 7d7684715e3bad004794223db8b9db2b94f7b5c2 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 18 May 2026 23:12:58 +0200 Subject: [PATCH 1/8] refactor(codec, engine): introduce Redactions collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reworks the codec *Transform layer to take a generic Redactions collection keyed by span identity instead of a flat slice. New types in nvisy-codec/transform/: - `Redactions` (redactions.rs): groups payloads by span, with overlap detection on insert. Consumed via IntoIterator; no raw map access. - `Mergeable` (mergeable.rs): trait for the redaction payload `R`. `overlaps()` for detection, `try_merge()` for merging with honest failure semantics (returns `None` when outputs differ). - `ConflictPolicy` (policy.rs): Reject / Merge / Replace. Merge falls back to `InsertError::NotMergeable` when `try_merge` returns `None`, rather than picking a magic default. *Redaction structs lose their span_id field: - TextRedaction { start, end, output } - ImageRedaction { bounding_box, output } - AudioRedaction { time_span, output } - TabularRedaction { start, end, output } - Each gets a `::new()` constructor. - Mergeable impls reuse ontology primitives' overlaps()/union(). Transform traits now take `Redactions` by value: - TextTransform::redact_text - ImageTransform::redact_images - AudioTransform::redact_audio - TabularTransform::redact_tabular Transforms iterate `for (loc, mut items) in redactions` instead of re-grouping a flat slice. Overlap checking is no longer duplicated per handler — the collection enforces it on insert. Engine apply.rs builds redactions via `try_insert`; insertion failures surface as validation errors with the rejected/unmergeable reason. Tests use `*Redaction::new()` and `TabularLocationBuilder`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + crates/nvisy-codec/Cargo.toml | 1 + crates/nvisy-codec/src/document/mod.rs | 8 +- .../src/transform/audio/instruction.rs | 43 ++- .../src/transform/audio/transform.rs | 9 +- .../src/transform/image/instruction.rs | 47 ++- .../src/transform/image/transform.rs | 81 ++--- crates/nvisy-codec/src/transform/mergeable.rs | 25 ++ crates/nvisy-codec/src/transform/mod.rs | 6 + crates/nvisy-codec/src/transform/policy.rs | 41 +++ .../nvisy-codec/src/transform/redactions.rs | 276 ++++++++++++++++++ .../src/transform/tabular/instruction.rs | 51 +++- .../src/transform/tabular/transform.rs | 171 +++++------ .../src/transform/text/instruction.rs | 56 +++- .../src/transform/text/transform.rs | 166 +++++------ .../src/operation/envelope/document.rs | 12 +- .../src/operation/redaction/apply.rs | 84 +++--- 17 files changed, 779 insertions(+), 299 deletions(-) create mode 100644 crates/nvisy-codec/src/transform/mergeable.rs create mode 100644 crates/nvisy-codec/src/transform/policy.rs create mode 100644 crates/nvisy-codec/src/transform/redactions.rs diff --git a/Cargo.lock b/Cargo.lock index 941e1dbd..00a7a003 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3015,6 +3015,7 @@ dependencies = [ "scraper", "serde", "serde_json", + "thiserror 2.0.18", "tokio", "tracing", "uuid", diff --git a/crates/nvisy-codec/Cargo.toml b/crates/nvisy-codec/Cargo.toml index 4bfe74bb..00142816 100644 --- a/crates/nvisy-codec/Cargo.toml +++ b/crates/nvisy-codec/Cargo.toml @@ -57,6 +57,7 @@ hipstr = { workspace = true, features = [] } # Derive macros and error handling derive_more = { workspace = true, features = ["as_ref", "deref", "deref_mut", "display", "from"] } +thiserror = { workspace = true, features = [] } # Image processing image = { workspace = true, features = [] } diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index b44a3912..22866f58 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ 
b/crates/nvisy-codec/src/document/mod.rs @@ -23,7 +23,7 @@ use crate::handler::{ Mp3Params, PngLoader, PngParams, TextData, TextHandler, TiffLoader, TiffParams, TxtLoader, TxtParams, WavLoader, WavParams, XlsxLoader, XlsxParams, }; -use crate::transform::{AudioRedaction, ImageRedaction, TextRedaction}; +use crate::transform::{AudioRedaction, ImageRedaction, Redactions, TextRedaction}; /// A fully type-erased document that can hold any supported format. /// @@ -134,7 +134,7 @@ impl ContentHandle { /// Apply a batch of text redactions to the document. pub async fn apply_text_redactions( &mut self, - redactions: &[TextRedaction], + redactions: Redactions, ) -> Result<(), Error> { use crate::transform::TextTransform; match self { @@ -147,7 +147,7 @@ impl ContentHandle { /// Apply a batch of image redactions to the document. pub async fn apply_image_redactions( &mut self, - redactions: &[ImageRedaction], + redactions: Redactions, ) -> Result<(), Error> { use crate::transform::ImageTransform; match self { @@ -160,7 +160,7 @@ impl ContentHandle { /// Apply a batch of audio redactions to the document. pub async fn apply_audio_redactions( &mut self, - redactions: &[AudioRedaction], + redactions: Redactions, ) -> Result<(), Error> { use crate::transform::AudioTransform; match self { diff --git a/crates/nvisy-codec/src/transform/audio/instruction.rs b/crates/nvisy-codec/src/transform/audio/instruction.rs index c14e334d..cb2396f6 100644 --- a/crates/nvisy-codec/src/transform/audio/instruction.rs +++ b/crates/nvisy-codec/src/transform/audio/instruction.rs @@ -4,17 +4,29 @@ use nvisy_ontology::primitive::TimeSpan; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -/// A located audio redaction: pairs a span identifier and time range -/// with an [`AudioOutput`] that carries the method-specific parameters. -pub struct AudioRedaction { - /// Which audio span this redaction targets. 
- pub span_id: S, +use crate::transform::Mergeable; + +/// An audio redaction targeting a time range within its containing span. +/// +/// Span identity is supplied externally via [`Redactions`] — this +/// struct only carries the time span and the rendering method. +/// +/// [`Redactions`]: crate::transform::Redactions +#[derive(Debug, Clone, PartialEq)] +pub struct AudioRedaction { /// Time interval of the segment to redact. pub time_span: TimeSpan, /// The redaction output that determines the rendering method. pub output: AudioOutput, } +impl AudioRedaction { + /// Create a new audio redaction. + pub fn new(time_span: TimeSpan, output: AudioOutput) -> Self { + Self { time_span, output } + } +} + /// Audio redaction output — records the method used. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] @@ -26,3 +38,24 @@ pub enum AudioOutput { /// Segment replaced with provided audio data. Replace { data: Vec }, } + +impl Mergeable for AudioRedaction { + fn overlaps(&self, other: &Self) -> bool { + self.time_span.overlaps(&other.time_span) + } + + /// Merge two overlapping audio redactions. + /// + /// Returns `Some` only when both share the same [`AudioOutput`] + /// (method *and* parameters) — the merged redaction unions the + /// time spans. Returns `None` when the methods differ. 
+ fn try_merge(self, other: Self) -> Option { + if self.output != other.output { + return None; + } + Some(Self { + time_span: self.time_span.union(&other.time_span), + output: self.output, + }) + } +} diff --git a/crates/nvisy-codec/src/transform/audio/transform.rs b/crates/nvisy-codec/src/transform/audio/transform.rs index 5760b773..402371d6 100644 --- a/crates/nvisy-codec/src/transform/audio/transform.rs +++ b/crates/nvisy-codec/src/transform/audio/transform.rs @@ -5,6 +5,7 @@ use nvisy_ontology::entity::AudioLocation; use super::instruction::AudioRedaction; use crate::handler::AudioHandler; +use crate::transform::Redactions; const TARGET: &str = "nvisy_codec::transform::audio"; @@ -12,9 +13,13 @@ const TARGET: &str = "nvisy_codec::transform::audio"; #[async_trait::async_trait] pub trait AudioTransform: AudioHandler { /// Apply a batch of audio redactions, mutating in place. + /// + /// Redactions are grouped by [`AudioLocation`] span in the input + /// [`Redactions`] collection. Time-span overlaps within a span are + /// resolved by the collection on insert. async fn redact_audio( &mut self, - redactions: &[AudioRedaction], + redactions: Redactions, ) -> Result<(), Error>; } @@ -22,7 +27,7 @@ pub trait AudioTransform: AudioHandler { impl AudioTransform for H { async fn redact_audio( &mut self, - redactions: &[AudioRedaction], + redactions: Redactions, ) -> Result<(), Error> { tracing::debug!( target: TARGET, diff --git a/crates/nvisy-codec/src/transform/image/instruction.rs b/crates/nvisy-codec/src/transform/image/instruction.rs index b15ba5f0..19edba35 100644 --- a/crates/nvisy-codec/src/transform/image/instruction.rs +++ b/crates/nvisy-codec/src/transform/image/instruction.rs @@ -4,17 +4,32 @@ use nvisy_ontology::primitive::{BoundingBox, Color}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -/// A located image redaction: pairs a span identifier, bounding box, -/// and an [`ImageOutput`] that carries the method-specific parameters. 
-pub struct ImageRedaction { - /// Which image span this redaction targets. - pub span_id: S, +use crate::transform::Mergeable; + +/// An image redaction targeting a bounding box within its containing span. +/// +/// Span identity is supplied externally via [`Redactions`] — this +/// struct only carries the bounding box and the rendering method. +/// +/// [`Redactions`]: crate::transform::Redactions +#[derive(Debug, Clone, PartialEq)] +pub struct ImageRedaction { /// Bounding box of the region to redact within the span. pub bounding_box: BoundingBox, /// The redaction output that determines the rendering method. pub output: ImageOutput, } +impl ImageRedaction { + /// Create a new image redaction. + pub fn new(bounding_box: BoundingBox, output: ImageOutput) -> Self { + Self { + bounding_box, + output, + } + } +} + /// Image redaction output: records the method used and its parameters. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] @@ -28,3 +43,25 @@ pub enum ImageOutput { /// Region replaced with provided image data. Replace { data: Vec }, } + +impl Mergeable for ImageRedaction { + fn overlaps(&self, other: &Self) -> bool { + self.bounding_box.overlaps(&other.bounding_box) + } + + /// Merge two overlapping image redactions. + /// + /// Returns `Some` only when both share the same [`ImageOutput`] + /// (method *and* parameters) — the merged redaction unions the + /// bounding boxes. Returns `None` when the methods differ (e.g. + /// `Blur { sigma: 5.0 }` vs `Pixelate { block_size: 10 }`). 
+ fn try_merge(self, other: Self) -> Option { + if self.output != other.output { + return None; + } + Some(Self { + bounding_box: self.bounding_box.union(&other.bounding_box), + output: self.output, + }) + } +} diff --git a/crates/nvisy-codec/src/transform/image/transform.rs b/crates/nvisy-codec/src/transform/image/transform.rs index fb0c2f4d..04b98061 100644 --- a/crates/nvisy-codec/src/transform/image/transform.rs +++ b/crates/nvisy-codec/src/transform/image/transform.rs @@ -11,6 +11,7 @@ use super::instruction::{ImageOutput, ImageRedaction}; use super::ops::ImageOps; use crate::document::{Span, SpanStream}; use crate::handler::{ImageData, ImageHandler}; +use crate::transform::Redactions; const TARGET: &str = "nvisy_codec::transform::image"; @@ -18,9 +19,13 @@ const TARGET: &str = "nvisy_codec::transform::image"; #[async_trait::async_trait] pub trait ImageTransform: ImageHandler { /// Apply a batch of image redactions, mutating in place. + /// + /// Redactions are grouped by [`ImageLocation`] span in the input + /// [`Redactions`] collection. Bounding-box overlaps within a span + /// are resolved by the collection on insert. 
async fn redact_images( &mut self, - redactions: &[ImageRedaction], + redactions: Redactions, ) -> Result<(), Error>; } @@ -28,7 +33,7 @@ pub trait ImageTransform: ImageHandler { impl ImageTransform for H { async fn redact_images( &mut self, - redactions: &[ImageRedaction], + redactions: Redactions, ) -> Result<(), Error> { tracing::debug!( target: TARGET, @@ -48,37 +53,47 @@ impl ImageTransform for H { let image_data: ImageData = span.data; let mut img: DynamicImage = image_data.into_inner(); - for redaction in redactions { - let region = redaction.bounding_box.to_pixel(); - match &redaction.output { - ImageOutput::Blur { sigma } => { - img.apply_gaussian_blur(&region, *sigma); - } - ImageOutput::Block { color } => { - img.apply_block_overlay(&region, *color); - } - ImageOutput::Pixelate { block_size } => { - img.apply_pixelate(&region, *block_size); - } - ImageOutput::Replace { data } => { - let replacement = match image::load_from_memory(data) { - Ok(r) => r, - Err(e) => { - tracing::warn!( - target: TARGET, - region = ?region, - error = %e, - "failed to decode replacement image data, skipping region" - ); - continue; - } - }; - let resized = replacement.resize_exact( - region.width, - region.height, - image::imageops::FilterType::Lanczos3, - ); - image::imageops::overlay(&mut img, &resized, region.x as i64, region.y as i64); + // Image handlers expose a single span; apply every redaction in + // the collection to that one image. The collection's grouping + // is preserved but does not gate application here. 
+ for (_loc, items) in redactions { + for redaction in items { + let region = redaction.bounding_box.to_pixel(); + match &redaction.output { + ImageOutput::Blur { sigma } => { + img.apply_gaussian_blur(&region, *sigma); + } + ImageOutput::Block { color } => { + img.apply_block_overlay(&region, *color); + } + ImageOutput::Pixelate { block_size } => { + img.apply_pixelate(&region, *block_size); + } + ImageOutput::Replace { data } => { + let replacement = match image::load_from_memory(data) { + Ok(r) => r, + Err(e) => { + tracing::warn!( + target: TARGET, + region = ?region, + error = %e, + "failed to decode replacement image data, skipping region" + ); + continue; + } + }; + let resized = replacement.resize_exact( + region.width, + region.height, + image::imageops::FilterType::Lanczos3, + ); + image::imageops::overlay( + &mut img, + &resized, + region.x as i64, + region.y as i64, + ); + } } } } diff --git a/crates/nvisy-codec/src/transform/mergeable.rs b/crates/nvisy-codec/src/transform/mergeable.rs new file mode 100644 index 00000000..d01797c7 --- /dev/null +++ b/crates/nvisy-codec/src/transform/mergeable.rs @@ -0,0 +1,25 @@ +//! [`Mergeable`] trait — overlap detection and merge semantics for +//! redaction payloads. + +/// Trait for redactions that have a comparable extent within a span. +/// +/// Required by [`Redactions`] to detect overlap on insert and to +/// produce a merged redaction under [`ConflictPolicy::Merge`]. +/// +/// [`Redactions`]: crate::transform::Redactions +/// [`ConflictPolicy::Merge`]: crate::transform::ConflictPolicy::Merge +pub trait Mergeable: Sized { + /// Returns `true` when this redaction overlaps with `other`. + /// + /// Both redactions are assumed to live in the same span — callers + /// must already group by span identity before calling this. + fn overlaps(&self, other: &Self) -> bool; + + /// Try to combine two overlapping redactions into one. + /// + /// Returns `Some(merged)` when the redactions can be meaningfully + /// combined (e.g. 
same replacement output, unioned extents). + /// Returns `None` when they overlap but cannot be reconciled + /// (e.g. different replacement strings, different image methods). + fn try_merge(self, other: Self) -> Option; +} diff --git a/crates/nvisy-codec/src/transform/mod.rs b/crates/nvisy-codec/src/transform/mod.rs index c9dacbc2..d5fd546c 100644 --- a/crates/nvisy-codec/src/transform/mod.rs +++ b/crates/nvisy-codec/src/transform/mod.rs @@ -2,10 +2,16 @@ mod audio; mod image; +mod mergeable; +mod policy; +mod redactions; mod tabular; mod text; pub use self::audio::{AudioOutput, AudioRedaction, AudioTransform}; pub use self::image::{ImageOutput, ImageRedaction, ImageTransform}; +pub use self::mergeable::Mergeable; +pub use self::policy::{ConflictPolicy, InsertError}; +pub use self::redactions::Redactions; pub use self::tabular::{TabularRedaction, TabularTransform}; pub use self::text::{TextOutput, TextRedaction, TextTransform}; diff --git a/crates/nvisy-codec/src/transform/policy.rs b/crates/nvisy-codec/src/transform/policy.rs new file mode 100644 index 00000000..2ab2ace4 --- /dev/null +++ b/crates/nvisy-codec/src/transform/policy.rs @@ -0,0 +1,41 @@ +//! Conflict resolution policy for [`Redactions`]. +//! +//! [`Redactions`]: crate::transform::Redactions + +use thiserror::Error; + +/// How [`Redactions`] resolves overlapping insertions within a span. +/// +/// [`Redactions`]: crate::transform::Redactions +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub enum ConflictPolicy { + /// Refuse to insert a redaction that overlaps with an existing one. + #[default] + Reject, + /// Attempt to merge overlapping redactions via + /// [`Mergeable::try_merge`]. + /// + /// When `try_merge` returns `None`, insertion fails with + /// [`InsertError::NotMergeable`]. + /// + /// [`Mergeable::try_merge`]: crate::transform::Mergeable::try_merge + Merge, + /// Drop the existing redaction and replace it with the new one. 
+ Replace, +} + +/// Error returned by [`Redactions::try_insert`] when a conflict cannot +/// be resolved under the configured [`ConflictPolicy`]. +/// +/// [`Redactions::try_insert`]: crate::transform::Redactions::try_insert +#[derive(Debug, Error)] +pub enum InsertError { + /// [`ConflictPolicy::Reject`] is active and the new redaction + /// overlaps with an existing one. + #[error("overlapping redaction rejected")] + OverlapRejected, + /// [`ConflictPolicy::Merge`] is active but the two redactions + /// overlap and cannot be merged (e.g. different outputs). + #[error("overlapping redactions cannot be merged")] + NotMergeable, +} diff --git a/crates/nvisy-codec/src/transform/redactions.rs b/crates/nvisy-codec/src/transform/redactions.rs new file mode 100644 index 00000000..583756da --- /dev/null +++ b/crates/nvisy-codec/src/transform/redactions.rs @@ -0,0 +1,276 @@ +//! Generic [`Redactions`] collection keyed by span identity. +//! +//! Stores redactions grouped by their target span, with overlap +//! detection on insert. The [`ConflictPolicy`] decides what happens +//! when two redactions overlap within the same span. See the +//! [`mergeable`] and [`policy`] modules for the traits and types +//! that govern collection behavior. +//! +//! Transforms consume a `Redactions` instead of a flat slice, so the +//! grouping + overlap-checking work is done once at the engine +//! boundary rather than re-done in each handler. The collection does +//! not expose raw map access: callers consume the collection via +//! [`IntoIterator`]. +//! +//! [`Redactions`]: crate::transform::Redactions +//! [`ConflictPolicy`]: crate::transform::ConflictPolicy +//! [`mergeable`]: crate::transform::mergeable +//! [`policy`]: crate::transform::policy + +use std::fmt; + +use super::mergeable::Mergeable; +use super::policy::{ConflictPolicy, InsertError}; + +/// A set of redactions grouped by their target span, with overlap +/// detection on insert. +/// +/// `S` is the span identity (e.g. 
[`TextLocation`], [`ImageLocation`]) +/// and must implement [`PartialEq`] so the collection can find the +/// span an inserted redaction belongs to. +/// +/// `R` is the per-span redaction payload (e.g. [`TextRedaction`]) +/// and must implement [`Mergeable`] to support overlap detection. +/// +/// Internally backed by a `Vec<(S, Vec)>` — span counts are +/// typically small and insertion order matters for deterministic +/// downstream behavior, so a `HashMap` would be more cost than +/// benefit and would not work for keys with `f64` fields anyway. +/// +/// [`TextLocation`]: nvisy_ontology::entity::TextLocation +/// [`ImageLocation`]: nvisy_ontology::entity::ImageLocation +/// [`TextRedaction`]: crate::transform::TextRedaction +pub struct Redactions { + policy: ConflictPolicy, + spans: Vec<(S, Vec)>, +} + +impl Redactions { + /// Create an empty collection with the given conflict policy. + pub fn new(policy: ConflictPolicy) -> Self { + Self { + policy, + spans: Vec::new(), + } + } + + /// The conflict policy in effect. + pub fn policy(&self) -> ConflictPolicy { + self.policy + } + + /// Total number of redactions across all spans. + pub fn len(&self) -> usize { + self.spans.iter().map(|(_, rs)| rs.len()).sum() + } + + /// Number of distinct spans that hold redactions. + pub fn span_count(&self) -> usize { + self.spans.len() + } + + /// Returns `true` if the collection holds no redactions. + pub fn is_empty(&self) -> bool { + self.spans.iter().all(|(_, rs)| rs.is_empty()) + } +} + +impl Redactions +where + S: PartialEq, + R: Mergeable, +{ + /// Insert a redaction targeting the given span. + /// + /// If the span already holds an overlapping redaction, behavior + /// is determined by the configured [`ConflictPolicy`]: + /// + /// - [`Reject`]: returns [`InsertError::OverlapRejected`]. + /// - [`Merge`]: attempts to merge; returns + /// [`InsertError::NotMergeable`] when the merge fails. 
+ /// - [`Replace`]: drops the existing overlapping redaction and + /// inserts the new one. + /// + /// [`Reject`]: ConflictPolicy::Reject + /// [`Merge`]: ConflictPolicy::Merge + /// [`Replace`]: ConflictPolicy::Replace + pub fn try_insert(&mut self, span: S, redaction: R) -> Result<(), InsertError> { + let bucket = match self.spans.iter().position(|(s, _)| s == &span) { + Some(idx) => &mut self.spans[idx].1, + None => { + self.spans.push((span, vec![redaction])); + return Ok(()); + } + }; + + let overlap_idx = bucket.iter().position(|r| r.overlaps(&redaction)); + let Some(idx) = overlap_idx else { + bucket.push(redaction); + return Ok(()); + }; + + match self.policy { + ConflictPolicy::Reject => Err(InsertError::OverlapRejected), + ConflictPolicy::Replace => { + bucket[idx] = redaction; + Ok(()) + } + ConflictPolicy::Merge => { + let existing = bucket.remove(idx); + match existing.try_merge(redaction) { + Some(merged) => { + bucket.push(merged); + Ok(()) + } + None => Err(InsertError::NotMergeable), + } + } + } + } +} + +impl Default for Redactions { + fn default() -> Self { + Self::new(ConflictPolicy::default()) + } +} + +impl IntoIterator for Redactions { + type IntoIter = std::vec::IntoIter<(S, Vec)>; + type Item = (S, Vec); + + /// Consume the collection, yielding each span paired with its + /// owned redactions in insertion order. 
+ fn into_iter(self) -> Self::IntoIter { + self.spans.into_iter() + } +} + +impl fmt::Debug for Redactions { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Redactions") + .field("policy", &self.policy) + .field("spans", &self.spans.len()) + .field("redactions", &self.len()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Debug, Clone, PartialEq)] + struct R { + start: usize, + end: usize, + out: String, + } + + impl R { + fn new(start: usize, end: usize, out: &str) -> Self { + Self { + start, + end, + out: out.to_owned(), + } + } + } + + impl Mergeable for R { + fn overlaps(&self, other: &Self) -> bool { + self.start < other.end && other.start < self.end + } + + fn try_merge(self, other: Self) -> Option { + if self.out != other.out { + return None; + } + Some(R { + start: self.start.min(other.start), + end: self.end.max(other.end), + out: self.out, + }) + } + } + + #[test] + fn insert_into_new_span() { + let mut rs = Redactions::::new(ConflictPolicy::Reject); + rs.try_insert(0, R::new(0, 5, "x")).unwrap(); + assert_eq!(rs.len(), 1); + assert_eq!(rs.span_count(), 1); + } + + #[test] + fn insert_non_overlapping_into_same_span() { + let mut rs = Redactions::::new(ConflictPolicy::Reject); + rs.try_insert(0, R::new(0, 5, "x")).unwrap(); + rs.try_insert(0, R::new(10, 15, "y")).unwrap(); + assert_eq!(rs.len(), 2); + assert_eq!(rs.span_count(), 1); + } + + #[test] + fn insert_into_different_spans() { + let mut rs = Redactions::::new(ConflictPolicy::Reject); + rs.try_insert(0, R::new(0, 5, "x")).unwrap(); + rs.try_insert(1, R::new(0, 5, "x")).unwrap(); + assert_eq!(rs.span_count(), 2); + } + + #[test] + fn reject_policy_errors_on_overlap() { + let mut rs = Redactions::::new(ConflictPolicy::Reject); + rs.try_insert(0, R::new(0, 5, "x")).unwrap(); + let err = rs.try_insert(0, R::new(3, 8, "y")).unwrap_err(); + assert!(matches!(err, InsertError::OverlapRejected)); + assert_eq!(rs.len(), 1); + } + + #[test] + fn 
replace_policy_overwrites_overlap() { + let mut rs = Redactions::::new(ConflictPolicy::Replace); + rs.try_insert(0, R::new(0, 5, "x")).unwrap(); + rs.try_insert(0, R::new(3, 8, "y")).unwrap(); + assert_eq!(rs.len(), 1); + let (_, items) = rs.into_iter().next().unwrap(); + assert_eq!(items[0].out, "y"); + } + + #[test] + fn merge_policy_combines_same_output() { + let mut rs = Redactions::::new(ConflictPolicy::Merge); + rs.try_insert(0, R::new(0, 5, "x")).unwrap(); + rs.try_insert(0, R::new(3, 8, "x")).unwrap(); + assert_eq!(rs.len(), 1); + let (_, items) = rs.into_iter().next().unwrap(); + assert_eq!(items[0].start, 0); + assert_eq!(items[0].end, 8); + } + + #[test] + fn merge_policy_errors_when_unmergeable() { + let mut rs = Redactions::::new(ConflictPolicy::Merge); + rs.try_insert(0, R::new(0, 5, "x")).unwrap(); + let err = rs.try_insert(0, R::new(3, 8, "y")).unwrap_err(); + assert!(matches!(err, InsertError::NotMergeable)); + } + + #[test] + fn into_iter_preserves_insertion_order() { + let mut rs = Redactions::::new(ConflictPolicy::Reject); + rs.try_insert(2, R::new(0, 5, "x")).unwrap(); + rs.try_insert(0, R::new(0, 5, "x")).unwrap(); + rs.try_insert(1, R::new(0, 5, "x")).unwrap(); + let spans: Vec = rs.into_iter().map(|(s, _)| s).collect(); + assert_eq!(spans, vec![2, 0, 1]); + } + + #[test] + fn empty_and_len() { + let rs = Redactions::::new(ConflictPolicy::Reject); + assert!(rs.is_empty()); + assert_eq!(rs.len(), 0); + } +} diff --git a/crates/nvisy-codec/src/transform/tabular/instruction.rs b/crates/nvisy-codec/src/transform/tabular/instruction.rs index 78da1d36..f0432a80 100644 --- a/crates/nvisy-codec/src/transform/tabular/instruction.rs +++ b/crates/nvisy-codec/src/transform/tabular/instruction.rs @@ -1,22 +1,22 @@ //! Tabular redaction instruction types. 
-use nvisy_ontology::entity::TabularLocation; +use crate::transform::{Mergeable, TextOutput}; -use super::super::TextOutput; - -/// A located tabular redaction: pairs a [`TabularLocation`] (row/col -/// cell address) with intra-cell byte offsets and a [`TextOutput`] -/// that carries the replacement. +/// A tabular redaction targeting a byte range within its containing cell. +/// +/// Cell identity is supplied externally via [`Redactions`] — this +/// struct only carries the intra-cell byte range and the replacement +/// output. /// /// This is the tabular counterpart of [`TextRedaction`]: instead of -/// identifying a span by byte offsets into the serialized form, it -/// addresses a cell by `(row_index, column_index)` and specifies -/// where within that cell's content the redaction applies. +/// being grouped by a text span, it is grouped by a [`TabularLocation`] +/// cell. /// +/// [`Redactions`]: crate::transform::Redactions /// [`TextRedaction`]: crate::transform::TextRedaction +/// [`TabularLocation`]: nvisy_ontology::entity::TabularLocation +#[derive(Debug, Clone, PartialEq)] pub struct TabularRedaction { - /// Which cell this redaction targets. - pub location: TabularLocation, /// Byte offset where the redacted region starts within the cell value. pub start: usize, /// Byte offset where the redacted region ends (exclusive) within the cell value. @@ -24,3 +24,32 @@ pub struct TabularRedaction { /// The redaction output that carries the replacement value. pub output: TextOutput, } + +impl TabularRedaction { + /// Create a new tabular redaction. + pub fn new(start: usize, end: usize, output: TextOutput) -> Self { + Self { start, end, output } + } +} + +impl Mergeable for TabularRedaction { + fn overlaps(&self, other: &Self) -> bool { + self.start < other.end && other.start < self.end + } + + /// Merge two overlapping tabular redactions. + /// + /// Returns `Some` only when both share the same [`TextOutput`] — + /// the merged redaction unions the byte ranges. 
Returns `None` + /// when the outputs differ. + fn try_merge(self, other: Self) -> Option { + if self.output != other.output { + return None; + } + Some(Self { + start: self.start.min(other.start), + end: self.end.max(other.end), + output: self.output, + }) + } +} diff --git a/crates/nvisy-codec/src/transform/tabular/transform.rs b/crates/nvisy-codec/src/transform/tabular/transform.rs index f58afdaa..ebeb8174 100644 --- a/crates/nvisy-codec/src/transform/tabular/transform.rs +++ b/crates/nvisy-codec/src/transform/tabular/transform.rs @@ -3,22 +3,26 @@ //! Bridges [`TabularRedaction`] (cell-addressed by row/col) to the //! underlying [`TextHandler`] (byte-offset-addressed spans). //! -//! The blanket implementation collects text spans, builds a row/col -//! grid from `line_number`, maps each [`TabularRedaction`] to the -//! corresponding text span, applies intra-cell byte-offset +//! The blanket implementation walks the per-cell groups in a +//! [`Redactions`] collection, locates each cell's text span via +//! `line_number`/column-position, applies intra-cell byte-offset //! replacements right-to-left, and writes results back via //! [`TextHandler::edit_text`]. +//! +//! Overlap detection is owned by [`Redactions`]; this transform +//! trusts that ranges within a single cell do not overlap. use std::cmp::Reverse; use std::collections::HashMap; use futures::StreamExt; use nvisy_core::Error; -use nvisy_ontology::entity::TextLocation; +use nvisy_ontology::entity::{TabularLocation, TextLocation}; use super::instruction::TabularRedaction; use crate::document::{Span, SpanStream}; use crate::handler::{TextData, TextHandler}; +use crate::transform::Redactions; const TARGET: &str = "nvisy_codec::transform::tabular"; @@ -27,20 +31,26 @@ const TARGET: &str = "nvisy_codec::transform::tabular"; /// /// Implemented automatically for all [`TextHandler`] types via a /// blanket impl. 
The trait translates `(row, col)` cell addresses -/// into the byte-offset `TextLocation`s that the handler understands. +/// into the byte-offset [`TextLocation`]s that the handler understands. #[async_trait::async_trait] pub trait TabularTransform: TextHandler { /// Apply a batch of cell-addressed redactions, mutating in place. /// - /// Each [`TabularRedaction`] identifies a cell by - /// [`TabularLocation`](nvisy_ontology::entity::TabularLocation) - /// and an intra-cell byte range with a replacement value. - async fn redact_tabular(&mut self, redactions: &[TabularRedaction]) -> Result<(), Error>; + /// Redactions are grouped by [`TabularLocation`] cell in the + /// input [`Redactions`] collection. Overlap detection per cell is + /// handled by the collection on insert. + async fn redact_tabular( + &mut self, + redactions: Redactions, + ) -> Result<(), Error>; } #[async_trait::async_trait] impl TabularTransform for H { - async fn redact_tabular(&mut self, redactions: &[TabularRedaction]) -> Result<(), Error> { + async fn redact_tabular( + &mut self, + redactions: Redactions, + ) -> Result<(), Error> { tracing::debug!( target: TARGET, redaction_count = redactions.len(), @@ -64,25 +74,14 @@ impl TabularTransform for H { let mut line_numbers: Vec = rows.keys().copied().collect(); line_numbers.sort_unstable(); - // Group redactions by cell, collecting intra-cell replacements. - let mut by_cell: HashMap<(usize, usize), Vec<(usize, usize, String)>> = HashMap::new(); - for r in redactions { - let value = r.output.replacement_value().unwrap_or_default().to_string(); - by_cell - .entry((r.location.row_index, r.location.column_index)) - .or_default() - .push((r.start, r.end, value)); - } - - // For each affected cell, find the text span and apply replacements. let mut edits: Vec> = Vec::new(); - for ((row_idx, col_idx), replacements) in &mut by_cell { + for (cell, mut items) in redactions { // Map row_index -> line_number -> span indices. 
- let line_num = line_numbers.get(*row_idx).ok_or_else(|| { + let line_num = line_numbers.get(cell.row_index).ok_or_else(|| { Error::validation( format!( "row_index {} out of bounds (have {} rows)", - row_idx, + cell.row_index, line_numbers.len() ), "tabular-redact", @@ -94,12 +93,12 @@ impl TabularTransform for H { "tabular-redact", ) })?; - let &span_idx = row_spans.get(*col_idx).ok_or_else(|| { + let &span_idx = row_spans.get(cell.column_index).ok_or_else(|| { Error::validation( format!( "column_index {} out of bounds in row {} (have {} columns)", - col_idx, - row_idx, + cell.column_index, + cell.row_index, row_spans.len() ), "tabular-redact", @@ -110,27 +109,13 @@ impl TabularTransform for H { let content: &str = span.data.as_ref(); // Sort right-to-left so earlier byte offsets stay valid. - replacements.sort_by_key(|r| Reverse(r.0)); - - // Check for overlapping ranges. - for pair in replacements.windows(2) { - let (later_start, _, _) = &pair[0]; - let (earlier_start, earlier_end, _) = &pair[1]; - if *earlier_end > *later_start { - return Err(Error::validation( - format!( - "overlapping redaction ranges: {}..{} and {}..{}", - earlier_start, earlier_end, later_start, pair[0].1, - ), - "tabular-redact", - )); - } - } + items.sort_by_key(|r| Reverse(r.start)); let mut result = content.to_string(); - for (start, end, value) in replacements.iter() { - let s = (*start).min(result.len()); - let e = (*end).min(result.len()); + for r in &items { + let value = r.output.replacement_value().unwrap_or_default(); + let s = r.start.min(result.len()); + let e = r.end.min(result.len()); if s >= e { continue; } @@ -138,7 +123,9 @@ impl TabularTransform for H { return Err(Error::validation( format!( "redaction offset falls mid-character \ - (start={start}, end={end}, len={})", + (start={}, end={}, len={})", + r.start, + r.end, result.len() ), "tabular-redact", @@ -168,7 +155,7 @@ mod tests { use super::*; use crate::handler::{CsvData, CsvHandler}; - use 
crate::transform::TextOutput; + use crate::transform::{ConflictPolicy, TextOutput}; fn handler() -> CsvHandler { CsvHandler::new(CsvData { @@ -182,34 +169,26 @@ mod tests { }) } - fn redaction( - row: usize, - col: usize, - start: usize, - end: usize, - replacement: &str, - ) -> TabularRedaction { - TabularRedaction { - location: TabularLocation { - row_index: row, - column_index: col, - start_offset: None, - end_offset: None, - column_name: None, - sheet_name: None, - }, - start, - end, - output: TextOutput::replace(replacement), - } + fn cell(row: usize, col: usize) -> TabularLocation { + TabularLocation::builder() + .with_row_index(row) + .with_column_index(col) + .build() + .expect("required fields provided") + } + + fn redaction(start: usize, end: usize, replacement: &str) -> TabularRedaction { + TabularRedaction::new(start, end, TextOutput::replace(replacement)) } #[tokio::test] async fn single_cell_redaction() -> Result<()> { let mut h = handler(); + let mut rs = Redactions::new(ConflictPolicy::Reject); // row 1 = first data row (headers are row 0), col 1 = ssn - let r = redaction(1, 1, 0, 11, "[REDACTED]"); - TabularTransform::redact_tabular(&mut h, &[r]).await?; + rs.try_insert(cell(1, 1), redaction(0, 11, "[REDACTED]")) + .unwrap(); + TabularTransform::redact_tabular(&mut h, rs).await?; assert_eq!(h.cell(0, 1), Some("[REDACTED]")); Ok(()) } @@ -217,9 +196,11 @@ mod tests { #[tokio::test] async fn partial_cell_redaction() -> Result<()> { let mut h = handler(); + let mut rs = Redactions::new(ConflictPolicy::Reject); // Redact "Alice" (0..5) in the name cell at row 1, col 0. 
- let r = redaction(1, 0, 0, 5, "[NAME]"); - TabularTransform::redact_tabular(&mut h, &[r]).await?; + rs.try_insert(cell(1, 0), redaction(0, 5, "[NAME]")) + .unwrap(); + TabularTransform::redact_tabular(&mut h, rs).await?; assert_eq!(h.cell(0, 0), Some("[NAME] Smith")); Ok(()) } @@ -227,9 +208,12 @@ mod tests { #[tokio::test] async fn multiple_cells_redacted() -> Result<()> { let mut h = handler(); - let r1 = redaction(1, 1, 0, 11, "[REDACTED]"); - let r2 = redaction(2, 1, 0, 11, "[REDACTED]"); - TabularTransform::redact_tabular(&mut h, &[r1, r2]).await?; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert(cell(1, 1), redaction(0, 11, "[REDACTED]")) + .unwrap(); + rs.try_insert(cell(2, 1), redaction(0, 11, "[REDACTED]")) + .unwrap(); + TabularTransform::redact_tabular(&mut h, rs).await?; assert_eq!(h.cell(0, 1), Some("[REDACTED]")); assert_eq!(h.cell(1, 1), Some("[REDACTED]")); Ok(()) @@ -238,7 +222,8 @@ mod tests { #[tokio::test] async fn empty_redactions_is_noop() -> Result<()> { let mut h = handler(); - TabularTransform::redact_tabular(&mut h, &[]).await?; + let rs: Redactions = Redactions::default(); + TabularTransform::redact_tabular(&mut h, rs).await?; assert_eq!(h.cell(0, 0), Some("Alice Smith")); Ok(()) } @@ -246,20 +231,10 @@ mod tests { #[tokio::test] async fn remove_cell_content() -> Result<()> { let mut h = handler(); - let r = TabularRedaction { - location: TabularLocation { - row_index: 1, - column_index: 1, - start_offset: None, - end_offset: None, - column_name: None, - sheet_name: None, - }, - start: 0, - end: 11, - output: TextOutput::Remove, - }; - TabularTransform::redact_tabular(&mut h, &[r]).await?; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert(cell(1, 1), TabularRedaction::new(0, 11, TextOutput::Remove)) + .unwrap(); + TabularTransform::redact_tabular(&mut h, rs).await?; assert_eq!(h.cell(0, 1), Some("")); Ok(()) } @@ -267,9 +242,11 @@ mod tests { #[tokio::test] async fn header_redaction() -> 
Result<()> { let mut h = handler(); + let mut rs = Redactions::new(ConflictPolicy::Reject); // Row 0 = headers - let r = redaction(0, 1, 0, 3, "[REDACTED]"); - TabularTransform::redact_tabular(&mut h, &[r]).await?; + rs.try_insert(cell(0, 1), redaction(0, 3, "[REDACTED]")) + .unwrap(); + TabularTransform::redact_tabular(&mut h, rs).await?; assert_eq!( h.headers(), Some(["name".to_string(), "[REDACTED]".to_string()].as_slice()) @@ -280,8 +257,9 @@ mod tests { #[tokio::test] async fn row_out_of_bounds() { let mut h = handler(); - let r = redaction(99, 0, 0, 1, "x"); - let err = TabularTransform::redact_tabular(&mut h, &[r]) + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert(cell(99, 0), redaction(0, 1, "x")).unwrap(); + let err = TabularTransform::redact_tabular(&mut h, rs) .await .unwrap_err(); assert!(err.to_string().contains("row_index 99 out of bounds")); @@ -290,8 +268,9 @@ mod tests { #[tokio::test] async fn col_out_of_bounds() { let mut h = handler(); - let r = redaction(0, 99, 0, 1, "x"); - let err = TabularTransform::redact_tabular(&mut h, &[r]) + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert(cell(0, 99), redaction(0, 1, "x")).unwrap(); + let err = TabularTransform::redact_tabular(&mut h, rs) .await .unwrap_err(); assert!(err.to_string().contains("column_index 99 out of bounds")); diff --git a/crates/nvisy-codec/src/transform/text/instruction.rs b/crates/nvisy-codec/src/transform/text/instruction.rs index 8b76a668..bfc0a267 100644 --- a/crates/nvisy-codec/src/transform/text/instruction.rs +++ b/crates/nvisy-codec/src/transform/text/instruction.rs @@ -3,11 +3,17 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -/// A located text redaction: pairs a span identifier and intra-span byte -/// range with a [`TextOutput`] that carries the replacement. -pub struct TextRedaction { - /// Which span this redaction targets. 
- pub span_id: S, +use crate::transform::Mergeable; + +/// A text redaction targeting a byte range within its containing span. +/// +/// Span identity is supplied externally via [`Redactions`] — this +/// struct only carries the intra-span byte range and the replacement +/// output. +/// +/// [`Redactions`]: crate::transform::Redactions +#[derive(Debug, Clone, PartialEq)] +pub struct TextRedaction { /// Byte offset where the redacted region starts within the span. pub start: usize, /// Byte offset where the redacted region ends (exclusive) within the span. @@ -16,6 +22,13 @@ pub struct TextRedaction { pub output: TextOutput, } +impl TextRedaction { + /// Create a new text redaction. + pub fn new(start: usize, end: usize, output: TextOutput) -> Self { + Self { start, end, output } + } +} + /// Text redaction output — the codec only needs to know the replacement string /// or that the span should be removed entirely. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] @@ -28,7 +41,9 @@ pub enum TextOutput { } impl TextOutput { - /// Create a [`Replace`](Self::Replace) output with the given string. + /// Create a [`Replace`] output with the given string. + /// + /// [`Replace`]: Self::Replace pub fn replace(replacement: impl Into) -> Self { Self::Replace { replacement: replacement.into(), @@ -37,8 +52,10 @@ impl TextOutput { /// Returns the text replacement string, regardless of specific method. /// - /// Returns `None` for [`Remove`](Self::Remove) — the caller should - /// treat that as an empty string (span deleted). + /// Returns `None` for [`Remove`] — the caller should treat that as + /// an empty string (span deleted). 
+ /// + /// [`Remove`]: Self::Remove pub fn replacement_value(&self) -> Option<&str> { match self { Self::Replace { replacement } => Some(replacement), @@ -46,3 +63,26 @@ impl TextOutput { } } } + +impl Mergeable for TextRedaction { + fn overlaps(&self, other: &Self) -> bool { + self.start < other.end && other.start < self.end + } + + /// Merge two overlapping text redactions. + /// + /// Returns `Some` only when both share the same [`TextOutput`] — + /// the merged redaction unions the byte ranges. Returns `None` + /// when the outputs differ (e.g. `Replace { "[A]" }` vs `Replace { "[B]" }`), + /// since picking one would silently drop a redaction. + fn try_merge(self, other: Self) -> Option { + if self.output != other.output { + return None; + } + Some(Self { + start: self.start.min(other.start), + end: self.end.max(other.end), + output: self.output, + }) + } +} diff --git a/crates/nvisy-codec/src/transform/text/transform.rs b/crates/nvisy-codec/src/transform/text/transform.rs index b3af3537..0e99c015 100644 --- a/crates/nvisy-codec/src/transform/text/transform.rs +++ b/crates/nvisy-codec/src/transform/text/transform.rs @@ -1,12 +1,15 @@ //! [`TextTransform`] async trait and blanket implementation. //! -//! The blanket implementation groups redactions by location, reads -//! current content via [`TextHandler::text_spans`], applies intra-span -//! byte-offset replacements right-to-left, and writes the results back -//! via [`TextHandler::edit_text`]. +//! The blanket implementation walks the per-span groups in a +//! [`Redactions`] collection, reads current content via +//! [`TextHandler::text_spans`], applies intra-span byte-offset +//! replacements right-to-left (so earlier offsets stay valid), and +//! writes the results back via [`TextHandler::edit_text`]. +//! +//! Overlap detection is owned by [`Redactions`]; this transform +//! trusts that ranges within a span do not overlap. 
use std::cmp::Reverse; -use std::collections::HashMap; use futures::StreamExt; use nvisy_core::Error; @@ -15,6 +18,7 @@ use nvisy_ontology::entity::TextLocation; use super::instruction::TextRedaction; use crate::document::{Span, SpanStream}; use crate::handler::{TextData, TextHandler}; +use crate::transform::Redactions; const TARGET: &str = "nvisy_codec::transform::text"; @@ -23,13 +27,13 @@ const TARGET: &str = "nvisy_codec::transform::text"; pub trait TextTransform: TextHandler { /// Apply a batch of text redactions, mutating in place. /// - /// Each [`TextRedaction`] identifies a span by [`TextLocation`] and - /// an intra-span byte range with a replacement value. Replacements - /// within each span are applied right-to-left so byte offsets - /// remain valid. + /// Redactions are grouped by [`TextLocation`] span in the input + /// [`Redactions`] collection. The implementation assumes ranges + /// within a single span do not overlap — the [`Redactions`] + /// collection enforces this on insert. async fn redact_text( &mut self, - redactions: &[TextRedaction], + redactions: Redactions, ) -> Result<(), Error>; } @@ -37,7 +41,7 @@ pub trait TextTransform: TextHandler { impl TextTransform for H { async fn redact_text( &mut self, - redactions: &[TextRedaction], + redactions: Redactions, ) -> Result<(), Error> { tracing::debug!( target: TARGET, @@ -48,48 +52,27 @@ impl TextTransform for H { return Ok(()); } - // Group redactions by span start offset (each span has a unique start). - let mut by_span: HashMap> = HashMap::new(); - for r in redactions { - let value = r.output.replacement_value().unwrap_or_default().to_string(); - by_span - .entry(r.span_id.start_offset) - .or_default() - .push((r.start, r.end, value)); - } - - // Read current content for affected spans. + // Read current content for all spans, then walk each affected span. 
let all_spans: Vec<_> = self.text_spans().await.collect().await; let mut edits: Vec> = Vec::new(); - for span in &all_spans { - let Some(replacements) = by_span.get_mut(&span.id.start_offset) else { + for (span_loc, mut items) in redactions { + let Some(span) = all_spans + .iter() + .find(|s| s.id.start_offset == span_loc.start_offset) + else { continue; }; let content: &str = span.data.as_ref(); // Sort right-to-left so earlier byte offsets stay valid. - replacements.sort_by_key(|r| Reverse(r.0)); - - // Check for overlapping ranges (sorted descending by start). - for pair in replacements.windows(2) { - let (later_start, _, _) = &pair[0]; // higher start - let (earlier_start, earlier_end, _) = &pair[1]; // lower start - if *earlier_end > *later_start { - return Err(Error::validation( - format!( - "overlapping redaction ranges: {}..{} and {}..{}", - earlier_start, earlier_end, later_start, pair[0].1, - ), - "text-redact", - )); - } - } + items.sort_by_key(|r| Reverse(r.start)); let mut result = content.to_string(); - for (start, end, value) in replacements.iter() { - let s = (*start).min(result.len()); - let e = (*end).min(result.len()); + for r in &items { + let value = r.output.replacement_value().unwrap_or_default(); + let s = r.start.min(result.len()); + let e = r.end.min(result.len()); if s >= e { continue; } @@ -97,7 +80,9 @@ impl TextTransform for H { return Err(Error::validation( format!( "redaction offset falls mid-character \ - (start={start}, end={end}, len={})", + (start={}, end={}, len={})", + r.start, + r.end, result.len() ), "text-redact", @@ -127,7 +112,7 @@ mod tests { use super::*; use crate::handler::TxtHandler; - use crate::transform::TextOutput; + use crate::transform::{ConflictPolicy, TextOutput}; fn handler(text: &str) -> TxtHandler { let trailing_newline = text.ends_with('\n'); @@ -135,45 +120,21 @@ mod tests { TxtHandler::new(lines, trailing_newline) } - /// Build a text redaction targeting line `line_idx` at `start..end`. 
- async fn replace_at( - h: &TxtHandler, - line_idx: usize, - start: usize, - end: usize, - replacement: &str, - ) -> TextRedaction { - let spans: Vec<_> = h.text_spans().await.collect().await; - TextRedaction { - span_id: spans[line_idx].id.clone(), - start, - end, - output: TextOutput::Replace { - replacement: replacement.to_string(), - }, - } - } - - async fn remove_at( - h: &TxtHandler, - line_idx: usize, - start: usize, - end: usize, - ) -> TextRedaction { + async fn span_for(h: &TxtHandler, line_idx: usize) -> TextLocation { let spans: Vec<_> = h.text_spans().await.collect().await; - TextRedaction { - span_id: spans[line_idx].id.clone(), - start, - end, - output: TextOutput::Remove, - } + spans[line_idx].id.clone() } #[tokio::test] async fn single_span_single_redaction() -> Result<()> { let mut h = handler("hello world\n"); - let r = replace_at(&h, 0, 0, 5, "[NAME]").await; - TextTransform::redact_text(&mut h, &[r]).await?; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + span_for(&h, 0).await, + TextRedaction::new(0, 5, TextOutput::replace("[NAME]")), + ) + .unwrap(); + TextTransform::redact_text(&mut h, rs).await?; let spans: Vec<_> = h.text_spans().await.collect().await; assert_eq!(spans[0].data, "[NAME] world"); @@ -183,9 +144,16 @@ mod tests { #[tokio::test] async fn multiple_redactions_within_one_span() -> Result<()> { let mut h = handler("Alice met Bob\n"); - let r1 = replace_at(&h, 0, 0, 5, "[X]").await; - let r2 = replace_at(&h, 0, 10, 13, "[Y]").await; - TextTransform::redact_text(&mut h, &[r1, r2]).await?; + let id = span_for(&h, 0).await; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + id.clone(), + TextRedaction::new(0, 5, TextOutput::replace("[X]")), + ) + .unwrap(); + rs.try_insert(id, TextRedaction::new(10, 13, TextOutput::replace("[Y]"))) + .unwrap(); + TextTransform::redact_text(&mut h, rs).await?; let spans: Vec<_> = h.text_spans().await.collect().await; assert_eq!(spans[0].data, "[X] met 
[Y]"); @@ -195,8 +163,13 @@ mod tests { #[tokio::test] async fn redaction_spanning_entire_content_replace() -> Result<()> { let mut h = handler("secret\n"); - let r = replace_at(&h, 0, 0, 6, "[REDACTED]").await; - TextTransform::redact_text(&mut h, &[r]).await?; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + span_for(&h, 0).await, + TextRedaction::new(0, 6, TextOutput::replace("[REDACTED]")), + ) + .unwrap(); + TextTransform::redact_text(&mut h, rs).await?; let spans: Vec<_> = h.text_spans().await.collect().await; assert_eq!(spans[0].data, "[REDACTED]"); @@ -206,8 +179,13 @@ mod tests { #[tokio::test] async fn redaction_spanning_entire_content_remove() -> Result<()> { let mut h = handler("secret\n"); - let r = remove_at(&h, 0, 0, 6).await; - TextTransform::redact_text(&mut h, &[r]).await?; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + span_for(&h, 0).await, + TextRedaction::new(0, 6, TextOutput::Remove), + ) + .unwrap(); + TextTransform::redact_text(&mut h, rs).await?; let spans: Vec<_> = h.text_spans().await.collect().await; assert_eq!(spans[0].data, ""); @@ -217,7 +195,8 @@ mod tests { #[tokio::test] async fn empty_redactions_is_noop() -> Result<()> { let mut h = handler("unchanged\n"); - TextTransform::redact_text(&mut h, &[]).await?; + let rs: Redactions = Redactions::default(); + TextTransform::redact_text(&mut h, rs).await?; let spans: Vec<_> = h.text_spans().await.collect().await; assert_eq!(spans[0].data, "unchanged"); @@ -227,9 +206,18 @@ mod tests { #[tokio::test] async fn multiple_spans_with_separate_redactions() -> Result<()> { let mut h = handler("hello\nworld\n"); - let r1 = replace_at(&h, 0, 0, 5, "[A]").await; - let r2 = replace_at(&h, 1, 0, 5, "[B]").await; - TextTransform::redact_text(&mut h, &[r1, r2]).await?; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + span_for(&h, 0).await, + TextRedaction::new(0, 5, TextOutput::replace("[A]")), + ) + .unwrap(); + 
rs.try_insert( + span_for(&h, 1).await, + TextRedaction::new(0, 5, TextOutput::replace("[B]")), + ) + .unwrap(); + TextTransform::redact_text(&mut h, rs).await?; let spans: Vec<_> = h.text_spans().await.collect().await; assert_eq!(spans[0].data, "[A]"); diff --git a/crates/nvisy-engine/src/operation/envelope/document.rs b/crates/nvisy-engine/src/operation/envelope/document.rs index b2e38351..9bce0f1c 100644 --- a/crates/nvisy-engine/src/operation/envelope/document.rs +++ b/crates/nvisy-engine/src/operation/envelope/document.rs @@ -8,7 +8,9 @@ use std::fmt; use nvisy_codec::handler::{AudioData, ImageData, TextData}; -use nvisy_codec::transform::{AudioRedaction, ImageRedaction, TabularRedaction, TextRedaction}; +use nvisy_codec::transform::{ + AudioRedaction, ImageRedaction, Redactions, TabularRedaction, TextRedaction, +}; use nvisy_codec::{ContentHandle, Span, SpanStream}; use nvisy_core::Error; use nvisy_core::content::{ContentData, ContentMetadata, ContentSource}; @@ -160,7 +162,7 @@ impl Document { /// Apply a batch of text redactions to the document. pub async fn apply_text_redactions( &mut self, - redactions: &[TextRedaction], + redactions: Redactions, ) -> Result<(), Error> { self.handle.apply_text_redactions(redactions).await } @@ -168,7 +170,7 @@ impl Document { /// Apply a batch of image redactions to the document. pub async fn apply_image_redactions( &mut self, - redactions: &[ImageRedaction], + redactions: Redactions, ) -> Result<(), Error> { self.handle.apply_image_redactions(redactions).await } @@ -176,7 +178,7 @@ impl Document { /// Apply a batch of audio redactions to the document. pub async fn apply_audio_redactions( &mut self, - redactions: &[AudioRedaction], + redactions: Redactions, ) -> Result<(), Error> { self.handle.apply_audio_redactions(redactions).await } @@ -184,7 +186,7 @@ impl Document { /// Apply a batch of tabular redactions to the document. 
pub async fn apply_tabular_redactions( &mut self, - redactions: &[TabularRedaction], + redactions: Redactions, ) -> Result<(), Error> { use nvisy_codec::transform::TabularTransform; match &mut self.handle { diff --git a/crates/nvisy-engine/src/operation/redaction/apply.rs b/crates/nvisy-engine/src/operation/redaction/apply.rs index fd17f51d..f60360b9 100644 --- a/crates/nvisy-engine/src/operation/redaction/apply.rs +++ b/crates/nvisy-engine/src/operation/redaction/apply.rs @@ -9,7 +9,8 @@ use std::collections::HashMap; use nvisy_codec::transform::{ - AudioOutput, AudioRedaction, ImageOutput, ImageRedaction, TextOutput, TextRedaction, + AudioOutput, AudioRedaction, ConflictPolicy, ImageOutput, ImageRedaction, Redactions, + TextOutput, TextRedaction, }; use nvisy_ontology::entity::{ AudioLocation, Entity, EntityKind, ImageLocation, Location, TextLocation, @@ -41,32 +42,28 @@ impl<'a> RedactionApplicator<'a> { /// Build and apply all redaction instructions. pub async fn apply(mut self) -> nvisy_core::Result<()> { - let text = self.build_text_redactions().await; - let image = self.build_image_redactions(); - let audio = self.build_audio_redactions(); + let text = self.build_text_redactions().await?; + let image = self.build_image_redactions()?; + let audio = self.build_audio_redactions()?; if !text.is_empty() { - self.envelope.document.apply_text_redactions(&text).await?; + self.envelope.document.apply_text_redactions(text).await?; } if !image.is_empty() { - self.envelope - .document - .apply_image_redactions(&image) - .await?; + self.envelope.document.apply_image_redactions(image).await?; } if !audio.is_empty() { - self.envelope - .document - .apply_audio_redactions(&audio) - .await?; + self.envelope.document.apply_audio_redactions(audio).await?; } Ok(()) } - async fn build_text_redactions(&mut self) -> Vec> { + async fn build_text_redactions( + &mut self, + ) -> nvisy_core::Result> { let entity_map = Self::entity_map(&self.envelope.audit.entities); - let mut 
redactions = Vec::new(); + let mut redactions = Redactions::new(ConflictPolicy::Reject); for i in 0..self.envelope.audit.entries.len() { let record = &self.envelope.audit.entries[i]; @@ -113,23 +110,26 @@ impl<'a> RedactionApplicator<'a> { "built text redaction instruction", ); - // The entity location directly identifies the byte range - // to redact. start/end are intra-span offsets (0..len for - // a full value replacement within the containing span). - redactions.push(TextRedaction { - span_id: loc.clone(), - start: loc.start_offset, - end: loc.end_offset, - output, - }); + // The entity location identifies the span; start/end are + // intra-span offsets (0..len for full value replacement). + redactions + .try_insert( + loc.clone(), + TextRedaction::new(loc.start_offset, loc.end_offset, output), + ) + .map_err(|e| { + nvisy_core::Error::validation(e.to_string(), "redaction-apply-text") + })?; } - redactions + Ok(redactions) } - fn build_image_redactions(&mut self) -> Vec> { + fn build_image_redactions( + &mut self, + ) -> nvisy_core::Result> { let entity_map = Self::entity_map(&self.envelope.audit.entities); - let mut redactions = Vec::new(); + let mut redactions = Redactions::new(ConflictPolicy::Reject); for i in 0..self.envelope.audit.entries.len() { let record = &self.envelope.audit.entries[i]; @@ -174,19 +174,21 @@ impl<'a> RedactionApplicator<'a> { "built image redaction instruction", ); - redactions.push(ImageRedaction { - span_id: loc.clone(), - bounding_box: loc.bounding_box, - output, - }); + redactions + .try_insert(loc.clone(), ImageRedaction::new(loc.bounding_box, output)) + .map_err(|e| { + nvisy_core::Error::validation(e.to_string(), "redaction-apply-image") + })?; } - redactions + Ok(redactions) } - fn build_audio_redactions(&mut self) -> Vec> { + fn build_audio_redactions( + &mut self, + ) -> nvisy_core::Result> { let entity_map = Self::entity_map(&self.envelope.audit.entities); - let mut redactions = Vec::new(); + let mut redactions = 
Redactions::new(ConflictPolicy::Reject); for i in 0..self.envelope.audit.entries.len() { let record = &self.envelope.audit.entries[i]; @@ -230,14 +232,14 @@ impl<'a> RedactionApplicator<'a> { "built audio redaction instruction", ); - redactions.push(AudioRedaction { - span_id: loc.clone(), - time_span: loc.time_span, - output, - }); + redactions + .try_insert(loc.clone(), AudioRedaction::new(loc.time_span, output)) + .map_err(|e| { + nvisy_core::Error::validation(e.to_string(), "redaction-apply-audio") + })?; } - redactions + Ok(redactions) } /// Build a lookup map from entity UUID to entity reference. From e675e26e9de3d4d1baf533b47a7185b663c908d8 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 18 May 2026 23:16:45 +0200 Subject: [PATCH 2/8] chore: import nvisy_core::{Error, Result} instead of qualifying inline Sweeps the workspace for inline-qualified `nvisy_core::Result<...>` and `nvisy_core::Error::...` uses and adds proper `use nvisy_core::{...};` imports following the existing convention used across other engine files. 
Affected: - nvisy-engine/src/operation/redaction/apply.rs - nvisy-engine/src/operation/mod.rs - nvisy-engine/src/utility/encryption/provider.rs - nvisy-provider/src/http/mod.rs Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/nvisy-engine/src/operation/mod.rs | 4 +++- .../src/operation/redaction/apply.rs | 15 ++++++++------- .../src/utility/encryption/provider.rs | 2 +- crates/nvisy-provider/src/http/mod.rs | 19 ++++++++++--------- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/crates/nvisy-engine/src/operation/mod.rs b/crates/nvisy-engine/src/operation/mod.rs index e1cf1026..2973fd7c 100644 --- a/crates/nvisy-engine/src/operation/mod.rs +++ b/crates/nvisy-engine/src/operation/mod.rs @@ -20,6 +20,8 @@ mod import_file; pub(crate) mod redaction; mod validation; +use nvisy_core::Result; + pub(crate) use self::deduplication::DeduplicationOp; pub(crate) use self::detection::{EntityRecognitionOp, PatternRecognitionOp}; pub use self::envelope::{Document, DocumentEnvelope}; @@ -40,5 +42,5 @@ pub trait Operation { fn execute( &self, envelope: &mut DocumentEnvelope, - ) -> impl Future> + Send; + ) -> impl Future> + Send; } diff --git a/crates/nvisy-engine/src/operation/redaction/apply.rs b/crates/nvisy-engine/src/operation/redaction/apply.rs index f60360b9..7c7756de 100644 --- a/crates/nvisy-engine/src/operation/redaction/apply.rs +++ b/crates/nvisy-engine/src/operation/redaction/apply.rs @@ -12,6 +12,7 @@ use nvisy_codec::transform::{ AudioOutput, AudioRedaction, ConflictPolicy, ImageOutput, ImageRedaction, Redactions, TextOutput, TextRedaction, }; +use nvisy_core::{Error, Result}; use nvisy_ontology::entity::{ AudioLocation, Entity, EntityKind, ImageLocation, Location, TextLocation, }; @@ -41,7 +42,7 @@ impl<'a> RedactionApplicator<'a> { } /// Build and apply all redaction instructions. 
- pub async fn apply(mut self) -> nvisy_core::Result<()> { + pub async fn apply(mut self) -> Result<()> { let text = self.build_text_redactions().await?; let image = self.build_image_redactions()?; let audio = self.build_audio_redactions()?; @@ -61,7 +62,7 @@ impl<'a> RedactionApplicator<'a> { async fn build_text_redactions( &mut self, - ) -> nvisy_core::Result> { + ) -> Result> { let entity_map = Self::entity_map(&self.envelope.audit.entities); let mut redactions = Redactions::new(ConflictPolicy::Reject); @@ -118,7 +119,7 @@ impl<'a> RedactionApplicator<'a> { TextRedaction::new(loc.start_offset, loc.end_offset, output), ) .map_err(|e| { - nvisy_core::Error::validation(e.to_string(), "redaction-apply-text") + Error::validation(e.to_string(), "redaction-apply-text") })?; } @@ -127,7 +128,7 @@ impl<'a> RedactionApplicator<'a> { fn build_image_redactions( &mut self, - ) -> nvisy_core::Result> { + ) -> Result> { let entity_map = Self::entity_map(&self.envelope.audit.entities); let mut redactions = Redactions::new(ConflictPolicy::Reject); @@ -177,7 +178,7 @@ impl<'a> RedactionApplicator<'a> { redactions .try_insert(loc.clone(), ImageRedaction::new(loc.bounding_box, output)) .map_err(|e| { - nvisy_core::Error::validation(e.to_string(), "redaction-apply-image") + Error::validation(e.to_string(), "redaction-apply-image") })?; } @@ -186,7 +187,7 @@ impl<'a> RedactionApplicator<'a> { fn build_audio_redactions( &mut self, - ) -> nvisy_core::Result> { + ) -> Result> { let entity_map = Self::entity_map(&self.envelope.audit.entities); let mut redactions = Redactions::new(ConflictPolicy::Reject); @@ -235,7 +236,7 @@ impl<'a> RedactionApplicator<'a> { redactions .try_insert(loc.clone(), AudioRedaction::new(loc.time_span, output)) .map_err(|e| { - nvisy_core::Error::validation(e.to_string(), "redaction-apply-audio") + Error::validation(e.to_string(), "redaction-apply-audio") })?; } diff --git a/crates/nvisy-engine/src/utility/encryption/provider.rs 
b/crates/nvisy-engine/src/utility/encryption/provider.rs index ff8395b7..36e47dea 100644 --- a/crates/nvisy-engine/src/utility/encryption/provider.rs +++ b/crates/nvisy-engine/src/utility/encryption/provider.rs @@ -101,7 +101,7 @@ impl StaticKeyProvider { impl KeyProvider for StaticKeyProvider { fn resolve(&self, key_id: &str) -> Result { self.keys.get(key_id).cloned().ok_or_else(|| { - nvisy_core::Error::validation( + Error::validation( format!("unknown key_id: {key_id}"), "StaticKeyProvider::resolve", ) diff --git a/crates/nvisy-provider/src/http/mod.rs b/crates/nvisy-provider/src/http/mod.rs index 66cdd832..1d68656a 100644 --- a/crates/nvisy-provider/src/http/mod.rs +++ b/crates/nvisy-provider/src/http/mod.rs @@ -7,6 +7,7 @@ use std::fmt; use std::time::Duration; use derive_more::Deref; +use nvisy_core::{Error, Result}; use reqwest_middleware::{ClientBuilder, ClientWithMiddleware, RequestBuilder}; pub use self::config::HttpConfig; @@ -29,7 +30,7 @@ impl HttpClient { /// /// Returns an error if the underlying `reqwest::Client` cannot be built /// (e.g. TLS backend initialisation failure). - pub fn new(config: &HttpConfig) -> nvisy_core::Result { + pub fn new(config: &HttpConfig) -> Result { tracing::debug!( target: TARGET, max_retries = config.max_retries, @@ -47,7 +48,7 @@ impl HttpClient { .pool_idle_timeout(Duration::from_secs(config.idle_timeout_secs)) .build() .map_err(|e| { - nvisy_core::Error::runtime( + Error::runtime( format!("failed to build HTTP client: {e}"), "http", false, @@ -87,31 +88,31 @@ pub trait RequestBuilderExt { fn send_and_check( self, provider: &str, - ) -> impl Future> + Send; + ) -> impl Future> + Send; /// Send the request, check status, and parse the JSON response body. 
fn send_and_parse( self, provider: &str, - ) -> impl Future> + Send; + ) -> impl Future> + Send; } impl RequestBuilderExt for RequestBuilder { async fn send_and_check( self, provider: &str, - ) -> nvisy_core::Result { + ) -> Result { let resp = self .send() .await - .map_err(|e| nvisy_core::Error::connection(e.to_string(), provider, true))?; + .map_err(|e| Error::connection(e.to_string(), provider, true))?; let status = resp.status(); if status.is_success() { return Ok(resp); } let body = resp.text().await.unwrap_or_default(); - Err(nvisy_core::Error::connection( + Err(Error::connection( format!("{provider} returned {status}: {body}"), provider, status.is_server_error(), @@ -121,10 +122,10 @@ impl RequestBuilderExt for RequestBuilder { async fn send_and_parse( self, provider: &str, - ) -> nvisy_core::Result { + ) -> Result { let resp = self.send_and_check(provider).await?; resp.json().await.map_err(|e| { - nvisy_core::Error::runtime(format!("{provider} JSON parse error: {e}"), provider, false) + Error::runtime(format!("{provider} JSON parse error: {e}"), provider, false) }) } } From 3970764d39c13b80c18a7463df4e467e36c7b8ec Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 18 May 2026 23:25:26 +0200 Subject: [PATCH 3/8] refactor(codec): make *Redaction fields pub(crate) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The four redaction payload structs (TextRedaction, ImageRedaction, AudioRedaction, TabularRedaction) are constructed via ::new() and their fields are only read inside nvisy-codec (by transforms and by Mergeable impls). Tightens the surface to pub(crate) — external crates already use ::new() exclusively. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/nvisy-codec/src/transform/audio/instruction.rs | 4 ++-- crates/nvisy-codec/src/transform/image/instruction.rs | 4 ++-- crates/nvisy-codec/src/transform/tabular/instruction.rs | 6 +++--- crates/nvisy-codec/src/transform/text/instruction.rs | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/nvisy-codec/src/transform/audio/instruction.rs b/crates/nvisy-codec/src/transform/audio/instruction.rs index cb2396f6..9a267fee 100644 --- a/crates/nvisy-codec/src/transform/audio/instruction.rs +++ b/crates/nvisy-codec/src/transform/audio/instruction.rs @@ -15,9 +15,9 @@ use crate::transform::Mergeable; #[derive(Debug, Clone, PartialEq)] pub struct AudioRedaction { /// Time interval of the segment to redact. - pub time_span: TimeSpan, + pub(crate) time_span: TimeSpan, /// The redaction output that determines the rendering method. - pub output: AudioOutput, + pub(crate) output: AudioOutput, } impl AudioRedaction { diff --git a/crates/nvisy-codec/src/transform/image/instruction.rs b/crates/nvisy-codec/src/transform/image/instruction.rs index 19edba35..2b3a38ca 100644 --- a/crates/nvisy-codec/src/transform/image/instruction.rs +++ b/crates/nvisy-codec/src/transform/image/instruction.rs @@ -15,9 +15,9 @@ use crate::transform::Mergeable; #[derive(Debug, Clone, PartialEq)] pub struct ImageRedaction { /// Bounding box of the region to redact within the span. - pub bounding_box: BoundingBox, + pub(crate) bounding_box: BoundingBox, /// The redaction output that determines the rendering method. 
- pub output: ImageOutput, + pub(crate) output: ImageOutput, } impl ImageRedaction { diff --git a/crates/nvisy-codec/src/transform/tabular/instruction.rs b/crates/nvisy-codec/src/transform/tabular/instruction.rs index f0432a80..e41a1da4 100644 --- a/crates/nvisy-codec/src/transform/tabular/instruction.rs +++ b/crates/nvisy-codec/src/transform/tabular/instruction.rs @@ -18,11 +18,11 @@ use crate::transform::{Mergeable, TextOutput}; #[derive(Debug, Clone, PartialEq)] pub struct TabularRedaction { /// Byte offset where the redacted region starts within the cell value. - pub start: usize, + pub(crate) start: usize, /// Byte offset where the redacted region ends (exclusive) within the cell value. - pub end: usize, + pub(crate) end: usize, /// The redaction output that carries the replacement value. - pub output: TextOutput, + pub(crate) output: TextOutput, } impl TabularRedaction { diff --git a/crates/nvisy-codec/src/transform/text/instruction.rs b/crates/nvisy-codec/src/transform/text/instruction.rs index bfc0a267..f97e55dd 100644 --- a/crates/nvisy-codec/src/transform/text/instruction.rs +++ b/crates/nvisy-codec/src/transform/text/instruction.rs @@ -15,11 +15,11 @@ use crate::transform::Mergeable; #[derive(Debug, Clone, PartialEq)] pub struct TextRedaction { /// Byte offset where the redacted region starts within the span. - pub start: usize, + pub(crate) start: usize, /// Byte offset where the redacted region ends (exclusive) within the span. - pub end: usize, + pub(crate) end: usize, /// The redaction output that carries the replacement value. 
- pub output: TextOutput, + pub(crate) output: TextOutput, } impl TextRedaction { From 22be651199786c47486a5f201e9655b06e6f7afd Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 19 May 2026 02:10:28 +0200 Subject: [PATCH 4/8] refactor(codec)!: reshape handler API around locations and read/redact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces Span + SpanStream with Located + LocationStream. Handler capability traits now expose locations() (cheap identity-only streams), read(&L) -> Option<*Data> (typed per-modality fetch), and redact(Redactions) -> Result<()> (direct batch application). ContentHandle gains typed read_text/read_image/read_audio in place of the modality-erased value_at(&Location) -> Option. Tabular handlers (CSV, XLSX) move into handler/text since they implement TextHandler. The *Transform blanket-impl traits are removed; helpers will live alongside the per-modality instruction types. Concrete handlers still implement the old API and do not compile after this commit — follow-up commits migrate them and the engine callers. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/nvisy-codec/src/document/located.rs | 61 ++++ crates/nvisy-codec/src/document/mod.rs | 91 +++--- crates/nvisy-codec/src/document/span.rs | 66 ----- crates/nvisy-codec/src/document/stream.rs | 38 ++- .../src/handler/audio/audio_handler.rs | 50 +--- crates/nvisy-codec/src/handler/audio/mod.rs | 31 +- .../src/handler/image/image_handler.rs | 63 +--- crates/nvisy-codec/src/handler/image/mod.rs | 31 +- crates/nvisy-codec/src/handler/mod.rs | 4 +- .../src/handler/rich/rich_handler.rs | 90 +++--- crates/nvisy-codec/src/handler/tabular/mod.rs | 20 -- .../handler/{tabular => text}/csv_handler.rs | 0 .../handler/{tabular => text}/csv_loader.rs | 0 crates/nvisy-codec/src/handler/text/mod.rs | 53 ++-- .../src/handler/text/text_handler.rs | 59 +--- .../handler/{tabular => text}/xlsx_handler.rs | 0 .../handler/{tabular => text}/xlsx_loader.rs | 0 crates/nvisy-codec/src/lib.rs | 2 +- crates/nvisy-codec/src/transform/audio/mod.rs | 2 - .../src/transform/audio/transform.rs | 45 --- crates/nvisy-codec/src/transform/image/mod.rs | 3 +- .../src/transform/image/transform.rs | 109 ------- crates/nvisy-codec/src/transform/mod.rs | 23 +- .../nvisy-codec/src/transform/tabular/mod.rs | 2 - .../src/transform/tabular/transform.rs | 278 ------------------ crates/nvisy-codec/src/transform/text/mod.rs | 2 - .../src/transform/text/transform.rs | 227 -------------- 27 files changed, 295 insertions(+), 1055 deletions(-) create mode 100644 crates/nvisy-codec/src/document/located.rs delete mode 100644 crates/nvisy-codec/src/document/span.rs delete mode 100644 crates/nvisy-codec/src/handler/tabular/mod.rs rename crates/nvisy-codec/src/handler/{tabular => text}/csv_handler.rs (100%) rename crates/nvisy-codec/src/handler/{tabular => text}/csv_loader.rs (100%) rename crates/nvisy-codec/src/handler/{tabular => text}/xlsx_handler.rs (100%) rename crates/nvisy-codec/src/handler/{tabular => text}/xlsx_loader.rs (100%) delete mode 100644 
crates/nvisy-codec/src/transform/audio/transform.rs delete mode 100644 crates/nvisy-codec/src/transform/image/transform.rs delete mode 100644 crates/nvisy-codec/src/transform/tabular/transform.rs delete mode 100644 crates/nvisy-codec/src/transform/text/transform.rs diff --git a/crates/nvisy-codec/src/document/located.rs b/crates/nvisy-codec/src/document/located.rs new file mode 100644 index 00000000..37eb781e --- /dev/null +++ b/crates/nvisy-codec/src/document/located.rs @@ -0,0 +1,61 @@ +//! [`Located`]: a location paired with its production-time provenance. + +use nvisy_core::content::ContentSource; + +/// A location tagged with the [`ContentSource`] of the handler that +/// produced it. +/// +/// Returned by handler `locations()` streams so callers can attribute +/// each location to a specific content artifact. The location itself +/// remains the structural identity used as a key in +/// [`Redactions`] — the source is metadata about how the location +/// was produced, not part of its identity. +/// +/// [`Redactions`]: crate::transform::Redactions +#[derive(Debug, Clone, PartialEq)] +pub struct Located { + /// The handler-level source that produced this location. + pub source: ContentSource, + /// The structural location within the handler's data model. + pub location: L, +} + +impl Located { + /// Create a new located location. + pub fn new(source: ContentSource, location: L) -> Self { + Self { source, location } + } + + /// Discard the source, returning the underlying location. + pub fn into_location(self) -> L { + self.location + } + + /// Transform the inner location, keeping the source unchanged. 
+ pub fn map(self, f: impl FnOnce(L) -> T) -> Located { + Located { + source: self.source, + location: f(self.location), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn map_transforms_inner() { + let src = ContentSource::new(); + let l = Located::new(src, 7u32); + let mapped = l.map(|n| n.to_string()); + assert_eq!(mapped.location, "7"); + assert_eq!(mapped.source, src); + } + + #[test] + fn into_location_discards_source() { + let l = Located::new(ContentSource::new(), 42u32); + assert_eq!(l.into_location(), 42); + } +} diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index 22866f58..3133c6b7 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -1,21 +1,20 @@ //! Type-erased content handle for all supported formats. -mod span; +mod located; mod stream; use std::fmt; use derive_more::{From, IsVariant, TryInto}; -use futures::StreamExt; use nvisy_core::Error; use nvisy_core::content::{Content, ContentData, ContentSource}; use nvisy_core::media::{ AudioFormat, DocumentType, ImageFormat, SpreadsheetFormat, TextFormat, WordFormat, }; -use nvisy_ontology::entity::{AudioLocation, ImageLocation, Location, TextLocation}; +use nvisy_ontology::entity::{AudioLocation, ImageLocation, TextLocation}; -pub use self::span::Span; -pub use self::stream::SpanStream; +pub use self::located::Located; +pub use self::stream::LocationStream; use crate::handler::{ AudioData, AudioHandler, BoxedAudioHandler, BoxedImageHandler, BoxedRichHandler, BoxedTextHandler, CsvLoader, CsvParams, Handler, ImageData, ImageHandler, JpegLoader, @@ -29,7 +28,7 @@ use crate::transform::{AudioRedaction, ImageRedaction, Redactions, TextRedaction /// /// Groups documents into four modality families: /// - **Text**: plain text, CSV, JSON, HTML, XLSX -/// - **Image**: PNG, JPEG +/// - **Image**: PNG, JPEG, TIFF /// - **Audio**: WAV, MP3 /// - **Rich**: PDF, DOCX (multi-modal documents with text + 
images) #[derive(From, IsVariant, TryInto)] @@ -79,54 +78,57 @@ impl ContentHandle { } } - /// Stream text spans from text or rich documents. - pub async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { + /// Stream text locations from text or rich documents. + pub fn text_locations(&self) -> LocationStream<'_, TextLocation> { match self { - Self::Text(h) => h.text_spans().await, - Self::Rich(h) => h.text_spans().await, - _ => SpanStream::new(futures::stream::empty()), + Self::Text(h) => h.locations(), + Self::Rich(h) => TextHandler::locations(h), + Self::Image(_) | Self::Audio(_) => LocationStream::empty(), } } - /// Stream image spans from image or rich documents. - pub async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData> { + /// Stream image locations from image or rich documents. + pub fn image_locations(&self) -> LocationStream<'_, ImageLocation> { match self { - Self::Image(h) => h.image_spans().await, - Self::Rich(h) => h.image_spans().await, - Self::Text(_) | Self::Audio(_) => SpanStream::new(futures::stream::empty()), + Self::Image(h) => h.locations(), + Self::Rich(h) => ImageHandler::locations(h), + Self::Text(_) | Self::Audio(_) => LocationStream::empty(), } } - /// Stream audio spans from audio documents. - pub async fn audio_spans(&self) -> SpanStream<'_, AudioLocation, AudioData> { + /// Stream audio locations from audio documents. + pub fn audio_locations(&self) -> LocationStream<'_, AudioLocation> { match self { - Self::Audio(h) => h.audio_spans().await, - _ => SpanStream::new(futures::stream::empty()), + Self::Audio(h) => h.locations(), + _ => LocationStream::empty(), } } - /// Collect all text spans into a `Vec`. - pub async fn collect_text_spans(&self) -> Vec> { - self.text_spans().await.collect().await - } - - /// Collect all image spans into a `Vec`. - pub async fn collect_image_spans(&self) -> Vec> { - self.image_spans().await.collect().await + /// Read text data at the given location. 
+ /// + /// Returns `None` if the location is out of bounds or the handle + /// does not expose text content. + pub async fn read_text(&self, location: &TextLocation) -> Option { + match self { + Self::Text(h) => h.read(location).await, + Self::Rich(h) => TextHandler::read(h, location).await, + Self::Image(_) | Self::Audio(_) => None, + } } - /// Collect all audio spans into a `Vec`. - pub async fn collect_audio_spans(&self) -> Vec> { - self.audio_spans().await.collect().await + /// Read image data at the given location. + pub async fn read_image(&self, location: &ImageLocation) -> Option { + match self { + Self::Image(h) => h.read(location).await, + Self::Rich(h) => ImageHandler::read(h, location).await, + Self::Text(_) | Self::Audio(_) => None, + } } - /// Extract the value at the given location, dispatching by modality. - /// - /// Returns the text/data at the location if available. - pub async fn value_at(&self, location: &Location) -> Option { - match (self, location) { - (Self::Text(h), Location::Text(loc)) => h.value_at(loc).await, - (Self::Rich(h), Location::Text(loc)) => TextHandler::value_at(h, loc).await, + /// Read audio data at the given location. 
+ pub async fn read_audio(&self, location: &AudioLocation) -> Option { + match self { + Self::Audio(h) => h.read(location).await, _ => None, } } @@ -136,10 +138,9 @@ impl ContentHandle { &mut self, redactions: Redactions, ) -> Result<(), Error> { - use crate::transform::TextTransform; match self { - Self::Text(h) => h.redact_text(redactions).await, - Self::Rich(h) => h.redact_text(redactions).await, + Self::Text(h) => h.redact(redactions).await, + Self::Rich(h) => TextHandler::redact(h, redactions).await, Self::Image(_) | Self::Audio(_) => Ok(()), } } @@ -149,10 +150,9 @@ impl ContentHandle { &mut self, redactions: Redactions, ) -> Result<(), Error> { - use crate::transform::ImageTransform; match self { - Self::Image(h) => h.redact_images(redactions).await, - Self::Rich(h) => h.redact_images(redactions).await, + Self::Image(h) => h.redact(redactions).await, + Self::Rich(h) => ImageHandler::redact(h, redactions).await, Self::Text(_) | Self::Audio(_) => Ok(()), } } @@ -162,9 +162,8 @@ impl ContentHandle { &mut self, redactions: Redactions, ) -> Result<(), Error> { - use crate::transform::AudioTransform; match self { - Self::Audio(h) => h.redact_audio(redactions).await, + Self::Audio(h) => h.redact(redactions).await, Self::Text(_) | Self::Image(_) | Self::Rich(_) => Ok(()), } } diff --git a/crates/nvisy-codec/src/document/span.rs b/crates/nvisy-codec/src/document/span.rs deleted file mode 100644 index 29f85e0b..00000000 --- a/crates/nvisy-codec/src/document/span.rs +++ /dev/null @@ -1,66 +0,0 @@ -//! [`Span`]: a span of content tagged with its origin. - -use nvisy_core::content::ContentSource; - -/// A span of content tagged with its origin in the source structure. -/// -/// Used both when reading spans from a handler and when sending -/// edits back. The `id` locates the span within the handler's data -/// model and `data` carries the content (or replacement content). -#[derive(Debug, Clone)] -pub struct Span { - /// Content source identity and lineage. 
- pub source: ContentSource, - /// Identifier locating this span within the handler's data model. - pub id: Id, - /// The content of this span. - pub data: Data, -} - -impl Span { - /// Create a new span with the given identifier and data. - pub fn new(id: Id, data: Data) -> Self { - Self { - source: ContentSource::default(), - id, - data, - } - } - - /// Set the content source on this span (builder pattern). - pub fn with_source(mut self, source: ContentSource) -> Self { - self.source = source; - self - } - - /// Transform the data, keeping the identifier and source unchanged. - pub fn map(self, f: impl FnOnce(Data) -> T) -> Span { - Span { - source: self.source, - id: self.id, - data: f(self.data), - } - } -} - -#[cfg(test)] -mod tests { - use nvisy_core::content::ContentSource; - - use super::*; - - #[test] - fn span_with_source() { - let source = ContentSource::new(); - let span = Span::new(0u32, "data").with_source(source); - assert_eq!(span.source, source); - } - - #[test] - fn span_map_transforms_data() { - let span = Span::new(1u32, "hello"); - let mapped = span.map(|d| d.len()); - assert_eq!(mapped.id, 1); - assert_eq!(mapped.data, 5); - } -} diff --git a/crates/nvisy-codec/src/document/stream.rs b/crates/nvisy-codec/src/document/stream.rs index 60aef277..1d902575 100644 --- a/crates/nvisy-codec/src/document/stream.rs +++ b/crates/nvisy-codec/src/document/stream.rs @@ -1,36 +1,42 @@ -//! Async span stream for viewing and editing handler content. +//! Async location stream returned by handler `locations()` methods. use std::pin::Pin; use std::task::{Context, Poll}; use futures::Stream; -use super::Span; +use super::Located; -/// Async stream of spans returned by capability trait methods. +/// Async stream of [`Located`] items returned by handler +/// capability traits. /// -/// Wraps a `Pin>` so that handler implementations -/// can return any iterator/stream without exposing a concrete type. 
-/// -/// Used both for reading spans from a handler and for sending edits -/// back. -pub struct SpanStream<'a, Id, Data> { - inner: Pin> + Send + 'a>>, +/// Wraps a `Pin>` so handlers can return any +/// iterator/stream without exposing a concrete type. +pub struct LocationStream<'a, L> { + inner: Pin> + Send + 'a>>, } -impl<'a, Id, Data> SpanStream<'a, Id, Data> { - /// Wrap any `Send` stream of spans. - pub fn new(stream: impl Stream> + Send + 'a) -> Self { +impl<'a, L> LocationStream<'a, L> { + /// Wrap any `Send` stream of located locations. + pub fn new(stream: impl Stream> + Send + 'a) -> Self { Self { inner: Box::pin(stream), } } + + /// Construct an empty stream. + pub fn empty() -> Self + where + L: Send + 'a, + { + Self::new(futures::stream::empty()) + } } -impl Unpin for SpanStream<'_, Id, Data> {} +impl Unpin for LocationStream<'_, L> {} -impl Stream for SpanStream<'_, Id, Data> { - type Item = Span; +impl Stream for LocationStream<'_, L> { + type Item = Located; fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { self.inner.as_mut().poll_next(cx) diff --git a/crates/nvisy-codec/src/handler/audio/audio_handler.rs b/crates/nvisy-codec/src/handler/audio/audio_handler.rs index 77b2469d..ee79dc11 100644 --- a/crates/nvisy-codec/src/handler/audio/audio_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/audio_handler.rs @@ -8,8 +8,9 @@ use nvisy_core::media::DocumentType; use nvisy_ontology::entity::AudioLocation; use super::{AudioData, Mp3Handler, WavHandler}; -use crate::document::SpanStream; +use crate::document::LocationStream; use crate::handler::{AudioHandler, Handler}; +use crate::transform::{AudioRedaction, Redactions}; /// A type-erased audio handler backed by a boxed trait object. 
pub struct BoxedAudioHandler(Box); @@ -57,47 +58,18 @@ impl Handler for BoxedAudioHandler { #[async_trait::async_trait] impl AudioHandler for BoxedAudioHandler { - async fn audio_spans(&self) -> SpanStream<'_, AudioLocation, AudioData> { - self.0.audio_spans().await + fn locations(&self) -> LocationStream<'_, AudioLocation> { + self.0.locations() } - async fn edit_audio( - &mut self, - edits: SpanStream<'_, AudioLocation, AudioData>, - ) -> Result<(), Error> { - self.0.edit_audio(edits).await - } - - async fn value_at(&self, location: &AudioLocation) -> Option { - self.0.value_at(location).await + async fn read(&self, location: &AudioLocation) -> Option { + self.0.read(location).await } -} - -#[cfg(test)] -mod tests { - use futures::StreamExt; - use super::*; - - #[tokio::test] - async fn wav_variant_delegates() { - let h = BoxedAudioHandler::from(WavHandler::new(bytes::Bytes::from_static(b"wav-data"))); - assert_eq!( - h.document_type(), - DocumentType::Audio(nvisy_core::media::AudioFormat::Wav), - ); - let spans: Vec<_> = h.audio_spans().await.collect().await; - assert_eq!(spans.len(), 1); - assert_eq!(spans[0].data.as_bytes().as_ref(), b"wav-data"); - } - - #[tokio::test] - async fn mp3_variant_delegates() { - let h = BoxedAudioHandler::from(Mp3Handler::new(bytes::Bytes::from_static(b"mp3-data"))); - assert_eq!( - h.document_type(), - DocumentType::Audio(nvisy_core::media::AudioFormat::Mp3), - ); - assert_eq!(h.encode().unwrap().as_bytes(), b"mp3-data"); + async fn redact( + &mut self, + redactions: Redactions, + ) -> Result<(), Error> { + self.0.redact(redactions).await } } diff --git a/crates/nvisy-codec/src/handler/audio/mod.rs b/crates/nvisy-codec/src/handler/audio/mod.rs index cdc3f567..56885b8f 100644 --- a/crates/nvisy-codec/src/handler/audio/mod.rs +++ b/crates/nvisy-codec/src/handler/audio/mod.rs @@ -4,7 +4,8 @@ use nvisy_core::Error; use nvisy_ontology::entity::AudioLocation; use super::Handler; -use crate::document::SpanStream; +use 
crate::document::LocationStream; +use crate::transform::{AudioRedaction, Redactions}; mod audio_data; mod audio_handler; @@ -24,22 +25,26 @@ pub use self::wav_loader::{WavLoader, WavParams}; /// Capability trait for handlers that expose audio content. /// -/// All audio handlers use [`AudioLocation`] as their span identifier. +/// Handlers expose audio content as a stream of [`AudioLocation`]s +/// (cheap, identity-only), with explicit `read` calls to fetch the +/// payload for any given location, and a `redact` call that applies a +/// batch of [`AudioRedaction`]s grouped by location. #[async_trait::async_trait] pub trait AudioHandler: Handler { - /// Return audio content as an async stream of [`Span`](crate::document::Span)s. + /// Async stream of [`AudioLocation`]s for this document, each + /// tagged with the handler's [`ContentSource`]. /// - /// Each span carries an [`AudioLocation`] and [`AudioData`] payload. - async fn audio_spans(&self) -> SpanStream<'_, AudioLocation, AudioData>; + /// [`ContentSource`]: nvisy_core::content::ContentSource + fn locations(&self) -> LocationStream<'_, AudioLocation>; - /// Apply audio edits from an async stream back to the handler. - async fn edit_audio( - &mut self, - edits: SpanStream<'_, AudioLocation, AudioData>, - ) -> Result<(), Error>; - - /// Extract the audio data at the given location (time span segment). + /// Read the audio segment at the given location (time-span slice). /// /// Returns `None` if the location is out of bounds. - async fn value_at(&self, location: &AudioLocation) -> Option; + async fn read(&self, location: &AudioLocation) -> Option; + + /// Apply a batch of redactions grouped by [`AudioLocation`]. 
+ async fn redact( + &mut self, + redactions: Redactions, + ) -> Result<(), Error>; } diff --git a/crates/nvisy-codec/src/handler/image/image_handler.rs b/crates/nvisy-codec/src/handler/image/image_handler.rs index f8027620..b1db2149 100644 --- a/crates/nvisy-codec/src/handler/image/image_handler.rs +++ b/crates/nvisy-codec/src/handler/image/image_handler.rs @@ -8,8 +8,9 @@ use nvisy_core::media::DocumentType; use nvisy_ontology::entity::ImageLocation; use super::{ImageData, JpegHandler, PngHandler, TiffHandler}; -use crate::document::SpanStream; +use crate::document::LocationStream; use crate::handler::{Handler, ImageHandler}; +use crate::transform::{ImageRedaction, Redactions}; /// A type-erased image handler backed by a boxed trait object. pub struct BoxedImageHandler(Box); @@ -63,60 +64,18 @@ impl Handler for BoxedImageHandler { #[async_trait::async_trait] impl ImageHandler for BoxedImageHandler { - async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData> { - self.0.image_spans().await + fn locations(&self) -> LocationStream<'_, ImageLocation> { + self.0.locations() } - async fn edit_images( - &mut self, - edits: SpanStream<'_, ImageLocation, ImageData>, - ) -> Result<(), Error> { - self.0.edit_images(edits).await - } - - async fn value_at(&self, location: &ImageLocation) -> Option { - self.0.value_at(location).await - } -} - -#[cfg(test)] -mod tests { - use futures::StreamExt; - - use super::*; - - fn make_png() -> PngHandler { - let img = image::DynamicImage::new_rgb8(1, 1); - PngHandler::new(img) + async fn read(&self, location: &ImageLocation) -> Option { + self.0.read(location).await } - fn make_jpeg() -> JpegHandler { - let img = image::DynamicImage::new_rgb8(1, 1); - JpegHandler::new(img) - } - - #[test] - fn png_variant_document_type() { - let h = BoxedImageHandler::from(make_png()); - assert_eq!( - h.document_type(), - DocumentType::Image(nvisy_core::media::ImageFormat::Png), - ); - } - - #[test] - fn jpeg_variant_document_type() { - let 
h = BoxedImageHandler::from(make_jpeg()); - assert_eq!( - h.document_type(), - DocumentType::Image(nvisy_core::media::ImageFormat::Jpeg), - ); - } - - #[tokio::test] - async fn view_spans_returns_image() { - let h = BoxedImageHandler::from(make_png()); - let spans: Vec<_> = h.image_spans().await.collect().await; - assert_eq!(spans.len(), 1); + async fn redact( + &mut self, + redactions: Redactions, + ) -> Result<(), Error> { + self.0.redact(redactions).await } } diff --git a/crates/nvisy-codec/src/handler/image/mod.rs b/crates/nvisy-codec/src/handler/image/mod.rs index e5bb3ce5..6c8989a6 100644 --- a/crates/nvisy-codec/src/handler/image/mod.rs +++ b/crates/nvisy-codec/src/handler/image/mod.rs @@ -4,7 +4,8 @@ use nvisy_core::Error; use nvisy_ontology::entity::ImageLocation; use super::Handler; -use crate::document::SpanStream; +use crate::document::LocationStream; +use crate::transform::{ImageRedaction, Redactions}; mod image_data; mod image_handler; @@ -31,22 +32,26 @@ pub use self::tiff_loader::{TiffLoader, TiffParams}; /// Capability trait for handlers that expose image content. /// -/// All image handlers use [`ImageLocation`] as their span identifier. +/// Handlers expose image content as a stream of [`ImageLocation`]s +/// (cheap, identity-only), with explicit `read` calls to fetch the +/// payload for any given location, and a `redact` call that applies a +/// batch of [`ImageRedaction`]s grouped by location. #[async_trait::async_trait] pub trait ImageHandler: Handler { - /// Return image content as an async stream of [`Span`](crate::document::Span)s. + /// Async stream of [`ImageLocation`]s for this document, each + /// tagged with the handler's [`ContentSource`]. /// - /// Each span carries an [`ImageLocation`] and [`ImageData`] payload. 
- async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData>; + /// [`ContentSource`]: nvisy_core::content::ContentSource + fn locations(&self) -> LocationStream<'_, ImageLocation>; - /// Apply image edits from an async stream back to the handler. - async fn edit_images( - &mut self, - edits: SpanStream<'_, ImageLocation, ImageData>, - ) -> Result<(), Error>; - - /// Extract the image data at the given location (crop the bounding box). + /// Read the image data at the given location (crop the bounding box). /// /// Returns `None` if the location is out of bounds. - async fn value_at(&self, location: &ImageLocation) -> Option; + async fn read(&self, location: &ImageLocation) -> Option; + + /// Apply a batch of redactions grouped by [`ImageLocation`]. + async fn redact( + &mut self, + redactions: Redactions, + ) -> Result<(), Error>; } diff --git a/crates/nvisy-codec/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs index 73eca242..81e1bf3f 100644 --- a/crates/nvisy-codec/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -15,7 +15,6 @@ use nvisy_core::media::DocumentType; mod audio; mod image; mod rich; -mod tabular; mod text; use nvisy_core::content::ContentSource; @@ -23,7 +22,6 @@ use nvisy_core::content::ContentSource; pub use self::audio::*; pub use self::image::*; pub use self::rich::*; -pub use self::tabular::*; pub use self::text::*; /// Base trait implemented by all format handlers. @@ -32,7 +30,7 @@ pub use self::text::*; /// identify and serialize it. Handlers are produced by their /// corresponding [`Loader`]. /// -/// Capability-specific span access is provided by the opt-in traits +/// Capability-specific access is provided by the opt-in traits /// [`TextHandler`], [`ImageHandler`], and [`AudioHandler`]. pub trait Handler: Send + Sync + 'static { /// The document type this handler represents. 
diff --git a/crates/nvisy-codec/src/handler/rich/rich_handler.rs b/crates/nvisy-codec/src/handler/rich/rich_handler.rs index f694cd8b..23ad7450 100644 --- a/crates/nvisy-codec/src/handler/rich/rich_handler.rs +++ b/crates/nvisy-codec/src/handler/rich/rich_handler.rs @@ -9,66 +9,64 @@ use nvisy_ontology::entity::{ImageLocation, TextLocation}; #[cfg(feature = "pdf")] use super::RichTextHandler; -use crate::document::SpanStream; +use crate::document::LocationStream; use crate::handler::image::ImageData; use crate::handler::text::TextData; use crate::handler::{Handler, ImageHandler, TextHandler}; +use crate::transform::{ImageRedaction, Redactions, TextRedaction}; /// A type-erased rich-document handler backed by a boxed trait object. -/// -/// Since both [`TextHandler`] and [`ImageHandler`] are now directly -/// object-safe, this is a simple `Box` wrapper. pub struct BoxedRichHandler(Box); /// Combined text + image handler trait for rich documents (PDF, DOCX). #[async_trait::async_trait] pub(crate) trait RichHandler: Handler + Send + Sync { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData>; - async fn edit_text( + fn text_locations(&self) -> LocationStream<'_, TextLocation>; + async fn read_text(&self, location: &TextLocation) -> Option; + async fn redact_text( &mut self, - edits: SpanStream<'_, TextLocation, TextData>, + redactions: Redactions, ) -> Result<(), Error>; - async fn text_value_at(&self, location: &TextLocation) -> Option; - async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData>; - async fn edit_images( + fn image_locations(&self) -> LocationStream<'_, ImageLocation>; + async fn read_image(&self, location: &ImageLocation) -> Option; + async fn redact_images( &mut self, - edits: SpanStream<'_, ImageLocation, ImageData>, + redactions: Redactions, ) -> Result<(), Error>; - async fn image_value_at(&self, location: &ImageLocation) -> Option; } #[cfg(feature = "pdf")] #[async_trait::async_trait] impl RichHandler for 
RichTextHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - TextHandler::text_spans(self).await + fn text_locations(&self) -> LocationStream<'_, TextLocation> { + TextHandler::locations(self) } - async fn edit_text( + async fn read_text(&self, location: &TextLocation) -> Option { + TextHandler::read(self, location).await + } + + async fn redact_text( &mut self, - edits: SpanStream<'_, TextLocation, TextData>, + redactions: Redactions, ) -> Result<(), Error> { - TextHandler::edit_text(self, edits).await + TextHandler::redact(self, redactions).await } - async fn text_value_at(&self, location: &TextLocation) -> Option { - TextHandler::value_at(self, location).await + fn image_locations(&self) -> LocationStream<'_, ImageLocation> { + ImageHandler::locations(self) } - async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData> { - ImageHandler::image_spans(self).await + async fn read_image(&self, location: &ImageLocation) -> Option { + ImageHandler::read(self, location).await } - async fn edit_images( + async fn redact_images( &mut self, - edits: SpanStream<'_, ImageLocation, ImageData>, + redactions: Redactions, ) -> Result<(), Error> { - ImageHandler::edit_images(self, edits).await - } - - async fn image_value_at(&self, location: &ImageLocation) -> Option { - ImageHandler::value_at(self, location).await + ImageHandler::redact(self, redactions).await } } @@ -109,36 +107,36 @@ impl Handler for BoxedRichHandler { #[async_trait::async_trait] impl TextHandler for BoxedRichHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - self.0.text_spans().await + fn locations(&self) -> LocationStream<'_, TextLocation> { + self.0.text_locations() } - async fn edit_text( - &mut self, - edits: SpanStream<'_, TextLocation, TextData>, - ) -> Result<(), Error> { - self.0.edit_text(edits).await + async fn read(&self, location: &TextLocation) -> Option { + self.0.read_text(location).await } - async fn value_at(&self, 
location: &TextLocation) -> Option { - self.0.text_value_at(location).await + async fn redact( + &mut self, + redactions: Redactions, + ) -> Result<(), Error> { + self.0.redact_text(redactions).await } } #[async_trait::async_trait] impl ImageHandler for BoxedRichHandler { - async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData> { - self.0.image_spans().await + fn locations(&self) -> LocationStream<'_, ImageLocation> { + self.0.image_locations() } - async fn edit_images( - &mut self, - edits: SpanStream<'_, ImageLocation, ImageData>, - ) -> Result<(), Error> { - self.0.edit_images(edits).await + async fn read(&self, location: &ImageLocation) -> Option { + self.0.read_image(location).await } - async fn value_at(&self, location: &ImageLocation) -> Option { - self.0.image_value_at(location).await + async fn redact( + &mut self, + redactions: Redactions, + ) -> Result<(), Error> { + self.0.redact_images(redactions).await } } diff --git a/crates/nvisy-codec/src/handler/tabular/mod.rs b/crates/nvisy-codec/src/handler/tabular/mod.rs deleted file mode 100644 index bb5eec2c..00000000 --- a/crates/nvisy-codec/src/handler/tabular/mod.rs +++ /dev/null @@ -1,20 +0,0 @@ -//! Tabular format handlers and loaders (CSV, XLSX). -//! -//! Tabular handlers expose cell-level text spans via [`TextHandler`] -//! and address cells by byte offsets computed from the serialized form. -//! -//! 
[`TextHandler`]: crate::handler::TextHandler - -mod csv_handler; -mod csv_loader; -#[cfg(feature = "xlsx")] -mod xlsx_handler; -#[cfg(feature = "xlsx")] -mod xlsx_loader; - -pub use self::csv_handler::{CsvData, CsvHandler}; -pub use self::csv_loader::{CsvLoader, CsvParams}; -#[cfg(feature = "xlsx")] -pub use self::xlsx_handler::XlsxHandler; -#[cfg(feature = "xlsx")] -pub use self::xlsx_loader::{XlsxLoader, XlsxParams}; diff --git a/crates/nvisy-codec/src/handler/tabular/csv_handler.rs b/crates/nvisy-codec/src/handler/text/csv_handler.rs similarity index 100% rename from crates/nvisy-codec/src/handler/tabular/csv_handler.rs rename to crates/nvisy-codec/src/handler/text/csv_handler.rs diff --git a/crates/nvisy-codec/src/handler/tabular/csv_loader.rs b/crates/nvisy-codec/src/handler/text/csv_loader.rs similarity index 100% rename from crates/nvisy-codec/src/handler/tabular/csv_loader.rs rename to crates/nvisy-codec/src/handler/text/csv_loader.rs diff --git a/crates/nvisy-codec/src/handler/text/mod.rs b/crates/nvisy-codec/src/handler/text/mod.rs index 90d00e98..f507c67d 100644 --- a/crates/nvisy-codec/src/handler/text/mod.rs +++ b/crates/nvisy-codec/src/handler/text/mod.rs @@ -4,8 +4,11 @@ use nvisy_core::Error; use nvisy_ontology::entity::TextLocation; use super::Handler; -use crate::document::SpanStream; +use crate::document::LocationStream; +use crate::transform::{Redactions, TextRedaction}; +mod csv_handler; +mod csv_loader; #[cfg(feature = "html")] mod html_handler; #[cfg(feature = "html")] @@ -17,7 +20,11 @@ mod text_data; mod text_handler; mod txt_handler; mod txt_loader; +mod xlsx_handler; +mod xlsx_loader; +pub use self::csv_handler::{CsvData, CsvHandler}; +pub use self::csv_loader::{CsvLoader, CsvParams}; #[cfg(feature = "html")] pub use self::html_handler::{HtmlData, HtmlHandler}; #[cfg(feature = "html")] @@ -29,43 +36,45 @@ pub use self::text_data::TextData; pub use self::text_handler::BoxedTextHandler; pub use self::txt_handler::TxtHandler; pub use 
self::txt_loader::{TxtLoader, TxtParams}; +pub use self::xlsx_handler::XlsxHandler; +pub use self::xlsx_loader::{XlsxLoader, XlsxParams}; /// Capability trait for handlers that expose text content. /// -/// Handlers implementing this trait yield text spans addressed by -/// [`TextLocation`] and accept text edits keyed by the same type. +/// Handlers expose text content as a stream of [`TextLocation`]s +/// (cheap, identity-only), with explicit `read` calls to fetch the +/// payload for any given location, and a `redact` call that applies a +/// batch of [`TextRedaction`]s grouped by location. /// /// # Offset semantics /// /// Byte offsets in [`TextLocation`] are relative to the handler's /// **serialized** form. For plain text this is identical to the /// in-memory form; for JSON and CSV the offsets include formatting -/// characters (quotes, escapes, delimiters). Use [`value_at`] to -/// extract the logical value at a location rather than slicing the -/// serialized bytes directly. +/// characters (quotes, escapes, delimiters). Use [`read`] to extract +/// the logical value at a location rather than slicing the serialized +/// bytes directly. /// -/// [`value_at`]: TextHandler::value_at +/// [`read`]: TextHandler::read #[async_trait::async_trait] pub trait TextHandler: Handler { - /// Return text content as an async stream of [`Span`]s. + /// Async stream of [`TextLocation`]s for this document, each + /// tagged with the handler's [`ContentSource`]. /// - /// Each span carries a [`TextLocation`] identifying its position - /// within the document and a [`TextData`] payload. + /// [`ContentSource`]: nvisy_core::content::ContentSource + fn locations(&self) -> LocationStream<'_, TextLocation>; + + /// Read the text content at the given location. /// - /// [`Span`]: crate::document::Span - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData>; + /// Returns `None` if the location is out of bounds. 
+ async fn read(&self, location: &TextLocation) -> Option; - /// Apply text edits from an async stream back to the handler. + /// Apply a batch of redactions grouped by [`TextLocation`]. /// - /// The stream items must use [`TextLocation`] values that - /// correspond to spans returned by [`text_spans`](Self::text_spans). - async fn edit_text( + /// The collection enforces overlap policy on insert; this method + /// trusts that ranges within a single location do not overlap. + async fn redact( &mut self, - edits: SpanStream<'_, TextLocation, TextData>, + redactions: Redactions, ) -> Result<(), Error>; - - /// Extract the text value at the given location, if available. - /// - /// Returns `None` if the location is out of bounds. - async fn value_at(&self, location: &TextLocation) -> Option; } diff --git a/crates/nvisy-codec/src/handler/text/text_handler.rs b/crates/nvisy-codec/src/handler/text/text_handler.rs index 1cb19b8e..bad0fe7e 100644 --- a/crates/nvisy-codec/src/handler/text/text_handler.rs +++ b/crates/nvisy-codec/src/handler/text/text_handler.rs @@ -8,13 +8,11 @@ use nvisy_core::media::DocumentType; use nvisy_ontology::entity::TextLocation; use super::TextData; -use crate::document::SpanStream; +use crate::document::LocationStream; use crate::handler::{Handler, TextHandler}; +use crate::transform::{Redactions, TextRedaction}; /// A type-erased text handler backed by a boxed trait object. -/// -/// Since [`TextHandler`] no longer has an associated type, it is -/// directly object-safe and can be stored as `Box`. 
pub struct BoxedTextHandler(Box); impl BoxedTextHandler { @@ -49,19 +47,19 @@ impl Handler for BoxedTextHandler { #[async_trait::async_trait] impl TextHandler for BoxedTextHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - self.0.text_spans().await + fn locations(&self) -> LocationStream<'_, TextLocation> { + self.0.locations() } - async fn edit_text( - &mut self, - edits: SpanStream<'_, TextLocation, TextData>, - ) -> Result<(), Error> { - self.0.edit_text(edits).await + async fn read(&self, location: &TextLocation) -> Option { + self.0.read(location).await } - async fn value_at(&self, location: &TextLocation) -> Option { - self.0.value_at(location).await + async fn redact( + &mut self, + redactions: Redactions, + ) -> Result<(), Error> { + self.0.redact(redactions).await } } @@ -78,41 +76,10 @@ macro_rules! impl_from_text_handler { }; } -use super::{JsonHandler, TxtHandler}; -use crate::handler::tabular::CsvHandler; -impl_from_text_handler!(TxtHandler, CsvHandler, JsonHandler); +use super::{CsvHandler, JsonHandler, TxtHandler, XlsxHandler}; +impl_from_text_handler!(TxtHandler, CsvHandler, JsonHandler, XlsxHandler); #[cfg(feature = "html")] use super::HtmlHandler; #[cfg(feature = "html")] impl_from_text_handler!(HtmlHandler); - -#[cfg(feature = "xlsx")] -use crate::handler::tabular::XlsxHandler; -#[cfg(feature = "xlsx")] -impl_from_text_handler!(XlsxHandler); - -#[cfg(test)] -mod tests { - use futures::StreamExt; - use nvisy_core::media::TextFormat; - - use super::*; - use crate::handler::TxtHandler; - - #[test] - fn txt_variant_document_type() { - let h = BoxedTextHandler::from(TxtHandler::new(vec!["hello".into()], false)); - assert_eq!(h.document_type(), DocumentType::Text(TextFormat::Txt)); - } - - #[tokio::test] - async fn view_spans_returns_text() { - let h = - BoxedTextHandler::from(TxtHandler::new(vec!["line1".into(), "line2".into()], false)); - let spans: Vec<_> = h.text_spans().await.collect().await; - 
assert_eq!(spans.len(), 2); - assert_eq!(spans[0].data, "line1"); - assert_eq!(spans[1].data, "line2"); - } -} diff --git a/crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs b/crates/nvisy-codec/src/handler/text/xlsx_handler.rs similarity index 100% rename from crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs rename to crates/nvisy-codec/src/handler/text/xlsx_handler.rs diff --git a/crates/nvisy-codec/src/handler/tabular/xlsx_loader.rs b/crates/nvisy-codec/src/handler/text/xlsx_loader.rs similarity index 100% rename from crates/nvisy-codec/src/handler/tabular/xlsx_loader.rs rename to crates/nvisy-codec/src/handler/text/xlsx_loader.rs diff --git a/crates/nvisy-codec/src/lib.rs b/crates/nvisy-codec/src/lib.rs index 24473b1a..02ff2810 100644 --- a/crates/nvisy-codec/src/lib.rs +++ b/crates/nvisy-codec/src/lib.rs @@ -6,4 +6,4 @@ mod document; pub mod handler; pub mod transform; -pub use self::document::{ContentHandle, Span, SpanStream}; +pub use self::document::{ContentHandle, Located, LocationStream}; diff --git a/crates/nvisy-codec/src/transform/audio/mod.rs b/crates/nvisy-codec/src/transform/audio/mod.rs index 3c7d3a8a..ebb72430 100644 --- a/crates/nvisy-codec/src/transform/audio/mod.rs +++ b/crates/nvisy-codec/src/transform/audio/mod.rs @@ -1,7 +1,5 @@ //! Audio redaction primitives. mod instruction; -mod transform; pub use self::instruction::{AudioOutput, AudioRedaction}; -pub use self::transform::AudioTransform; diff --git a/crates/nvisy-codec/src/transform/audio/transform.rs b/crates/nvisy-codec/src/transform/audio/transform.rs deleted file mode 100644 index 402371d6..00000000 --- a/crates/nvisy-codec/src/transform/audio/transform.rs +++ /dev/null @@ -1,45 +0,0 @@ -//! [`AudioTransform`] async trait and blanket implementation. 
- -use nvisy_core::Error; -use nvisy_ontology::entity::AudioLocation; - -use super::instruction::AudioRedaction; -use crate::handler::AudioHandler; -use crate::transform::Redactions; - -const TARGET: &str = "nvisy_codec::transform::audio"; - -/// Extension trait for handlers that support audio redaction. -#[async_trait::async_trait] -pub trait AudioTransform: AudioHandler { - /// Apply a batch of audio redactions, mutating in place. - /// - /// Redactions are grouped by [`AudioLocation`] span in the input - /// [`Redactions`] collection. Time-span overlaps within a span are - /// resolved by the collection on insert. - async fn redact_audio( - &mut self, - redactions: Redactions, - ) -> Result<(), Error>; -} - -#[async_trait::async_trait] -impl AudioTransform for H { - async fn redact_audio( - &mut self, - redactions: Redactions, - ) -> Result<(), Error> { - tracing::debug!( - target: TARGET, - redaction_count = redactions.len(), - "applying audio redactions" - ); - if redactions.is_empty() { - return Ok(()); - } - - // TODO: implement audio redaction (silence/remove time ranges) - tracing::warn!(target: TARGET, "audio redaction is not yet implemented"); - Ok(()) - } -} diff --git a/crates/nvisy-codec/src/transform/image/mod.rs b/crates/nvisy-codec/src/transform/image/mod.rs index 8ff18a98..38fd933c 100644 --- a/crates/nvisy-codec/src/transform/image/mod.rs +++ b/crates/nvisy-codec/src/transform/image/mod.rs @@ -2,7 +2,6 @@ mod instruction; mod ops; -mod transform; pub use self::instruction::{ImageOutput, ImageRedaction}; -pub use self::transform::ImageTransform; +pub(crate) use self::ops::ImageOps; diff --git a/crates/nvisy-codec/src/transform/image/transform.rs b/crates/nvisy-codec/src/transform/image/transform.rs deleted file mode 100644 index 04b98061..00000000 --- a/crates/nvisy-codec/src/transform/image/transform.rs +++ /dev/null @@ -1,109 +0,0 @@ -//! [`ImageTransform`] async trait and blanket implementation. 
- -use std::iter; - -use futures::StreamExt; -use image::DynamicImage; -use nvisy_core::Error; -use nvisy_ontology::entity::ImageLocation; - -use super::instruction::{ImageOutput, ImageRedaction}; -use super::ops::ImageOps; -use crate::document::{Span, SpanStream}; -use crate::handler::{ImageData, ImageHandler}; -use crate::transform::Redactions; - -const TARGET: &str = "nvisy_codec::transform::image"; - -/// Extension trait for handlers that support image redaction. -#[async_trait::async_trait] -pub trait ImageTransform: ImageHandler { - /// Apply a batch of image redactions, mutating in place. - /// - /// Redactions are grouped by [`ImageLocation`] span in the input - /// [`Redactions`] collection. Bounding-box overlaps within a span - /// are resolved by the collection on insert. - async fn redact_images( - &mut self, - redactions: Redactions, - ) -> Result<(), Error>; -} - -#[async_trait::async_trait] -impl ImageTransform for H { - async fn redact_images( - &mut self, - redactions: Redactions, - ) -> Result<(), Error> { - tracing::debug!( - target: TARGET, - redaction_count = redactions.len(), - "applying image redactions" - ); - if redactions.is_empty() { - return Ok(()); - } - - let spans: Vec<_> = self.image_spans().await.collect().await; - let span = match spans.into_iter().next() { - Some(s) => s, - None => return Ok(()), - }; - - let image_data: ImageData = span.data; - let mut img: DynamicImage = image_data.into_inner(); - - // Image handlers expose a single span; apply every redaction in - // the collection to that one image. The collection's grouping - // is preserved but does not gate application here. 
- for (_loc, items) in redactions { - for redaction in items { - let region = redaction.bounding_box.to_pixel(); - match &redaction.output { - ImageOutput::Blur { sigma } => { - img.apply_gaussian_blur(&region, *sigma); - } - ImageOutput::Block { color } => { - img.apply_block_overlay(&region, *color); - } - ImageOutput::Pixelate { block_size } => { - img.apply_pixelate(&region, *block_size); - } - ImageOutput::Replace { data } => { - let replacement = match image::load_from_memory(data) { - Ok(r) => r, - Err(e) => { - tracing::warn!( - target: TARGET, - region = ?region, - error = %e, - "failed to decode replacement image data, skipping region" - ); - continue; - } - }; - let resized = replacement.resize_exact( - region.width, - region.height, - image::imageops::FilterType::Lanczos3, - ); - image::imageops::overlay( - &mut img, - &resized, - region.x as i64, - region.y as i64, - ); - } - } - } - } - - self.edit_images(SpanStream::new(futures::stream::iter(iter::once( - Span::new(span.id, ImageData::from(img)), - )))) - .await?; - - tracing::debug!(target: TARGET, "image redactions applied"); - Ok(()) - } -} diff --git a/crates/nvisy-codec/src/transform/mod.rs b/crates/nvisy-codec/src/transform/mod.rs index d5fd546c..3f931666 100644 --- a/crates/nvisy-codec/src/transform/mod.rs +++ b/crates/nvisy-codec/src/transform/mod.rs @@ -1,4 +1,16 @@ -//! Redaction transform traits and output types. +//! Redaction transform primitives. +//! +//! [`Redactions`] groups per-modality instructions by their target +//! span identity and enforces an overlap [`ConflictPolicy`] on insert. +//! Handler capability traits ([`TextHandler`], [`ImageHandler`], +//! [`AudioHandler`]) consume these collections directly via their +//! `redact` methods. +//! +//! [`Redactions`]: crate::transform::Redactions +//! [`ConflictPolicy`]: crate::transform::ConflictPolicy +//! [`TextHandler`]: crate::handler::TextHandler +//! [`ImageHandler`]: crate::handler::ImageHandler +//! 
[`AudioHandler`]: crate::handler::AudioHandler mod audio; mod image; @@ -8,10 +20,11 @@ mod redactions; mod tabular; mod text; -pub use self::audio::{AudioOutput, AudioRedaction, AudioTransform}; -pub use self::image::{ImageOutput, ImageRedaction, ImageTransform}; +pub use self::audio::{AudioOutput, AudioRedaction}; +pub(crate) use self::image::ImageOps; +pub use self::image::{ImageOutput, ImageRedaction}; pub use self::mergeable::Mergeable; pub use self::policy::{ConflictPolicy, InsertError}; pub use self::redactions::Redactions; -pub use self::tabular::{TabularRedaction, TabularTransform}; -pub use self::text::{TextOutput, TextRedaction, TextTransform}; +pub use self::tabular::TabularRedaction; +pub use self::text::{TextOutput, TextRedaction}; diff --git a/crates/nvisy-codec/src/transform/tabular/mod.rs b/crates/nvisy-codec/src/transform/tabular/mod.rs index e8b233d1..9465c8c0 100644 --- a/crates/nvisy-codec/src/transform/tabular/mod.rs +++ b/crates/nvisy-codec/src/transform/tabular/mod.rs @@ -1,7 +1,5 @@ //! Tabular redaction primitives. mod instruction; -mod transform; pub use self::instruction::TabularRedaction; -pub use self::transform::TabularTransform; diff --git a/crates/nvisy-codec/src/transform/tabular/transform.rs b/crates/nvisy-codec/src/transform/tabular/transform.rs deleted file mode 100644 index ebeb8174..00000000 --- a/crates/nvisy-codec/src/transform/tabular/transform.rs +++ /dev/null @@ -1,278 +0,0 @@ -//! [`TabularTransform`] async trait and blanket implementation. -//! -//! Bridges [`TabularRedaction`] (cell-addressed by row/col) to the -//! underlying [`TextHandler`] (byte-offset-addressed spans). -//! -//! The blanket implementation walks the per-cell groups in a -//! [`Redactions`] collection, locates each cell's text span via -//! `line_number`/column-position, applies intra-cell byte-offset -//! replacements right-to-left, and writes results back via -//! [`TextHandler::edit_text`]. -//! -//! 
Overlap detection is owned by [`Redactions`]; this transform -//! trusts that ranges within a single cell do not overlap. - -use std::cmp::Reverse; -use std::collections::HashMap; - -use futures::StreamExt; -use nvisy_core::Error; -use nvisy_ontology::entity::{TabularLocation, TextLocation}; - -use super::instruction::TabularRedaction; -use crate::document::{Span, SpanStream}; -use crate::handler::{TextData, TextHandler}; -use crate::transform::Redactions; - -const TARGET: &str = "nvisy_codec::transform::tabular"; - -/// Extension trait for text handlers that support cell-addressed -/// tabular redaction. -/// -/// Implemented automatically for all [`TextHandler`] types via a -/// blanket impl. The trait translates `(row, col)` cell addresses -/// into the byte-offset [`TextLocation`]s that the handler understands. -#[async_trait::async_trait] -pub trait TabularTransform: TextHandler { - /// Apply a batch of cell-addressed redactions, mutating in place. - /// - /// Redactions are grouped by [`TabularLocation`] cell in the - /// input [`Redactions`] collection. Overlap detection per cell is - /// handled by the collection on insert. - async fn redact_tabular( - &mut self, - redactions: Redactions, - ) -> Result<(), Error>; -} - -#[async_trait::async_trait] -impl TabularTransform for H { - async fn redact_tabular( - &mut self, - redactions: Redactions, - ) -> Result<(), Error> { - tracing::debug!( - target: TARGET, - redaction_count = redactions.len(), - "applying tabular redactions" - ); - if redactions.is_empty() { - return Ok(()); - } - - // Collect all text spans and build a (row, col) -> span index. - let all_spans: Vec<_> = self.text_spans().await.collect().await; - - // Group span indices by line_number (= row), preserving column order. 
- let mut rows: HashMap> = HashMap::new(); - for (idx, span) in all_spans.iter().enumerate() { - let line = span.id.line_number.unwrap_or(1); - rows.entry(line).or_default().push(idx); - } - - // Build sorted row keys so we can map row_index -> line_number. - let mut line_numbers: Vec = rows.keys().copied().collect(); - line_numbers.sort_unstable(); - - let mut edits: Vec> = Vec::new(); - for (cell, mut items) in redactions { - // Map row_index -> line_number -> span indices. - let line_num = line_numbers.get(cell.row_index).ok_or_else(|| { - Error::validation( - format!( - "row_index {} out of bounds (have {} rows)", - cell.row_index, - line_numbers.len() - ), - "tabular-redact", - ) - })?; - let row_spans = rows.get(line_num).ok_or_else(|| { - Error::validation( - format!("no spans for line_number {line_num}"), - "tabular-redact", - ) - })?; - let &span_idx = row_spans.get(cell.column_index).ok_or_else(|| { - Error::validation( - format!( - "column_index {} out of bounds in row {} (have {} columns)", - cell.column_index, - cell.row_index, - row_spans.len() - ), - "tabular-redact", - ) - })?; - - let span = &all_spans[span_idx]; - let content: &str = span.data.as_ref(); - - // Sort right-to-left so earlier byte offsets stay valid. 
- items.sort_by_key(|r| Reverse(r.start)); - - let mut result = content.to_string(); - for r in &items { - let value = r.output.replacement_value().unwrap_or_default(); - let s = r.start.min(result.len()); - let e = r.end.min(result.len()); - if s >= e { - continue; - } - if !result.is_char_boundary(s) || !result.is_char_boundary(e) { - return Err(Error::validation( - format!( - "redaction offset falls mid-character \ - (start={}, end={}, len={})", - r.start, - r.end, - result.len() - ), - "tabular-redact", - )); - } - result.replace_range(s..e, value); - } - - edits.push(Span::new(span.id.clone(), TextData::from(result))); - } - - let edit_count = edits.len(); - if !edits.is_empty() { - self.edit_text(SpanStream::new(futures::stream::iter(edits))) - .await?; - } - - tracing::debug!(target: TARGET, edit_count, "tabular redactions applied"); - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use nvisy_core::Result; - use nvisy_ontology::entity::TabularLocation; - - use super::*; - use crate::handler::{CsvData, CsvHandler}; - use crate::transform::{ConflictPolicy, TextOutput}; - - fn handler() -> CsvHandler { - CsvHandler::new(CsvData { - headers: Some(vec!["name".into(), "ssn".into()]), - rows: vec![ - vec!["Alice Smith".into(), "123-45-6789".into()], - vec!["Bob Jones".into(), "987-65-4321".into()], - ], - delimiter: b',', - trailing_newline: true, - }) - } - - fn cell(row: usize, col: usize) -> TabularLocation { - TabularLocation::builder() - .with_row_index(row) - .with_column_index(col) - .build() - .expect("required fields provided") - } - - fn redaction(start: usize, end: usize, replacement: &str) -> TabularRedaction { - TabularRedaction::new(start, end, TextOutput::replace(replacement)) - } - - #[tokio::test] - async fn single_cell_redaction() -> Result<()> { - let mut h = handler(); - let mut rs = Redactions::new(ConflictPolicy::Reject); - // row 1 = first data row (headers are row 0), col 1 = ssn - rs.try_insert(cell(1, 1), redaction(0, 11, "[REDACTED]")) - 
.unwrap(); - TabularTransform::redact_tabular(&mut h, rs).await?; - assert_eq!(h.cell(0, 1), Some("[REDACTED]")); - Ok(()) - } - - #[tokio::test] - async fn partial_cell_redaction() -> Result<()> { - let mut h = handler(); - let mut rs = Redactions::new(ConflictPolicy::Reject); - // Redact "Alice" (0..5) in the name cell at row 1, col 0. - rs.try_insert(cell(1, 0), redaction(0, 5, "[NAME]")) - .unwrap(); - TabularTransform::redact_tabular(&mut h, rs).await?; - assert_eq!(h.cell(0, 0), Some("[NAME] Smith")); - Ok(()) - } - - #[tokio::test] - async fn multiple_cells_redacted() -> Result<()> { - let mut h = handler(); - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert(cell(1, 1), redaction(0, 11, "[REDACTED]")) - .unwrap(); - rs.try_insert(cell(2, 1), redaction(0, 11, "[REDACTED]")) - .unwrap(); - TabularTransform::redact_tabular(&mut h, rs).await?; - assert_eq!(h.cell(0, 1), Some("[REDACTED]")); - assert_eq!(h.cell(1, 1), Some("[REDACTED]")); - Ok(()) - } - - #[tokio::test] - async fn empty_redactions_is_noop() -> Result<()> { - let mut h = handler(); - let rs: Redactions = Redactions::default(); - TabularTransform::redact_tabular(&mut h, rs).await?; - assert_eq!(h.cell(0, 0), Some("Alice Smith")); - Ok(()) - } - - #[tokio::test] - async fn remove_cell_content() -> Result<()> { - let mut h = handler(); - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert(cell(1, 1), TabularRedaction::new(0, 11, TextOutput::Remove)) - .unwrap(); - TabularTransform::redact_tabular(&mut h, rs).await?; - assert_eq!(h.cell(0, 1), Some("")); - Ok(()) - } - - #[tokio::test] - async fn header_redaction() -> Result<()> { - let mut h = handler(); - let mut rs = Redactions::new(ConflictPolicy::Reject); - // Row 0 = headers - rs.try_insert(cell(0, 1), redaction(0, 3, "[REDACTED]")) - .unwrap(); - TabularTransform::redact_tabular(&mut h, rs).await?; - assert_eq!( - h.headers(), - Some(["name".to_string(), "[REDACTED]".to_string()].as_slice()) - ); - Ok(()) 
- } - - #[tokio::test] - async fn row_out_of_bounds() { - let mut h = handler(); - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert(cell(99, 0), redaction(0, 1, "x")).unwrap(); - let err = TabularTransform::redact_tabular(&mut h, rs) - .await - .unwrap_err(); - assert!(err.to_string().contains("row_index 99 out of bounds")); - } - - #[tokio::test] - async fn col_out_of_bounds() { - let mut h = handler(); - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert(cell(0, 99), redaction(0, 1, "x")).unwrap(); - let err = TabularTransform::redact_tabular(&mut h, rs) - .await - .unwrap_err(); - assert!(err.to_string().contains("column_index 99 out of bounds")); - } -} diff --git a/crates/nvisy-codec/src/transform/text/mod.rs b/crates/nvisy-codec/src/transform/text/mod.rs index 0993a6d9..8620b840 100644 --- a/crates/nvisy-codec/src/transform/text/mod.rs +++ b/crates/nvisy-codec/src/transform/text/mod.rs @@ -1,7 +1,5 @@ //! Text redaction primitives. mod instruction; -mod transform; pub use self::instruction::{TextOutput, TextRedaction}; -pub use self::transform::TextTransform; diff --git a/crates/nvisy-codec/src/transform/text/transform.rs b/crates/nvisy-codec/src/transform/text/transform.rs deleted file mode 100644 index 0e99c015..00000000 --- a/crates/nvisy-codec/src/transform/text/transform.rs +++ /dev/null @@ -1,227 +0,0 @@ -//! [`TextTransform`] async trait and blanket implementation. -//! -//! The blanket implementation walks the per-span groups in a -//! [`Redactions`] collection, reads current content via -//! [`TextHandler::text_spans`], applies intra-span byte-offset -//! replacements right-to-left (so earlier offsets stay valid), and -//! writes the results back via [`TextHandler::edit_text`]. -//! -//! Overlap detection is owned by [`Redactions`]; this transform -//! trusts that ranges within a span do not overlap. 
- -use std::cmp::Reverse; - -use futures::StreamExt; -use nvisy_core::Error; -use nvisy_ontology::entity::TextLocation; - -use super::instruction::TextRedaction; -use crate::document::{Span, SpanStream}; -use crate::handler::{TextData, TextHandler}; -use crate::transform::Redactions; - -const TARGET: &str = "nvisy_codec::transform::text"; - -/// Extension trait for handlers that support text redaction. -#[async_trait::async_trait] -pub trait TextTransform: TextHandler { - /// Apply a batch of text redactions, mutating in place. - /// - /// Redactions are grouped by [`TextLocation`] span in the input - /// [`Redactions`] collection. The implementation assumes ranges - /// within a single span do not overlap — the [`Redactions`] - /// collection enforces this on insert. - async fn redact_text( - &mut self, - redactions: Redactions, - ) -> Result<(), Error>; -} - -#[async_trait::async_trait] -impl TextTransform for H { - async fn redact_text( - &mut self, - redactions: Redactions, - ) -> Result<(), Error> { - tracing::debug!( - target: TARGET, - redaction_count = redactions.len(), - "applying text redactions" - ); - if redactions.is_empty() { - return Ok(()); - } - - // Read current content for all spans, then walk each affected span. - let all_spans: Vec<_> = self.text_spans().await.collect().await; - - let mut edits: Vec> = Vec::new(); - for (span_loc, mut items) in redactions { - let Some(span) = all_spans - .iter() - .find(|s| s.id.start_offset == span_loc.start_offset) - else { - continue; - }; - let content: &str = span.data.as_ref(); - - // Sort right-to-left so earlier byte offsets stay valid. 
- items.sort_by_key(|r| Reverse(r.start)); - - let mut result = content.to_string(); - for r in &items { - let value = r.output.replacement_value().unwrap_or_default(); - let s = r.start.min(result.len()); - let e = r.end.min(result.len()); - if s >= e { - continue; - } - if !result.is_char_boundary(s) || !result.is_char_boundary(e) { - return Err(Error::validation( - format!( - "redaction offset falls mid-character \ - (start={}, end={}, len={})", - r.start, - r.end, - result.len() - ), - "text-redact", - )); - } - result.replace_range(s..e, value); - } - - edits.push(Span::new(span.id.clone(), TextData::from(result))); - } - - let edit_count = edits.len(); - if !edits.is_empty() { - self.edit_text(SpanStream::new(futures::stream::iter(edits))) - .await?; - } - - tracing::debug!(target: TARGET, edit_count, "text redactions applied"); - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use futures::StreamExt; - use nvisy_core::Result; - - use super::*; - use crate::handler::TxtHandler; - use crate::transform::{ConflictPolicy, TextOutput}; - - fn handler(text: &str) -> TxtHandler { - let trailing_newline = text.ends_with('\n'); - let lines = text.lines().map(String::from).collect(); - TxtHandler::new(lines, trailing_newline) - } - - async fn span_for(h: &TxtHandler, line_idx: usize) -> TextLocation { - let spans: Vec<_> = h.text_spans().await.collect().await; - spans[line_idx].id.clone() - } - - #[tokio::test] - async fn single_span_single_redaction() -> Result<()> { - let mut h = handler("hello world\n"); - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert( - span_for(&h, 0).await, - TextRedaction::new(0, 5, TextOutput::replace("[NAME]")), - ) - .unwrap(); - TextTransform::redact_text(&mut h, rs).await?; - - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans[0].data, "[NAME] world"); - Ok(()) - } - - #[tokio::test] - async fn multiple_redactions_within_one_span() -> Result<()> { - let mut h = handler("Alice met Bob\n"); - 
let id = span_for(&h, 0).await; - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert( - id.clone(), - TextRedaction::new(0, 5, TextOutput::replace("[X]")), - ) - .unwrap(); - rs.try_insert(id, TextRedaction::new(10, 13, TextOutput::replace("[Y]"))) - .unwrap(); - TextTransform::redact_text(&mut h, rs).await?; - - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans[0].data, "[X] met [Y]"); - Ok(()) - } - - #[tokio::test] - async fn redaction_spanning_entire_content_replace() -> Result<()> { - let mut h = handler("secret\n"); - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert( - span_for(&h, 0).await, - TextRedaction::new(0, 6, TextOutput::replace("[REDACTED]")), - ) - .unwrap(); - TextTransform::redact_text(&mut h, rs).await?; - - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans[0].data, "[REDACTED]"); - Ok(()) - } - - #[tokio::test] - async fn redaction_spanning_entire_content_remove() -> Result<()> { - let mut h = handler("secret\n"); - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert( - span_for(&h, 0).await, - TextRedaction::new(0, 6, TextOutput::Remove), - ) - .unwrap(); - TextTransform::redact_text(&mut h, rs).await?; - - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans[0].data, ""); - Ok(()) - } - - #[tokio::test] - async fn empty_redactions_is_noop() -> Result<()> { - let mut h = handler("unchanged\n"); - let rs: Redactions = Redactions::default(); - TextTransform::redact_text(&mut h, rs).await?; - - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans[0].data, "unchanged"); - Ok(()) - } - - #[tokio::test] - async fn multiple_spans_with_separate_redactions() -> Result<()> { - let mut h = handler("hello\nworld\n"); - let mut rs = Redactions::new(ConflictPolicy::Reject); - rs.try_insert( - span_for(&h, 0).await, - TextRedaction::new(0, 5, TextOutput::replace("[A]")), - ) - .unwrap(); - 
rs.try_insert( - span_for(&h, 1).await, - TextRedaction::new(0, 5, TextOutput::replace("[B]")), - ) - .unwrap(); - TextTransform::redact_text(&mut h, rs).await?; - - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans[0].data, "[A]"); - assert_eq!(spans[1].data, "[B]"); - Ok(()) - } -} From 62c3b5c350784a26dd414c17e31a7e9cedd24cef Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 19 May 2026 02:21:07 +0200 Subject: [PATCH 5/8] refactor(codec): migrate concrete handlers to location/read/redact API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every text, image, audio, and rich handler now implements the new capability traits directly: locations() yields cheap Located identities, read(&loc) fetches typed *Data on demand, and redact(Redactions) applies a batch in place. Byte-level replacement logic lives in pub(crate) helpers under transform/text/apply.rs and transform/image/apply.rs; handlers walk the Redactions collection and call the helper on the affected slice of their internal model (lines, cells, pages, image buffer). Per-handler tests are rewritten against the new API. The codec crate compiles and 85/85 codec tests pass. The engine still references the old API and does not compile — that's the next commit. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/handler/audio/audio_handler_macro.rs | 63 ++-- .../src/handler/audio/mp3_handler.rs | 42 +-- .../src/handler/audio/wav_handler.rs | 42 +-- .../src/handler/image/image_handler_macro.rs | 56 ++-- .../src/handler/image/jpeg_handler.rs | 22 +- .../src/handler/image/png_handler.rs | 22 +- .../src/handler/image/tiff_handler.rs | 4 +- .../src/handler/rich/pdf_handler.rs | 208 +++++++------ .../src/handler/rich/pdf_loader.rs | 6 +- .../src/handler/text/csv_handler.rs | 213 ++++++------- .../src/handler/text/csv_loader.rs | 14 +- .../src/handler/text/html_handler.rs | 135 ++++---- .../src/handler/text/json_handler.rs | 198 ++++++------ .../src/handler/text/txt_handler.rs | 293 ++++++++---------- .../src/handler/text/txt_loader.rs | 12 +- .../src/handler/text/xlsx_handler.rs | 19 +- .../nvisy-codec/src/transform/image/apply.rs | 51 +++ crates/nvisy-codec/src/transform/image/mod.rs | 3 +- crates/nvisy-codec/src/transform/mod.rs | 3 +- .../nvisy-codec/src/transform/text/apply.rs | 102 ++++++ crates/nvisy-codec/src/transform/text/mod.rs | 2 + 21 files changed, 786 insertions(+), 724 deletions(-) create mode 100644 crates/nvisy-codec/src/transform/image/apply.rs create mode 100644 crates/nvisy-codec/src/transform/text/apply.rs diff --git a/crates/nvisy-codec/src/handler/audio/audio_handler_macro.rs b/crates/nvisy-codec/src/handler/audio/audio_handler_macro.rs index 94a2fd28..fe61b310 100644 --- a/crates/nvisy-codec/src/handler/audio/audio_handler_macro.rs +++ b/crates/nvisy-codec/src/handler/audio/audio_handler_macro.rs @@ -29,18 +29,17 @@ macro_rules! impl_audio_handler { #[async_trait::async_trait] impl crate::handler::AudioHandler for $handler { - async fn audio_spans( + fn locations( &self, - ) -> crate::document::SpanStream< - '_, - nvisy_ontology::entity::AudioLocation, - crate::handler::AudioData, - > { - // Single-track audio: the entire audio as one span - // with a time span covering the full duration. 
- // Duration is unknown without decoding — use 0..0 as - // a placeholder. The actual time span is set by the - // STT extraction operation after transcription. + ) -> crate::document::LocationStream<'_, nvisy_ontology::entity::AudioLocation> + { + use ::std::iter; + + // Single-track audio: the entire audio as one location + // with a time span covering the full duration. Duration + // is unknown without decoding — use 0..0 as a + // placeholder. The actual time span is set by the STT + // extraction operation after transcription. let location = nvisy_ontology::entity::AudioLocation { time_span: nvisy_ontology::primitive::TimeSpan { start_us: 0, @@ -49,33 +48,12 @@ macro_rules! impl_audio_handler { speaker_id: None, audio_id: None, }; - use ::std::iter; - - crate::document::SpanStream::new(futures::stream::iter(iter::once( - crate::document::Span::new( - location, - crate::handler::AudioData::new(self.bytes.clone()), - ), + crate::document::LocationStream::new(futures::stream::iter(iter::once( + crate::document::Located::new(self.source, location), ))) } - async fn edit_audio( - &mut self, - edits: crate::document::SpanStream< - '_, - nvisy_ontology::entity::AudioLocation, - crate::handler::AudioData, - >, - ) -> Result<(), nvisy_core::Error> { - use futures::StreamExt; - let edits: Vec<_> = edits.collect().await; - if let Some(edit) = edits.into_iter().next() { - self.bytes = edit.data.into_inner(); - } - Ok(()) - } - - async fn value_at( + async fn read( &self, _location: &nvisy_ontology::entity::AudioLocation, ) -> Option { @@ -83,6 +61,21 @@ macro_rules! impl_audio_handler { // time span requires decoding, which we don't do here. 
Some(crate::handler::AudioData::new(self.bytes.clone())) } + + async fn redact( + &mut self, + _redactions: crate::transform::Redactions< + nvisy_ontology::entity::AudioLocation, + crate::transform::AudioRedaction, + >, + ) -> Result<(), nvisy_core::Error> { + // TODO: implement audio redaction (silence/remove time ranges) + tracing::warn!( + target: $origin, + "audio redaction is not yet implemented" + ); + Ok(()) + } } impl $handler { diff --git a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs index dd9311ce..cb843619 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs @@ -1,11 +1,13 @@ -//! MP3 handler: holds raw MP3 audio bytes and provides span-based +//! MP3 handler: holds raw MP3 audio bytes and provides location-based //! access via [`AudioHandler`](crate::handler::AudioHandler). //! -//! # Span model +//! [`AudioHandler::locations`] yields a single full-duration +//! [`AudioLocation`]; [`AudioHandler::read`] returns the underlying +//! bytes as [`AudioData`]. Redaction is currently a no-op. //! -//! [`AudioHandler::audio_spans`] yields a single [`Span`] carrying the -//! entire audio payload as [`AudioData`]. [`AudioHandler::edit_audio`] -//! replaces the payload from the first incoming edit. +//! [`AudioHandler::locations`]: crate::handler::AudioHandler::locations +//! [`AudioHandler::read`]: crate::handler::AudioHandler::read +//! 
[`AudioLocation`]: nvisy_ontology::entity::AudioLocation use nvisy_core::content::ContentSource; @@ -31,35 +33,25 @@ impl_audio_handler!( #[cfg(test)] mod tests { use bytes::Bytes; + use futures::StreamExt; use nvisy_core::Error; use super::*; - use crate::document::{Span, SpanStream}; - use crate::handler::{AudioData, AudioHandler, Handler}; + use crate::handler::{AudioHandler, Handler}; #[tokio::test] - async fn view_spans_returns_single_span() { - use futures::StreamExt; + async fn locations_yields_single_location() { let h = Mp3Handler::new(Bytes::from_static(b"ID3-mp3-data")); - let spans: Vec<_> = h.audio_spans().await.collect().await; - assert_eq!(spans.len(), 1); - assert_eq!(spans[0].data.as_bytes().as_ref(), b"ID3-mp3-data"); + let items: Vec<_> = h.locations().collect().await; + assert_eq!(items.len(), 1); } #[tokio::test] - async fn edit_spans_replaces_bytes() -> Result<(), Error> { - let mut h = Mp3Handler::new(Bytes::from_static(b"original")); - let spans: Vec<_> = { - use futures::StreamExt; - h.audio_spans().await.collect().await - }; - h.edit_audio(SpanStream::new(futures::stream::iter(vec![Span::new( - spans[0].id.clone(), - AudioData::new(Bytes::from_static(b"replaced")), - )]))) - .await?; - assert_eq!(h.bytes().as_ref(), b"replaced"); - Ok(()) + async fn read_returns_full_audio() { + let h = Mp3Handler::new(Bytes::from_static(b"ID3-mp3-data")); + let items: Vec<_> = h.locations().collect().await; + let data = h.read(&items[0].location).await.unwrap(); + assert_eq!(data.as_bytes().as_ref(), b"ID3-mp3-data"); } #[test] diff --git a/crates/nvisy-codec/src/handler/audio/wav_handler.rs b/crates/nvisy-codec/src/handler/audio/wav_handler.rs index 1bb9f592..3af1f89a 100644 --- a/crates/nvisy-codec/src/handler/audio/wav_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/wav_handler.rs @@ -1,11 +1,13 @@ -//! WAV handler: holds raw WAV audio bytes and provides span-based +//! WAV handler: holds raw WAV audio bytes and provides location-based //! 
access via [`AudioHandler`](crate::handler::AudioHandler). //! -//! # Span model +//! [`AudioHandler::locations`] yields a single full-duration +//! [`AudioLocation`]; [`AudioHandler::read`] returns the underlying +//! bytes as [`AudioData`]. Redaction is currently a no-op. //! -//! [`AudioHandler::audio_spans`] yields a single [`Span`] carrying the -//! entire audio payload as [`AudioData`]. [`AudioHandler::edit_audio`] -//! replaces the payload from the first incoming edit. +//! [`AudioHandler::locations`]: crate::handler::AudioHandler::locations +//! [`AudioHandler::read`]: crate::handler::AudioHandler::read +//! [`AudioLocation`]: nvisy_ontology::entity::AudioLocation use nvisy_core::content::ContentSource; @@ -31,35 +33,25 @@ impl_audio_handler!( #[cfg(test)] mod tests { use bytes::Bytes; + use futures::StreamExt; use nvisy_core::Error; use super::*; - use crate::document::{Span, SpanStream}; - use crate::handler::{AudioData, AudioHandler, Handler}; + use crate::handler::{AudioHandler, Handler}; #[tokio::test] - async fn view_spans_returns_single_span() { - use futures::StreamExt; + async fn locations_yields_single_location() { let h = WavHandler::new(Bytes::from_static(b"RIFF-wav-data")); - let spans: Vec<_> = h.audio_spans().await.collect().await; - assert_eq!(spans.len(), 1); - assert_eq!(spans[0].data.as_bytes().as_ref(), b"RIFF-wav-data"); + let items: Vec<_> = h.locations().collect().await; + assert_eq!(items.len(), 1); } #[tokio::test] - async fn edit_spans_replaces_bytes() -> Result<(), Error> { - let mut h = WavHandler::new(Bytes::from_static(b"original")); - let spans: Vec<_> = { - use futures::StreamExt; - h.audio_spans().await.collect().await - }; - h.edit_audio(SpanStream::new(futures::stream::iter(vec![Span::new( - spans[0].id.clone(), - AudioData::new(Bytes::from_static(b"replaced")), - )]))) - .await?; - assert_eq!(h.bytes().as_ref(), b"replaced"); - Ok(()) + async fn read_returns_full_audio() { + let h = 
WavHandler::new(Bytes::from_static(b"RIFF-wav-data")); + let items: Vec<_> = h.locations().collect().await; + let data = h.read(&items[0].location).await.unwrap(); + assert_eq!(data.as_bytes().as_ref(), b"RIFF-wav-data"); } #[test] diff --git a/crates/nvisy-codec/src/handler/image/image_handler_macro.rs b/crates/nvisy-codec/src/handler/image/image_handler_macro.rs index 88c84e84..e21ad54e 100644 --- a/crates/nvisy-codec/src/handler/image/image_handler_macro.rs +++ b/crates/nvisy-codec/src/handler/image/image_handler_macro.rs @@ -33,13 +33,12 @@ macro_rules! impl_image_handler { #[async_trait::async_trait] impl crate::handler::ImageHandler for $handler { - async fn image_spans( + fn locations( &self, - ) -> crate::document::SpanStream< - '_, - nvisy_ontology::entity::ImageLocation, - crate::handler::ImageData, - > { + ) -> crate::document::LocationStream<'_, nvisy_ontology::entity::ImageLocation> + { + use ::std::iter; + let (w, h) = (self.image.width(), self.image.height()); let location = nvisy_ontology::entity::ImageLocation { bounding_box: nvisy_ontology::primitive::BoundingBox { @@ -51,33 +50,12 @@ macro_rules! 
impl_image_handler { image_id: None, page_number: None, }; - use ::std::iter; - - crate::document::SpanStream::new(futures::stream::iter(iter::once( - crate::document::Span::new( - location, - crate::handler::ImageData::from(self.image.clone()), - ), + crate::document::LocationStream::new(futures::stream::iter(iter::once( + crate::document::Located::new(self.source, location), ))) } - async fn edit_images( - &mut self, - edits: crate::document::SpanStream< - '_, - nvisy_ontology::entity::ImageLocation, - crate::handler::ImageData, - >, - ) -> Result<(), nvisy_core::Error> { - use futures::StreamExt; - let edits: Vec<_> = edits.collect().await; - if let Some(edit) = edits.into_iter().next() { - self.image = edit.data.into_inner(); - } - Ok(()) - } - - async fn value_at( + async fn read( &self, location: &nvisy_ontology::entity::ImageLocation, ) -> Option { @@ -92,6 +70,24 @@ macro_rules! impl_image_handler { let cropped = self.image.crop_imm(x, y, w, h); Some(crate::handler::ImageData::from(cropped)) } + + async fn redact( + &mut self, + redactions: crate::transform::Redactions< + nvisy_ontology::entity::ImageLocation, + crate::transform::ImageRedaction, + >, + ) -> Result<(), nvisy_core::Error> { + if redactions.is_empty() { + return Ok(()); + } + // Image handlers expose a single full-image location; apply + // every redaction in the collection to the single image. + for (_loc, items) in redactions { + crate::transform::apply_image_redactions(&mut self.image, &items); + } + Ok(()) + } } impl $handler { diff --git a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs index d6951d6a..413dc1e0 100644 --- a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs +++ b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs @@ -1,15 +1,17 @@ -//! JPEG handler: holds a decoded image and provides single-span access -//! via [`ImageHandler`](crate::handler::ImageHandler). +//! 
JPEG handler: holds a decoded image and provides single-location +//! access via [`ImageHandler`]. //! -//! # Span model +//! [`ImageHandler::locations`] yields exactly one full-image +//! [`ImageLocation`]; [`ImageHandler::read`] returns the current +//! [`DynamicImage`](image::DynamicImage) cropped to the location's +//! bounding box; [`ImageHandler::redact`] applies bounding-box +//! redactions in place. //! -//! [`ImageHandler::image_spans`](crate::handler::ImageHandler::image_spans) -//! yields exactly one [`Span`] whose data is the current -//! [`DynamicImage`](image::DynamicImage). -//! [`ImageHandler::edit_images`](crate::handler::ImageHandler::edit_images) -//! replaces the image in-place. -//! -//! [`Span`]: crate::document::Span +//! [`ImageHandler`]: crate::handler::ImageHandler +//! [`ImageHandler::locations`]: crate::handler::ImageHandler::locations +//! [`ImageHandler::read`]: crate::handler::ImageHandler::read +//! [`ImageHandler::redact`]: crate::handler::ImageHandler::redact +//! [`ImageLocation`]: nvisy_ontology::entity::ImageLocation use nvisy_core::content::ContentSource; diff --git a/crates/nvisy-codec/src/handler/image/png_handler.rs b/crates/nvisy-codec/src/handler/image/png_handler.rs index 4e302986..6b07d30a 100644 --- a/crates/nvisy-codec/src/handler/image/png_handler.rs +++ b/crates/nvisy-codec/src/handler/image/png_handler.rs @@ -1,15 +1,17 @@ -//! PNG handler: holds a decoded image and provides single-span access -//! via [`ImageHandler`](crate::handler::ImageHandler). +//! PNG handler: holds a decoded image and provides single-location +//! access via [`ImageHandler`]. //! -//! # Span model +//! [`ImageHandler::locations`] yields exactly one full-image +//! [`ImageLocation`]; [`ImageHandler::read`] returns the current +//! [`DynamicImage`](image::DynamicImage) (cropped to the location's +//! bounding box); [`ImageHandler::redact`] applies bounding-box +//! redactions in place. //! -//! 
[`ImageHandler::image_spans`](crate::handler::ImageHandler::image_spans) -//! yields exactly one [`Span`] whose data is the current -//! [`DynamicImage`](image::DynamicImage). -//! [`ImageHandler::edit_images`](crate::handler::ImageHandler::edit_images) -//! replaces the image in-place. -//! -//! [`Span`]: crate::document::Span +//! [`ImageHandler`]: crate::handler::ImageHandler +//! [`ImageHandler::locations`]: crate::handler::ImageHandler::locations +//! [`ImageHandler::read`]: crate::handler::ImageHandler::read +//! [`ImageHandler::redact`]: crate::handler::ImageHandler::redact +//! [`ImageLocation`]: nvisy_ontology::entity::ImageLocation use nvisy_core::content::ContentSource; diff --git a/crates/nvisy-codec/src/handler/image/tiff_handler.rs b/crates/nvisy-codec/src/handler/image/tiff_handler.rs index a5a47014..3bd643fc 100644 --- a/crates/nvisy-codec/src/handler/image/tiff_handler.rs +++ b/crates/nvisy-codec/src/handler/image/tiff_handler.rs @@ -1,5 +1,5 @@ -//! TIFF handler: holds a decoded image and provides single-span access -//! via [`ImageHandler`](crate::handler::ImageHandler). +//! TIFF handler: holds a decoded image and provides single-location +//! access via [`ImageHandler`](crate::handler::ImageHandler). use nvisy_core::content::ContentSource; diff --git a/crates/nvisy-codec/src/handler/rich/pdf_handler.rs b/crates/nvisy-codec/src/handler/rich/pdf_handler.rs index 9c4a940a..69dba3c7 100644 --- a/crates/nvisy-codec/src/handler/rich/pdf_handler.rs +++ b/crates/nvisy-codec/src/handler/rich/pdf_handler.rs @@ -1,20 +1,17 @@ //! Rich-text handler: holds per-page extracted text and raw document -//! bytes, providing span-based access via [`Handler`] + [`TextHandler`] + -//! [`ImageHandler`]. +//! bytes, providing location-based access via [`Handler`] + +//! [`TextHandler`] + [`ImageHandler`]. //! -//! # Text span model +//! [`TextHandler::locations`] yields one [`TextLocation`] per page, +//! 
with byte offsets computed from cumulative page text lengths and +//! `page_number` set. [`ImageHandler::locations`] yields one +//! [`ImageLocation`] per embedded image. //! -//! [`TextHandler::text_spans`] yields one [`Span`] per page, addressed -//! by [`TextLocation`] with byte offsets computed from cumulative page -//! text lengths and `page_number` set. -//! -//! # Encoding -//! -//! [`Handler::encode`] returns the raw document bytes. Edits applied via -//! [`edit_text`](TextHandler::edit_text) are baked into the raw bytes. +//! [`Handler::encode`] returns the raw document bytes; text redactions +//! applied via [`TextHandler::redact`] are baked into the raw PDF +//! content streams. use bytes::Bytes; -use futures::StreamExt; use nvisy_core::Error; use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::DocumentType; @@ -22,10 +19,15 @@ use nvisy_ontology::entity::{ImageLocation, TextLocation}; use nvisy_ontology::primitive::Dpi; use super::pdf_render::PdfRenderer; -use crate::document::{Span, SpanStream}; +use crate::document::{Located, LocationStream}; use crate::handler::image::ImageData; use crate::handler::text::TextData; use crate::handler::{Handler, ImageHandler, TextHandler}; +use crate::transform::{ + ImageRedaction, Redactions, TextRedaction, apply_text_redactions, +}; + +const TARGET: &str = "rich-text-handler"; /// Handler for rich documents containing pages of text and images. 
#[derive(Debug)] @@ -59,7 +61,7 @@ impl RichTextHandler { let mut doc = lopdf::Document::load_mem(&raw).map_err(|e| { Error::runtime( format!("failed to extract text from PDF: {e}"), - "rich-text-handler", + TARGET, false, ) })?; @@ -67,7 +69,7 @@ impl RichTextHandler { doc.decrypt(password.unwrap_or("")).map_err(|e| { Error::runtime( format!("failed to extract text from PDF: {e}"), - "rich-text-handler", + TARGET, false, ) })?; @@ -157,102 +159,93 @@ impl Handler for RichTextHandler { #[async_trait::async_trait] impl TextHandler for RichTextHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - let offsets = self.page_offsets(); - let spans: Vec<_> = self - .pages - .iter() - .zip(offsets.iter()) - .map(|(text, &(start, end, page))| { - Span::new( + fn locations(&self) -> LocationStream<'_, TextLocation> { + let source = self.source; + let items: Vec<_> = self + .page_offsets() + .into_iter() + .map(|(start, end, page)| { + Located::new( + source, TextLocation { start_offset: start, end_offset: end, page_number: Some(page), ..Default::default() }, - TextData::from(text.clone()), ) - .with_source(self.source) }) .collect(); - SpanStream::new(futures::stream::iter(spans)) + LocationStream::new(futures::stream::iter(items)) + } + + async fn read(&self, location: &TextLocation) -> Option { + let offsets = self.page_offsets(); + let page_idx = offsets + .iter() + .position(|&(start, _, _)| start == location.start_offset)?; + self.pages.get(page_idx).cloned().map(TextData::from) } - async fn edit_text( + async fn redact( &mut self, - edits: SpanStream<'_, TextLocation, TextData>, + redactions: Redactions, ) -> Result<(), Error> { - let edits: Vec<_> = edits.collect().await; - if edits.is_empty() { + if redactions.is_empty() { return Ok(()); } - let offsets = self.page_offsets(); - // Map byte offsets to page indices. 
- let mut page_edits: Vec<(usize, String)> = Vec::new(); - for edit in &edits { - let page_idx = offsets + // Compute new page texts by applying redactions to current values. + let mut page_updates: Vec<(usize, String)> = Vec::new(); + for (loc, items) in redactions { + let Some(page_idx) = offsets .iter() - .position(|&(start, _, _)| start == edit.id.start_offset) - .ok_or_else(|| { - Error::validation( - format!("no page at byte offset {}", edit.id.start_offset), - "rich-text-handler", - ) - })?; - page_edits.push((page_idx, edit.data.as_str().to_owned())); + .position(|&(start, _, _)| start == loc.start_offset) + else { + continue; + }; + let mut content = self.pages[page_idx].clone(); + apply_text_redactions(&mut content, &items, TARGET)?; + page_updates.push((page_idx, content)); } - // PDF-specific: apply replacements to content streams. + // PDF-specific: bake replacements into content streams. if self.document_type == DocumentType::Pdf { let mut doc = lopdf::Document::load_mem(&self.raw).map_err(|e| { Error::runtime( format!("failed to load PDF for editing: {e}"), - "rich-text-handler", + TARGET, false, ) })?; - for &(idx, ref new_text) in &page_edits { - let old_text = &self.pages[idx]; + for (idx, new_text) in &page_updates { + let old_text = &self.pages[*idx]; if !old_text.is_empty() && old_text != new_text { - let _ = doc.replace_text((idx as u32) + 1, old_text, new_text, None); + let _ = doc.replace_text((*idx as u32) + 1, old_text, new_text, None); } - self.pages[idx] = new_text.clone(); } let mut buf = Vec::new(); doc.save_to(&mut buf).map_err(|e| { - Error::runtime( - format!("failed to save edited PDF: {e}"), - "rich-text-handler", - false, - ) + Error::runtime(format!("failed to save edited PDF: {e}"), TARGET, false) })?; self.raw = Bytes::from(buf); - } else { - for (idx, new_text) in page_edits { - self.pages[idx] = new_text; - } } - Ok(()) - } + for (idx, new_text) in page_updates { + self.pages[idx] = new_text; + } - async fn value_at(&self, 
location: &TextLocation) -> Option { - let offsets = self.page_offsets(); - let page_idx = offsets - .iter() - .position(|&(start, _, _)| start == location.start_offset)?; - self.pages.get(page_idx).cloned() + Ok(()) } } #[async_trait::async_trait] impl ImageHandler for RichTextHandler { - async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData> { + fn locations(&self) -> LocationStream<'_, ImageLocation> { + let source = self.source; let images = match PdfRenderer::extract_images(&self.raw) { Ok(imgs) => imgs, Err(e) => { @@ -261,36 +254,38 @@ impl ImageHandler for RichTextHandler { error = %e, "failed to extract embedded images", ); - return SpanStream::new(futures::stream::empty()); + return LocationStream::empty(); } }; - SpanStream::new(futures::stream::iter(images.into_iter().enumerate().map( - |(i, data)| { - // Embedded image bounding box — exact position within - // the page requires PDF content stream parsing. For now, - // use a full-page placeholder that identifies the page. - let location = ImageLocation { - bounding_box: nvisy_ontology::primitive::BoundingBox::default(), - image_id: None, - page_number: Some((i + 1) as u32), - }; - Span::new(location, data) - }, - ))) - } - - async fn edit_images( - &mut self, - _edits: SpanStream<'_, ImageLocation, ImageData>, - ) -> Result<(), Error> { - Ok(()) + let items: Vec<_> = images + .into_iter() + .enumerate() + .map(|(i, _data)| { + Located::new( + source, + ImageLocation { + bounding_box: nvisy_ontology::primitive::BoundingBox::default(), + image_id: None, + page_number: Some((i + 1) as u32), + }, + ) + }) + .collect(); + LocationStream::new(futures::stream::iter(items)) } - async fn value_at(&self, _location: &ImageLocation) -> Option { + async fn read(&self, _location: &ImageLocation) -> Option { // Cropping embedded PDF images by bounding box is not yet // implemented. Requires re-rendering the page region. 
None } + + async fn redact( + &mut self, + _redactions: Redactions, + ) -> Result<(), Error> { + Ok(()) + } } #[cfg(test)] @@ -310,24 +305,33 @@ mod tests { } #[tokio::test] - async fn view_spans_yields_one_per_page() { + async fn locations_yields_one_per_page() { let h = handler(&["page one", "page two", "page three"]); - let spans: Vec<_> = h.text_spans().await.collect().await; - - assert_eq!(spans.len(), 3); - assert_eq!(spans[0].id.page_number, Some(1)); - assert_eq!(spans[0].data, "page one"); - assert_eq!(spans[1].id.page_number, Some(2)); - assert_eq!(spans[1].data, "page two"); - assert_eq!(spans[2].id.page_number, Some(3)); - assert_eq!(spans[2].data, "page three"); + let items: Vec<_> = TextHandler::locations(&h).collect().await; + assert_eq!(items.len(), 3); + assert_eq!(items[0].location.page_number, Some(1)); + assert_eq!(items[1].location.page_number, Some(2)); + assert_eq!(items[2].location.page_number, Some(3)); } #[tokio::test] - async fn view_spans_empty_document() { + async fn locations_empty_document() { let h = handler(&[]); - let spans: Vec<_> = h.text_spans().await.collect().await; - assert!(spans.is_empty()); + let items: Vec<_> = TextHandler::locations(&h).collect().await; + assert!(items.is_empty()); + } + + #[tokio::test] + async fn read_returns_page_text() { + let h = handler(&["page one", "page two"]); + let items: Vec<_> = TextHandler::locations(&h).collect().await; + assert_eq!( + TextHandler::read(&h, &items[0].location) + .await + .unwrap() + .as_str(), + "page one" + ); } #[test] diff --git a/crates/nvisy-codec/src/handler/rich/pdf_loader.rs b/crates/nvisy-codec/src/handler/rich/pdf_loader.rs index 49e4109b..fe5e71af 100644 --- a/crates/nvisy-codec/src/handler/rich/pdf_loader.rs +++ b/crates/nvisy-codec/src/handler/rich/pdf_loader.rs @@ -137,7 +137,7 @@ mod tests { } #[tokio::test] - async fn view_spans_matches_pages() { + async fn locations_matches_pages() { let raw = minimal_pdf(); let content = content_from_bytes(&raw); let doc 
= PdfLoader @@ -145,7 +145,7 @@ mod tests { .await .unwrap(); - let spans: Vec<_> = doc.text_spans().await.collect().await; - assert_eq!(spans.len(), doc.page_count()); + let items: Vec<_> = TextHandler::locations(&doc).collect().await; + assert_eq!(items.len(), doc.page_count()); } } diff --git a/crates/nvisy-codec/src/handler/text/csv_handler.rs b/crates/nvisy-codec/src/handler/text/csv_handler.rs index abf2d8ab..67c60ba1 100644 --- a/crates/nvisy-codec/src/handler/text/csv_handler.rs +++ b/crates/nvisy-codec/src/handler/text/csv_handler.rs @@ -1,34 +1,22 @@ -//! CSV handler: holds parsed CSV content and provides span-based +//! CSV handler: holds parsed CSV content and provides location-based //! access via [`Handler`] + [`TextHandler`]. //! -//! The handler stores the parsed rows (and optional headers) together -//! with the detected delimiter so the file can be reconstructed after -//! edits. -//! -//! # Span model -//! -//! [`TextHandler::text_spans`] yields one [`Span`] per cell. If headers -//! are present, header cells are emitted first, followed by data cells -//! in row-major order. Each span is addressed by a [`TextLocation`] -//! with byte offsets computed from the **serialized** CSV form, -//! correctly accounting for quoted/escaped fields. -//! -//! # Offset semantics -//! -//! Offsets are into the serialized CSV string (after CRLF→LF -//! normalization). Quoted fields include the quote characters in their -//! offset range. The `value` field on [`TextLocation`] carries the -//! unescaped cell content. +//! [`TextHandler::locations`] yields one location per cell, ordered +//! header-then-row-major. Each location's byte offsets address the +//! field in the **serialized** CSV form (quoted/escaped if necessary). +//! [`TextHandler::read`] returns the unescaped cell value. 
-use futures::StreamExt; use nvisy_core::Error; use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::{DocumentType, SpreadsheetFormat}; use nvisy_ontology::entity::TextLocation; -use crate::document::{Span, SpanStream}; +use crate::document::{Located, LocationStream}; use crate::handler::text::TextData; use crate::handler::{Handler, TextHandler}; +use crate::transform::{Redactions, TextRedaction, apply_text_redactions}; + +const TARGET: &str = "csv-handler"; /// Parsed CSV content. #[derive(Debug, Clone)] @@ -81,77 +69,78 @@ impl Handler for CsvHandler { #[async_trait::async_trait] impl TextHandler for CsvHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - let cells = self.locate_cells(); + fn locations(&self) -> LocationStream<'_, TextLocation> { let source = self.source; - let spans: Vec<_> = cells + let cells = self.locate_cells(); + let items: Vec<_> = cells .into_iter() .map(|c| { - Span::new( + Located::new( + source, TextLocation { start_offset: c.start, end_offset: c.end, line_number: Some(c.line_number), ..Default::default() }, - TextData::from(c.value), ) - .with_source(source) }) .collect(); - SpanStream::new(futures::stream::iter(spans)) + LocationStream::new(futures::stream::iter(items)) } - async fn edit_text( + async fn read(&self, location: &TextLocation) -> Option { + self.locate_cells() + .into_iter() + .find(|c| c.start == location.start_offset && c.end == location.end_offset) + .map(|c| TextData::from(c.value)) + } + + async fn redact( &mut self, - edits: SpanStream<'_, TextLocation, TextData>, + redactions: Redactions, ) -> Result<(), Error> { - let edits: Vec<_> = edits.collect().await; + if redactions.is_empty() { + return Ok(()); + } let cells = self.locate_cells(); - for edit in edits { - let cell = cells + + let mut updates: Vec<(bool, usize, usize, String)> = Vec::new(); + for (loc, items) in redactions { + let Some(cell) = cells .iter() - .find(|c| c.start == edit.id.start_offset 
&& c.end == edit.id.end_offset) - .ok_or_else(|| { - Error::validation( - format!( - "no cell at byte offset {}..{}", - edit.id.start_offset, edit.id.end_offset - ), - "csv-handler", - ) - })?; + .find(|c| c.start == loc.start_offset && c.end == loc.end_offset) + else { + continue; + }; + let mut value = cell.value.clone(); + apply_text_redactions(&mut value, &items, TARGET)?; + updates.push((cell.is_header, cell.row, cell.col, value)); + } - if cell.is_header { + for (is_header, row, col, new_value) in updates { + if is_header { let headers = self .data .headers .as_mut() - .ok_or_else(|| Error::validation("no headers to edit", "csv-handler"))?; - headers[cell.col] = edit.data.into_inner(); + .ok_or_else(|| Error::validation("no headers to edit", TARGET))?; + headers[col] = new_value; } else { - let row = self.data.rows.get_mut(cell.row).ok_or_else(|| { - Error::validation(format!("row {} out of bounds", cell.row), "csv-handler") + let row_vec = self.data.rows.get_mut(row).ok_or_else(|| { + Error::validation(format!("row {row} out of bounds"), TARGET) })?; - let target = row.get_mut(cell.col).ok_or_else(|| { + let target = row_vec.get_mut(col).ok_or_else(|| { Error::validation( - format!("col {} out of bounds in row {}", cell.col, cell.row), - "csv-handler", + format!("col {col} out of bounds in row {row}"), + TARGET, ) })?; - *target = edit.data.into_inner(); + *target = new_value; } } Ok(()) } - - async fn value_at(&self, location: &TextLocation) -> Option { - let cells = self.locate_cells(); - cells - .iter() - .find(|c| c.start == location.start_offset && c.end == location.end_offset) - .map(|c| c.value.clone()) - } } impl CsvHandler { @@ -228,15 +217,15 @@ impl CsvHandler { if let Some(headers) = &self.data.headers { wtr.write_record(headers) - .map_err(|e| Error::validation(format!("CSV encode error: {e}"), "csv-handler"))?; + .map_err(|e| Error::validation(format!("CSV encode error: {e}"), TARGET))?; } for row in &self.data.rows { wtr.write_record(row) - 
.map_err(|e| Error::validation(format!("CSV encode error: {e}"), "csv-handler"))?; + .map_err(|e| Error::validation(format!("CSV encode error: {e}"), TARGET))?; } let mut bytes = wtr .into_inner() - .map_err(|e| Error::validation(format!("CSV encode error: {e}"), "csv-handler"))?; + .map_err(|e| Error::validation(format!("CSV encode error: {e}"), TARGET))?; bytes.retain(|&b| b != b'\r'); @@ -248,10 +237,6 @@ impl CsvHandler { } /// Locate all cells by serializing and finding field boundaries. - /// - /// Serializes the CSV once, splits by newlines, and uses a - /// field-position parser on each line that correctly handles - /// quoted/escaped fields. fn locate_cells(&self) -> Vec { let bytes = match self.serialize_bytes() { Ok(b) => b, @@ -318,11 +303,9 @@ fn find_field_in_line(line: &str, delimiter: u8, target_col: usize) -> Option<(u while pos < line.len() && col <= target_col { if col == target_col { - // Found the target column. if line[pos..].starts_with('"') { - // Quoted field — find closing quote. let content_start = pos; - pos += 1; // skip opening quote + pos += 1; loop { if pos >= line.len() { break; @@ -330,9 +313,9 @@ fn find_field_in_line(line: &str, delimiter: u8, target_col: usize) -> Option<(u if line.as_bytes()[pos] == b'"' { pos += 1; if pos < line.len() && line.as_bytes()[pos] == b'"' { - pos += 1; // escaped quote + pos += 1; } else { - break; // closing quote + break; } } else { pos += 1; @@ -340,7 +323,6 @@ fn find_field_in_line(line: &str, delimiter: u8, target_col: usize) -> Option<(u } return Some((content_start, pos)); } else { - // Unquoted field — find delimiter or end of line. let start = pos; while pos < line.len() && line.as_bytes()[pos] != delimiter @@ -352,7 +334,6 @@ fn find_field_in_line(line: &str, delimiter: u8, target_col: usize) -> Option<(u } } - // Skip to next field. 
if line[pos..].starts_with('"') { pos += 1; loop { @@ -380,7 +361,7 @@ fn find_field_in_line(line: &str, delimiter: u8, target_col: usize) -> Option<(u } if pos < line.len() && line.as_bytes()[pos] == delimiter { - pos += 1; // skip delimiter + pos += 1; } col += 1; } @@ -394,8 +375,7 @@ mod tests { use nvisy_core::Error; use super::*; - use crate::document::Span; - use crate::handler::TextHandler; + use crate::transform::{ConflictPolicy, TextOutput}; fn handler_with_headers(headers: Vec<&str>, rows: Vec>) -> CsvHandler { CsvHandler::new(CsvData { @@ -422,83 +402,80 @@ mod tests { } #[tokio::test] - async fn view_spans_with_headers() { + async fn locations_with_headers() { let h = handler_with_headers( vec!["name", "age"], vec![vec!["Alice", "30"], vec!["Bob", "25"]], ); - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans.len(), 6); - assert_eq!(spans[0].data, "name"); - assert_eq!(spans[1].data, "age"); - assert_eq!(spans[2].data, "Alice"); - assert_eq!(spans[3].data, "30"); - assert_eq!(spans[4].data, "Bob"); - assert_eq!(spans[5].data, "25"); + let items: Vec<_> = h.locations().collect().await; + assert_eq!(items.len(), 6); } #[tokio::test] - async fn view_spans_no_headers() { + async fn locations_no_headers() { let h = handler_no_headers(vec![vec!["x", "y"], vec!["1", "2"]]); - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans.len(), 4); - assert_eq!(spans[0].data, "x"); + let items: Vec<_> = h.locations().collect().await; + assert_eq!(items.len(), 4); } #[tokio::test] - async fn edit_cell() -> Result<(), Error> { + async fn redact_cell() -> Result<(), Error> { let mut h = handler_with_headers(vec!["ssn"], vec![vec!["123-45-6789"]]); - let spans: Vec<_> = h.text_spans().await.collect().await; - let data_loc = spans[1].id.clone(); - h.edit_text(SpanStream::new(futures::stream::iter(vec![Span::new( + let items: Vec<_> = h.locations().collect().await; + let data_loc = items[1].location.clone(); + let mut rs = 
Redactions::new(ConflictPolicy::Reject); + rs.try_insert( data_loc, - "[REDACTED]".into(), - )]))) - .await?; + TextRedaction::new(0, 11, TextOutput::replace("[REDACTED]")), + ) + .unwrap(); + h.redact(rs).await?; assert_eq!(h.cell(0, 0), Some("[REDACTED]")); Ok(()) } #[tokio::test] - async fn edit_header() -> Result<(), Error> { + async fn redact_header() -> Result<(), Error> { let mut h = handler_with_headers(vec!["secret_field"], vec![vec!["value"]]); - let spans: Vec<_> = h.text_spans().await.collect().await; - let header_loc = spans[0].id.clone(); - h.edit_text(SpanStream::new(futures::stream::iter(vec![Span::new( + let items: Vec<_> = h.locations().collect().await; + let header_loc = items[0].location.clone(); + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( header_loc, - "redacted".into(), - )]))) - .await?; + TextRedaction::new(0, 12, TextOutput::replace("redacted")), + ) + .unwrap(); + h.redact(rs).await?; assert_eq!(h.headers(), Some(["redacted".to_string()].as_slice())); Ok(()) } #[tokio::test] - async fn value_at_returns_cell() { + async fn read_returns_cell() { let h = handler_with_headers(vec!["name"], vec![vec!["Alice"]]); - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(h.value_at(&spans[1].id).await, Some("Alice".to_string())); + let items: Vec<_> = h.locations().collect().await; + assert_eq!( + h.read(&items[1].location).await.unwrap().as_str(), + "Alice" + ); } #[tokio::test] async fn quoted_field_offsets_correct() { let h = handler_with_headers(vec!["bio"], vec![vec!["has, comma"]]); - let spans: Vec<_> = h.text_spans().await.collect().await; - let bio_span = spans.iter().find(|s| s.data.as_str() == "has, comma"); - assert!(bio_span.is_some(), "should find quoted field"); - let loc = &bio_span.unwrap().id; - // Offsets should include the quotes in the serialized form. 
- assert!(loc.end_offset > loc.start_offset); - assert_eq!(h.value_at(loc).await, Some("has, comma".to_string())); + let items: Vec<_> = h.locations().collect().await; + // Header + data cell. + assert_eq!(items.len(), 2); + let data_loc = &items[1].location; + assert!(data_loc.end_offset > data_loc.start_offset); + assert_eq!(h.read(data_loc).await.unwrap().as_str(), "has, comma"); } #[tokio::test] async fn empty_data_with_headers() { let h = handler_with_headers(vec!["a", "b"], vec![]); - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans.len(), 2); - assert_eq!(spans[0].data, "a"); - assert_eq!(spans[1].data, "b"); + let items: Vec<_> = h.locations().collect().await; + assert_eq!(items.len(), 2); } #[test] diff --git a/crates/nvisy-codec/src/handler/text/csv_loader.rs b/crates/nvisy-codec/src/handler/text/csv_loader.rs index 90c5d526..df25da0b 100644 --- a/crates/nvisy-codec/src/handler/text/csv_loader.rs +++ b/crates/nvisy-codec/src/handler/text/csv_loader.rs @@ -207,17 +207,17 @@ mod tests { } #[tokio::test] - async fn load_spans_round_trip() -> Result<(), Error> { + async fn load_locations_round_trip() -> Result<(), Error> { let content = content_from_str("name,age\nAlice,30\n"); let doc = CsvLoader.decode(&content, &CsvParams::default()).await?; - let spans: Vec<_> = doc.text_spans().await.collect().await; + let items: Vec<_> = doc.locations().collect().await; // 2 header + 2 data - assert_eq!(spans.len(), 4); - assert_eq!(spans[0].data, "name"); - assert_eq!(spans[1].data, "age"); - assert_eq!(spans[2].data, "Alice"); - assert_eq!(spans[3].data, "30"); + assert_eq!(items.len(), 4); + assert_eq!(doc.read(&items[0].location).await.unwrap().as_str(), "name"); + assert_eq!(doc.read(&items[1].location).await.unwrap().as_str(), "age"); + assert_eq!(doc.read(&items[2].location).await.unwrap().as_str(), "Alice"); + assert_eq!(doc.read(&items[3].location).await.unwrap().as_str(), "30"); Ok(()) } diff --git 
a/crates/nvisy-codec/src/handler/text/html_handler.rs b/crates/nvisy-codec/src/handler/text/html_handler.rs index 37f6dfbf..9607c434 100644 --- a/crates/nvisy-codec/src/handler/text/html_handler.rs +++ b/crates/nvisy-codec/src/handler/text/html_handler.rs @@ -1,30 +1,23 @@ -//! HTML handler: holds parsed HTML content and provides span-based +//! HTML handler: holds parsed HTML content and provides location-based //! access via [`Handler`] + [`TextHandler`]. //! -//! The handler stores extracted text nodes so the content can be -//! inspected and edited without holding the full DOM. -//! -//! # Span model -//! -//! [`TextHandler::text_spans`] yields one [`Span`] per text node in -//! document order, addressed by [`TextLocation`] with byte offsets -//! computed from cumulative text node lengths. -//! -//! # Encoding -//! -//! [`Handler::encode`] reconstructs the HTML by re-parsing the -//! original source into a DOM, applying edits via direct node -//! mutation, and serializing back with [`scraper::Html::html`]. +//! [`TextHandler::locations`] yields one location per text node in +//! document order; offsets are cumulative over the text-node sequence +//! (not raw HTML bytes). [`Handler::encode`] reconstructs the HTML by +//! re-parsing the original source into a DOM, applying mutations, and +//! serializing back with [`scraper::Html::html`]. -use futures::StreamExt; use nvisy_core::Error; use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::DocumentType; use nvisy_ontology::entity::TextLocation; -use crate::document::{Span, SpanStream}; +use crate::document::{Located, LocationStream}; use crate::handler::text::TextData; use crate::handler::{Handler, TextHandler}; +use crate::transform::{Redactions, TextRedaction, apply_text_redactions}; + +const TARGET: &str = "html-handler"; /// Parsed HTML content stored as extracted text nodes. 
#[derive(Debug, Clone)] @@ -81,58 +74,53 @@ impl Handler for HtmlHandler { #[async_trait::async_trait] impl TextHandler for HtmlHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - let mut spans = Vec::with_capacity(self.data.text_nodes.len()); + fn locations(&self) -> LocationStream<'_, TextLocation> { + let source = self.source; + let mut items = Vec::with_capacity(self.data.text_nodes.len()); let mut offset = 0usize; - for text in &self.data.text_nodes { let start = offset; let end = start + text.len(); - spans.push( - Span::new( - TextLocation { - start_offset: start, - end_offset: end, - ..Default::default() - }, - TextData::from(text.clone()), - ) - .with_source(self.source), - ); + items.push(Located::new( + source, + TextLocation { + start_offset: start, + end_offset: end, + ..Default::default() + }, + )); offset = end; } + LocationStream::new(futures::stream::iter(items)) + } - SpanStream::new(futures::stream::iter(spans)) + async fn read(&self, location: &TextLocation) -> Option { + let offsets = self.node_offsets(); + let idx = offsets + .iter() + .position(|&(start, _)| start == location.start_offset)?; + self.data.text_nodes.get(idx).cloned().map(TextData::from) } - async fn edit_text( + async fn redact( &mut self, - edits: SpanStream<'_, TextLocation, TextData>, + redactions: Redactions, ) -> Result<(), Error> { - let edits: Vec<_> = edits.collect().await; + if redactions.is_empty() { + return Ok(()); + } let offsets = self.node_offsets(); - for edit in edits { - let idx = offsets + for (loc, items) in redactions { + let Some(idx) = offsets .iter() - .position(|&(start, _)| start == edit.id.start_offset) - .ok_or_else(|| { - Error::validation( - format!("no text node at byte offset {}", edit.id.start_offset), - "html-handler", - ) - })?; - self.data.text_nodes[idx] = edit.data.into_inner(); + .position(|&(start, _)| start == loc.start_offset) + else { + continue; + }; + apply_text_redactions(&mut 
self.data.text_nodes[idx], &items, TARGET)?; } Ok(()) } - - async fn value_at(&self, location: &TextLocation) -> Option { - let offsets = self.node_offsets(); - let idx = offsets - .iter() - .position(|&(start, _)| start == location.start_offset)?; - self.data.text_nodes.get(idx).cloned() - } } impl HtmlHandler { @@ -197,11 +185,11 @@ impl HtmlHandler { #[cfg(test)] mod tests { + use futures::StreamExt; use nvisy_core::Error; use super::*; - use crate::document::Span; - use crate::handler::{Handler, TextHandler}; + use crate::transform::{ConflictPolicy, TextOutput}; fn handler_from_html(raw: &str) -> HtmlHandler { let dom = scraper::Html::parse_document(raw); @@ -232,15 +220,17 @@ mod tests { } #[tokio::test] - async fn encode_after_edit() -> Result<(), Error> { + async fn encode_after_redact() -> Result<(), Error> { let raw = "

Hello

World

"; let mut h = handler_from_html(raw); - let spans: Vec<_> = h.text_spans().await.collect().await; - h.edit_text(SpanStream::new(futures::stream::iter(vec![Span::new( - spans[0].id.clone(), - "[REDACTED]".into(), - )]))) - .await?; + let items: Vec<_> = h.locations().collect().await; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + items[0].location.clone(), + TextRedaction::new(0, 5, TextOutput::replace("[REDACTED]")), + ) + .unwrap(); + h.redact(rs).await?; let result = h.encode()?.as_str().unwrap().to_owned(); assert!(result.contains("[REDACTED]")); assert!(result.contains("World")); @@ -248,18 +238,21 @@ mod tests { } #[tokio::test] - async fn view_spans_returns_text() { - let h = handler_from_html("

Alpha

Beta

"); - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(spans.len(), 2); - assert_eq!(spans[0].data, "Alpha"); - assert_eq!(spans[1].data, "Beta"); + async fn locations_returns_text_nodes() { + let h = handler_from_html( + "

Alpha

Beta

", + ); + let items: Vec<_> = h.locations().collect().await; + assert_eq!(items.len(), 2); } #[tokio::test] - async fn value_at_returns_text_node() { + async fn read_returns_text_node() { let h = handler_from_html("

Hello

"); - let spans: Vec<_> = h.text_spans().await.collect().await; - assert_eq!(h.value_at(&spans[0].id).await, Some("Hello".to_string())); + let items: Vec<_> = h.locations().collect().await; + assert_eq!( + h.read(&items[0].location).await.unwrap().as_str(), + "Hello" + ); } } diff --git a/crates/nvisy-codec/src/handler/text/json_handler.rs b/crates/nvisy-codec/src/handler/text/json_handler.rs index aa8f879a..c4e43771 100644 --- a/crates/nvisy-codec/src/handler/text/json_handler.rs +++ b/crates/nvisy-codec/src/handler/text/json_handler.rs @@ -1,38 +1,35 @@ -//! JSON handler: holds parsed JSON content and provides span-based +//! JSON handler: holds parsed JSON content and provides location-based //! access via [`Handler`] + [`TextHandler`]. //! //! The handler stores the parsed [`serde_json::Value`] tree together //! with formatting metadata captured during loading, so the original //! file can be reconstructed with identical whitespace after edits. //! -//! # Span model +//! [`TextHandler::locations`] yields string-typed JSON leaves and +//! object keys, addressed by [`TextLocation`]. Byte offsets correspond +//! to positions within the serialized JSON string and are computed via +//! monotonic cursor advancement during tree traversal to avoid +//! ambiguity from duplicate values. //! -//! The [`TextHandler`] implementation yields string-typed JSON leaves -//! and object keys as text spans addressed by [`TextLocation`]. Byte -//! offsets correspond to positions within the serialized JSON string -//! and are computed via monotonic cursor advancement during tree -//! traversal to avoid ambiguity from duplicate values. -//! -//! # Offset semantics -//! -//! Offsets are into the **serialized** form of the JSON document, -//! including quotes and escape sequences. The `value` field on -//! [`TextLocation`] carries the unescaped string content. +//! Offsets are into the **serialized** form (including quotes and +//! escapes). 
[`TextHandler::read`] returns the unescaped string value +//! at a location. use std::num::NonZeroU32; -use futures::StreamExt; use nvisy_core::Error; use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::{DocumentType, TextFormat}; use nvisy_ontology::entity::TextLocation; use serde::{Deserialize, Serialize}; -use crate::document::{Span, SpanStream}; +use crate::document::{Located, LocationStream}; use crate::handler::text::TextData; use crate::handler::{Handler, TextHandler}; +use crate::transform::{Redactions, TextRedaction, apply_text_redactions}; const DEFAULT_INDENT: NonZeroU32 = NonZeroU32::new(2).unwrap(); +const TARGET: &str = "json-handler"; /// [RFC 6901] JSON Pointer identifying a span within a JSON document. /// @@ -120,92 +117,81 @@ struct LocatedSpan { #[async_trait::async_trait] impl TextHandler for JsonHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - let located = self.locate_spans(); + fn locations(&self) -> LocationStream<'_, TextLocation> { let source = self.source; - let spans: Vec<_> = located + let items: Vec<_> = self + .locate_spans() .into_iter() .map(|ls| { - Span::new( + Located::new( + source, TextLocation { start_offset: ls.start, end_offset: ls.end, ..Default::default() }, - TextData::from(ls.text), ) - .with_source(source) }) .collect(); - SpanStream::new(futures::stream::iter(spans)) + LocationStream::new(futures::stream::iter(items)) + } + + async fn read(&self, location: &TextLocation) -> Option { + self.locate_spans() + .into_iter() + .find(|ls| ls.start == location.start_offset && ls.end == location.end_offset) + .map(|ls| TextData::from(ls.text)) } - async fn edit_text( + async fn redact( &mut self, - edits: SpanStream<'_, TextLocation, TextData>, + redactions: Redactions, ) -> Result<(), Error> { - let edits: Vec<_> = edits.collect().await; + if redactions.is_empty() { + return Ok(()); + } let located = self.locate_spans(); - let mut value_edits = Vec::new(); - let 
mut key_edits = Vec::new(); + let mut value_updates = Vec::new(); + let mut key_updates = Vec::new(); - for edit in &edits { - let ls = located + for (loc, items) in redactions { + let Some(ls) = located .iter() - .find(|ls| ls.start == edit.id.start_offset && ls.end == edit.id.end_offset) - .ok_or_else(|| { - Error::validation( - format!( - "no JSON value at byte offset {}..{}", - edit.id.start_offset, edit.id.end_offset - ), - "json-handler", - ) - })?; - + .find(|ls| ls.start == loc.start_offset && ls.end == loc.end_offset) + else { + continue; + }; + let mut content = ls.text.clone(); + apply_text_redactions(&mut content, &items, TARGET)?; if ls.path.key_of { - key_edits.push((&ls.path, edit.data.clone())); + key_updates.push((ls.path.clone(), content)); } else { - value_edits.push((&ls.path, edit.data.clone())); + value_updates.push((ls.path.clone(), content)); } } - for (path, data) in &value_edits { + for (path, new_value) in value_updates { let target = self.data.value.pointer_mut(&path.pointer).ok_or_else(|| { Error::validation( format!("JSON pointer not found: {}", path.pointer), - "json-handler", + TARGET, ) })?; if target.is_string() { - *target = serde_json::Value::String(data.clone().into_inner()); + *target = serde_json::Value::String(new_value); } else { - let text = data.clone().into_inner(); - *target = serde_json::from_str(&text).unwrap_or(serde_json::Value::String(text)); + *target = serde_json::from_str(&new_value) + .unwrap_or(serde_json::Value::String(new_value)); } } - for (path, data) in &key_edits { - rename_key( - &mut self.data.value, - &path.pointer, - &serde_json::Value::String(data.as_str().to_owned()), - )?; + for (path, new_key) in key_updates { + rename_key(&mut self.data.value, &path.pointer, &new_key)?; } Ok(()) } - - async fn value_at(&self, location: &TextLocation) -> Option { - // Validate the offset range against known spans to ensure - // we return a structurally valid value. 
- let located = self.locate_spans(); - located - .iter() - .find(|ls| ls.start == location.start_offset && ls.end == location.end_offset) - .map(|ls| ls.text.clone()) - } } impl Handler for JsonHandler { @@ -278,9 +264,6 @@ impl JsonHandler { other => other.to_string(), }; - // Search for the quoted string or raw value starting from - // the cursor position. This ensures each match advances - // monotonically, resolving duplicate-value ambiguity. let needle = if ts.value.is_string() { format!("\"{}\"", json_escape(&text)) } else { @@ -312,24 +295,22 @@ impl JsonHandler { fn serialize_to_bytes(&self) -> Result, Error> { match self.data.indent { JsonIndent::Compact => serde_json::to_vec(&self.data.value) - .map_err(|e| Error::validation(format!("JSON encode error: {e}"), "json-handler")), + .map_err(|e| Error::validation(format!("JSON encode error: {e}"), TARGET)), JsonIndent::Spaces(n) => { let indent = " ".repeat(n.get() as usize); let mut buf = Vec::new(); let formatter = serde_json::ser::PrettyFormatter::with_indent(indent.as_bytes()); let mut ser = serde_json::Serializer::with_formatter(&mut buf, formatter); - serde::Serialize::serialize(&self.data.value, &mut ser).map_err(|e| { - Error::validation(format!("JSON encode error: {e}"), "json-handler") - })?; + serde::Serialize::serialize(&self.data.value, &mut ser) + .map_err(|e| Error::validation(format!("JSON encode error: {e}"), TARGET))?; Ok(buf) } JsonIndent::Tab => { let mut buf = Vec::new(); let formatter = serde_json::ser::PrettyFormatter::with_indent(b"\t"); let mut ser = serde_json::Serializer::with_formatter(&mut buf, formatter); - serde::Serialize::serialize(&self.data.value, &mut ser).map_err(|e| { - Error::validation(format!("JSON encode error: {e}"), "json-handler") - })?; + serde::Serialize::serialize(&self.data.value, &mut ser) + .map_err(|e| Error::validation(format!("JSON encode error: {e}"), TARGET))?; Ok(buf) } } @@ -403,14 +384,10 @@ impl Iterator for JsonSpanIter { fn rename_key( root: &mut 
serde_json::Value, pointer: &str, - new_key_value: &serde_json::Value, + new_key: &str, ) -> Result<(), Error> { - let new_key = new_key_value - .as_str() - .ok_or_else(|| Error::validation("key rename value must be a string", "json-handler"))?; - let (parent_ptr, old_key_segment) = pointer.rsplit_once('/').ok_or_else(|| { - Error::validation(format!("cannot rename root: {pointer}"), "json-handler") + Error::validation(format!("cannot rename root: {pointer}"), TARGET) })?; let old_key = old_key_segment.replace("~1", "/").replace("~0", "~"); @@ -421,14 +398,14 @@ fn rename_key( root.pointer_mut(parent_ptr).ok_or_else(|| { Error::validation( format!("parent pointer not found: {parent_ptr}"), - "json-handler", + TARGET, ) })? }; let obj = parent .as_object_mut() - .ok_or_else(|| Error::validation("parent is not an object", "json-handler"))?; + .ok_or_else(|| Error::validation("parent is not an object", TARGET))?; if let Some(value) = obj.remove(&old_key) { obj.insert(new_key.to_string(), value); @@ -443,7 +420,6 @@ mod tests { use nvisy_core::Error; use super::*; - use crate::handler::TextHandler; fn compact_handler(json: &str) -> JsonHandler { JsonHandler::new(JsonData { @@ -454,55 +430,63 @@ mod tests { } #[tokio::test] - async fn text_spans_yields_string_leaves() { + async fn locations_string_leaves() { let h = compact_handler(r#"{"name":"Alice","age":30}"#); - let spans: Vec<_> = h.text_spans().await.collect().await; - let texts: Vec<_> = spans.iter().map(|s| s.data.as_str()).collect(); - assert!(texts.contains(&"name")); - assert!(texts.contains(&"Alice")); - assert!(texts.contains(&"age")); - assert!(texts.contains(&"30")); + let items: Vec<_> = h.locations().collect().await; + // 2 keys + 2 leaves + assert_eq!(items.len(), 4); } #[tokio::test] async fn duplicate_values_get_distinct_offsets() { let h = compact_handler(r#"{"a":"same","b":"same"}"#); - let spans: Vec<_> = h.text_spans().await.collect().await; - let same_spans: Vec<_> = spans.iter().filter(|s| 
s.data.as_str() == "same").collect(); - assert_eq!(same_spans.len(), 2); - assert_ne!( - same_spans[0].id.start_offset, same_spans[1].id.start_offset, - "duplicate values must have distinct offsets" - ); + let mut same_offsets = Vec::new(); + for item in h.locations().collect::>().await { + if let Some(td) = h.read(&item.location).await + && td.as_str() == "same" + { + same_offsets.push(item.location.start_offset); + } + } + assert_eq!(same_offsets.len(), 2); + assert_ne!(same_offsets[0], same_offsets[1]); } #[tokio::test] - async fn value_at_returns_string() { + async fn read_returns_string() { let h = compact_handler(r#"{"name":"Alice"}"#); - let spans: Vec<_> = h.text_spans().await.collect().await; - let alice_span = spans.iter().find(|s| s.data.as_str() == "Alice").unwrap(); - assert_eq!(h.value_at(&alice_span.id).await, Some("Alice".to_string())); + let items: Vec<_> = h.locations().collect().await; + let alice = futures::future::join_all( + items.iter().map(|l| h.read(&l.location)), + ) + .await; + assert!(alice.iter().any(|d| d.as_ref().map(|d| d.as_str()) == Some("Alice"))); } #[tokio::test] - async fn value_at_rejects_arbitrary_offsets() { + async fn read_rejects_arbitrary_offsets() { let h = compact_handler(r#"{"name":"Alice"}"#); let bogus = TextLocation { start_offset: 3, end_offset: 7, ..Default::default() }; - assert_eq!(h.value_at(&bogus).await, None); + assert!(h.read(&bogus).await.is_none()); } #[tokio::test] async fn nested_structure() { let h = compact_handler(r#"{"user":{"name":"Bob","ids":[1,2]}}"#); - let spans: Vec<_> = h.text_spans().await.collect().await; - let texts: Vec<_> = spans.iter().map(|s| s.data.as_str()).collect(); - assert!(texts.contains(&"Bob")); - assert!(texts.contains(&"1")); - assert!(texts.contains(&"2")); + let items: Vec<_> = h.locations().collect().await; + let mut reads = Vec::new(); + for it in &items { + if let Some(td) = h.read(&it.location).await { + reads.push(td.as_str().to_owned()); + } + } + 
assert!(reads.iter().any(|s| s == "Bob")); + assert!(reads.iter().any(|s| s == "1")); + assert!(reads.iter().any(|s| s == "2")); } #[test] diff --git a/crates/nvisy-codec/src/handler/text/txt_handler.rs b/crates/nvisy-codec/src/handler/text/txt_handler.rs index bce5e4dd..98538a16 100644 --- a/crates/nvisy-codec/src/handler/text/txt_handler.rs +++ b/crates/nvisy-codec/src/handler/text/txt_handler.rs @@ -1,28 +1,26 @@ //! Plain-text handler: holds loaded text content and provides -//! span-based access via [`Handler`] + [`TextHandler`]. +//! location-based access via [`Handler`] + [`TextHandler`]. //! //! The handler stores the text as a vector of lines together with a //! trailing-newline flag so the original file can be reconstructed //! byte-for-byte after edits. //! -//! # Span model -//! -//! [`TextHandler::text_spans`] yields one [`Span`] per line. Each span -//! is addressed by a [`TextLocation`] with byte offsets computed from -//! cumulative line lengths. -//! -//! [`TextHandler::edit_text`] replaces the content of lines at the -//! given locations. +//! [`TextHandler::locations`] yields one [`TextLocation`] per line; +//! [`TextHandler::read`] returns the line at a given location; +//! [`TextHandler::redact`] applies redactions in place, mutating the +//! affected lines directly. -use futures::StreamExt; use nvisy_core::Error; use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::{DocumentType, TextFormat}; use nvisy_ontology::entity::TextLocation; -use crate::document::{Span, SpanStream}; +use crate::document::{Located, LocationStream}; use crate::handler::text::TextData; use crate::handler::{Handler, TextHandler}; +use crate::transform::{Redactions, TextRedaction, apply_text_redactions}; + +const TARGET: &str = "txt-handler"; /// Handler for loaded plain-text content. 
/// @@ -58,38 +56,28 @@ impl Handler for TxtHandler { #[async_trait::async_trait] impl TextHandler for TxtHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - SpanStream::new(futures::stream::iter(TxtSpanIter { - lines: &self.lines, - source: self.source, - byte_offset: 0, - index: 0, - })) - } - - async fn edit_text( - &mut self, - edits: SpanStream<'_, TextLocation, TextData>, - ) -> Result<(), Error> { - let edits: Vec<_> = edits.collect().await; - // Map each edit's byte offset range to a line index and apply. - let offsets = self.line_offsets(); - for edit in edits { - let line_idx = offsets - .iter() - .position(|&(start, _)| start == edit.id.start_offset) - .ok_or_else(|| { - Error::validation( - format!("no line starts at byte offset {}", edit.id.start_offset), - "txt-handler", - ) - })?; - self.lines[line_idx] = edit.data.into_inner(); - } - Ok(()) + fn locations(&self) -> LocationStream<'_, TextLocation> { + let source = self.source; + let items: Vec<_> = self + .line_offsets() + .into_iter() + .enumerate() + .map(|(i, (start, end))| { + Located::new( + source, + TextLocation { + start_offset: start, + end_offset: end, + line_number: Some((i + 1) as u32), + ..Default::default() + }, + ) + }) + .collect(); + LocationStream::new(futures::stream::iter(items)) } - async fn value_at(&self, location: &TextLocation) -> Option { + async fn read(&self, location: &TextLocation) -> Option { let offsets = self.line_offsets(); let line_idx = offsets.iter().position(|&(start, end)| { location.start_offset >= start && location.end_offset <= end @@ -98,7 +86,27 @@ impl TextHandler for TxtHandler { let line_start = offsets[line_idx].0; let local_start = location.start_offset - line_start; let local_end = location.end_offset - line_start; - line.get(local_start..local_end).map(String::from) + line.get(local_start..local_end).map(TextData::from) + } + + async fn redact( + &mut self, + redactions: Redactions, + ) -> Result<(), Error> { + if 
redactions.is_empty() { + return Ok(()); + } + let offsets = self.line_offsets(); + for (loc, items) in redactions { + let Some(line_idx) = offsets + .iter() + .position(|&(start, _)| start == loc.start_offset) + else { + continue; + }; + apply_text_redactions(&mut self.lines[line_idx], &items, TARGET)?; + } + Ok(()) } } @@ -158,53 +166,13 @@ impl TxtHandler { } } -/// Iterator over lines of a plain-text document, producing -/// [`TextLocation`]-addressed spans. -struct TxtSpanIter<'a> { - lines: &'a [String], - source: ContentSource, - byte_offset: usize, - index: usize, -} - -impl<'a> Iterator for TxtSpanIter<'a> { - type Item = Span; - - fn next(&mut self) -> Option { - let line = self.lines.get(self.index)?; - let start = self.byte_offset; - let end = start + line.len(); - - let location = TextLocation { - start_offset: start, - end_offset: end, - line_number: Some((self.index + 1) as u32), - ..Default::default() - }; - let span = Span::new(location, TextData::from(line.clone())).with_source(self.source); - - // Advance past this line + newline separator. 
- self.byte_offset = end + 1; - self.index += 1; - Some(span) - } - - fn size_hint(&self) -> (usize, Option) { - let remaining = self.lines.len() - self.index; - (remaining, Some(remaining)) - } -} - -impl<'a> ExactSizeIterator for TxtSpanIter<'a> {} - #[cfg(test)] mod tests { use futures::StreamExt; use nvisy_core::Error; use super::*; - use crate::document::Span; - use crate::handler::TextHandler; + use crate::transform::{ConflictPolicy, TextOutput}; fn handler(text: &str) -> TxtHandler { let trailing_newline = text.ends_with('\n'); @@ -213,88 +181,114 @@ mod tests { } #[tokio::test] - async fn view_spans_multiline() { + async fn locations_multiline() { let h = handler("hello\nworld\n"); - let spans: Vec<_> = h.text_spans().await.collect().await; - - assert_eq!(spans.len(), 2); - assert_eq!(spans[0].id.start_offset, 0); - assert_eq!(spans[0].id.end_offset, 5); - assert_eq!(spans[0].id.line_number, Some(1)); - assert_eq!(spans[0].data, "hello"); - assert_eq!(spans[1].id.start_offset, 6); - assert_eq!(spans[1].id.end_offset, 11); - assert_eq!(spans[1].id.line_number, Some(2)); - assert_eq!(spans[1].data, "world"); + let items: Vec<_> = h.locations().collect().await; + + assert_eq!(items.len(), 2); + assert_eq!(items[0].location.start_offset, 0); + assert_eq!(items[0].location.end_offset, 5); + assert_eq!(items[0].location.line_number, Some(1)); + assert_eq!(items[1].location.start_offset, 6); + assert_eq!(items[1].location.end_offset, 11); + assert_eq!(items[1].location.line_number, Some(2)); } #[tokio::test] - async fn view_spans_single_line_no_newline() { + async fn locations_single_line_no_newline() { let h = handler("no newline"); - let spans: Vec<_> = h.text_spans().await.collect().await; + let items: Vec<_> = h.locations().collect().await; - assert_eq!(spans.len(), 1); - assert_eq!(spans[0].data, "no newline"); - assert_eq!(spans[0].id.start_offset, 0); - assert_eq!(spans[0].id.end_offset, 10); + assert_eq!(items.len(), 1); + 
assert_eq!(items[0].location.start_offset, 0); + assert_eq!(items[0].location.end_offset, 10); assert!(!h.trailing_newline()); } #[tokio::test] - async fn edit_spans_replace_line() -> Result<(), Error> { - let mut h = handler("hello\nworld\n"); + async fn read_returns_line() { + let h = handler("hello\nworld\n"); let loc = TextLocation { start_offset: 6, end_offset: 11, ..Default::default() }; - h.edit_text(SpanStream::new(futures::stream::iter(vec![Span::new( - loc, - "[REDACTED]".into(), - )]))) - .await?; - assert_eq!(h.lines(), &["hello", "[REDACTED]"]); - Ok(()) + assert_eq!(h.read(&loc).await.unwrap().as_str(), "world"); } #[tokio::test] - async fn edit_spans_bad_offset() { - let mut h = handler("one line"); + async fn read_substring() { + let h = handler("hello world"); let loc = TextLocation { - start_offset: 999, - end_offset: 1000, + start_offset: 6, + end_offset: 11, ..Default::default() }; - let err = h - .edit_text(SpanStream::new(futures::stream::iter(vec![Span::new( - loc, - "nope".into(), - )]))) - .await - .unwrap_err(); - assert!(err.to_string().contains("no line starts at")); + assert_eq!(h.read(&loc).await.unwrap().as_str(), "world"); } #[tokio::test] - async fn value_at_returns_line() { + async fn read_cross_line_returns_none() { let h = handler("hello\nworld\n"); let loc = TextLocation { - start_offset: 6, - end_offset: 11, + start_offset: 3, + end_offset: 8, ..Default::default() }; - assert_eq!(h.value_at(&loc).await, Some("world".to_string())); + assert!(h.read(&loc).await.is_none()); } #[tokio::test] - async fn value_at_substring() { - let h = handler("hello world"); - let loc = TextLocation { - start_offset: 6, - end_offset: 11, - ..Default::default() - }; - assert_eq!(h.value_at(&loc).await, Some("world".to_string())); + async fn redact_replaces_substring() -> Result<(), Error> { + let mut h = handler("hello\nworld\n"); + let items: Vec<_> = h.locations().collect().await; + let mut rs = Redactions::new(ConflictPolicy::Reject); + 
rs.try_insert( + items[1].location.clone(), + TextRedaction::new(0, 5, TextOutput::replace("[REDACTED]")), + ) + .unwrap(); + h.redact(rs).await?; + assert_eq!(h.lines(), &["hello", "[REDACTED]"]); + Ok(()) + } + + #[tokio::test] + async fn redact_multiple_lines() -> Result<(), Error> { + let mut h = handler("aaa\nbbb\nccc\n"); + let items: Vec<_> = h.locations().collect().await; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + items[0].location.clone(), + TextRedaction::new(0, 3, TextOutput::replace("[X]")), + ) + .unwrap(); + rs.try_insert( + items[2].location.clone(), + TextRedaction::new(0, 3, TextOutput::replace("[Y]")), + ) + .unwrap(); + h.redact(rs).await?; + assert_eq!(h.lines(), &["[X]", "bbb", "[Y]"]); + Ok(()) + } + + #[tokio::test] + async fn redact_unknown_location_skipped() -> Result<(), Error> { + let mut h = handler("one line"); + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + TextLocation { + start_offset: 999, + end_offset: 1000, + ..Default::default() + }, + TextRedaction::new(0, 1, TextOutput::replace("nope")), + ) + .unwrap(); + h.redact(rs).await?; + assert_eq!(h.lines(), &["one line"]); + Ok(()) } #[test] @@ -314,34 +308,9 @@ mod tests { } #[tokio::test] - async fn value_at_cross_line_returns_none() { - let h = handler("hello\nworld\n"); - // Offsets spanning two lines should return None. 
- let loc = TextLocation { - start_offset: 3, - end_offset: 8, - ..Default::default() - }; - assert_eq!(h.value_at(&loc).await, None); - } - - #[tokio::test] - async fn edit_multiple_lines() -> Result<(), Error> { - let mut h = handler("aaa\nbbb\nccc\n"); - let spans: Vec<_> = h.text_spans().await.collect().await; - h.edit_text(SpanStream::new(futures::stream::iter(vec![ - Span::new(spans[0].id.clone(), "[X]".into()), - Span::new(spans[2].id.clone(), "[Y]".into()), - ]))) - .await?; - assert_eq!(h.lines(), &["[X]", "bbb", "[Y]"]); - Ok(()) - } - - #[tokio::test] - async fn empty_handler_spans() { + async fn empty_handler_locations() { let h = TxtHandler::new(vec![], false); - let spans: Vec<_> = h.text_spans().await.collect().await; - assert!(spans.is_empty()); + let items: Vec<_> = h.locations().collect().await; + assert!(items.is_empty()); } } diff --git a/crates/nvisy-codec/src/handler/text/txt_loader.rs b/crates/nvisy-codec/src/handler/text/txt_loader.rs index aa074513..ff219ccc 100644 --- a/crates/nvisy-codec/src/handler/text/txt_loader.rs +++ b/crates/nvisy-codec/src/handler/text/txt_loader.rs @@ -85,15 +85,15 @@ mod tests { } #[tokio::test] - async fn load_preserves_spans_through_round_trip() -> Result<(), Error> { + async fn load_preserves_lines_through_round_trip() -> Result<(), Error> { let content = content_from_str("Alice\nBob\nCharlie\n"); let doc = TxtLoader.decode(&content, &TxtParams::default()).await?; - let spans: Vec<_> = doc.text_spans().await.collect().await; - assert_eq!(spans.len(), 3); - assert_eq!(spans[0].data, "Alice"); - assert_eq!(spans[1].data, "Bob"); - assert_eq!(spans[2].data, "Charlie"); + let items: Vec<_> = doc.locations().collect().await; + assert_eq!(items.len(), 3); + assert_eq!(doc.read(&items[0].location).await.unwrap().as_str(), "Alice"); + assert_eq!(doc.read(&items[1].location).await.unwrap().as_str(), "Bob"); + assert_eq!(doc.read(&items[2].location).await.unwrap().as_str(), "Charlie"); Ok(()) } diff --git 
a/crates/nvisy-codec/src/handler/text/xlsx_handler.rs b/crates/nvisy-codec/src/handler/text/xlsx_handler.rs index 864c418d..ac6d0e92 100644 --- a/crates/nvisy-codec/src/handler/text/xlsx_handler.rs +++ b/crates/nvisy-codec/src/handler/text/xlsx_handler.rs @@ -5,9 +5,10 @@ use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::{DocumentType, SpreadsheetFormat}; use nvisy_ontology::entity::TextLocation; -use crate::document::SpanStream; +use crate::document::LocationStream; use crate::handler::text::TextData; use crate::handler::{Handler, TextHandler}; +use crate::transform::{Redactions, TextRedaction}; #[derive(Debug, Default)] pub struct XlsxHandler { @@ -47,18 +48,18 @@ impl Handler for XlsxHandler { #[async_trait::async_trait] impl TextHandler for XlsxHandler { - async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - SpanStream::new(futures::stream::empty()) + fn locations(&self) -> LocationStream<'_, TextLocation> { + LocationStream::empty() } - async fn edit_text( + async fn read(&self, _location: &TextLocation) -> Option { + None + } + + async fn redact( &mut self, - _edits: SpanStream<'_, TextLocation, TextData>, + _redactions: Redactions, ) -> Result<(), Error> { Ok(()) } - - async fn value_at(&self, _location: &TextLocation) -> Option { - None - } } diff --git a/crates/nvisy-codec/src/transform/image/apply.rs b/crates/nvisy-codec/src/transform/image/apply.rs new file mode 100644 index 00000000..7061c7ff --- /dev/null +++ b/crates/nvisy-codec/src/transform/image/apply.rs @@ -0,0 +1,51 @@ +//! Helper for applying a batch of [`ImageRedaction`]s to a +//! [`DynamicImage`] in place. + +use image::DynamicImage; + +use super::instruction::{ImageOutput, ImageRedaction}; +use super::ops::ImageOps; + +const TARGET: &str = "nvisy_codec::transform::image"; + +/// Apply a slice of image redactions to `img` in place. 
+/// +/// Each redaction's bounding box is converted to pixel coordinates and +/// the corresponding output method is applied. Replace outputs whose +/// embedded image data fails to decode are skipped with a warning. +pub(crate) fn apply_image_redactions(img: &mut DynamicImage, redactions: &[ImageRedaction]) { + for redaction in redactions { + let region = redaction.bounding_box.to_pixel(); + match &redaction.output { + ImageOutput::Blur { sigma } => { + img.apply_gaussian_blur(&region, *sigma); + } + ImageOutput::Block { color } => { + img.apply_block_overlay(&region, *color); + } + ImageOutput::Pixelate { block_size } => { + img.apply_pixelate(&region, *block_size); + } + ImageOutput::Replace { data } => { + let replacement = match image::load_from_memory(data) { + Ok(r) => r, + Err(e) => { + tracing::warn!( + target: TARGET, + region = ?region, + error = %e, + "failed to decode replacement image data, skipping region" + ); + continue; + } + }; + let resized = replacement.resize_exact( + region.width, + region.height, + image::imageops::FilterType::Lanczos3, + ); + image::imageops::overlay(img, &resized, region.x as i64, region.y as i64); + } + } + } +} diff --git a/crates/nvisy-codec/src/transform/image/mod.rs b/crates/nvisy-codec/src/transform/image/mod.rs index 38fd933c..2016351f 100644 --- a/crates/nvisy-codec/src/transform/image/mod.rs +++ b/crates/nvisy-codec/src/transform/image/mod.rs @@ -1,7 +1,8 @@ //! Image redaction primitives. 
+mod apply; mod instruction; mod ops; +pub(crate) use self::apply::apply_image_redactions; pub use self::instruction::{ImageOutput, ImageRedaction}; -pub(crate) use self::ops::ImageOps; diff --git a/crates/nvisy-codec/src/transform/mod.rs b/crates/nvisy-codec/src/transform/mod.rs index 3f931666..eee4a1a9 100644 --- a/crates/nvisy-codec/src/transform/mod.rs +++ b/crates/nvisy-codec/src/transform/mod.rs @@ -21,10 +21,11 @@ mod tabular; mod text; pub use self::audio::{AudioOutput, AudioRedaction}; -pub(crate) use self::image::ImageOps; +pub(crate) use self::image::apply_image_redactions; pub use self::image::{ImageOutput, ImageRedaction}; pub use self::mergeable::Mergeable; pub use self::policy::{ConflictPolicy, InsertError}; pub use self::redactions::Redactions; pub use self::tabular::TabularRedaction; +pub(crate) use self::text::apply_text_redactions; pub use self::text::{TextOutput, TextRedaction}; diff --git a/crates/nvisy-codec/src/transform/text/apply.rs b/crates/nvisy-codec/src/transform/text/apply.rs new file mode 100644 index 00000000..62c9369c --- /dev/null +++ b/crates/nvisy-codec/src/transform/text/apply.rs @@ -0,0 +1,102 @@ +//! Byte-level helper for applying a batch of [`TextRedaction`]s +//! to a string in place. + +use std::cmp::Reverse; + +use nvisy_core::Error; + +use super::instruction::TextRedaction; + +/// Apply a slice of redactions to `content` in place. +/// +/// Redactions are sorted right-to-left so that earlier byte offsets +/// remain valid as later ones are replaced. Returns an error if any +/// offset falls mid-character. +/// +/// The slice must not contain overlapping ranges — that invariant is +/// owned by [`Redactions`] on insert. 
+/// +/// [`Redactions`]: crate::transform::Redactions +pub(crate) fn apply_text_redactions( + content: &mut String, + redactions: &[TextRedaction], + target: &'static str, +) -> Result<(), Error> { + let mut items: Vec<&TextRedaction> = redactions.iter().collect(); + items.sort_by_key(|r| Reverse(r.start)); + + for r in items { + let value = r.output.replacement_value().unwrap_or_default(); + let s = r.start.min(content.len()); + let e = r.end.min(content.len()); + if s >= e { + continue; + } + if !content.is_char_boundary(s) || !content.is_char_boundary(e) { + return Err(Error::validation( + format!( + "redaction offset falls mid-character \ + (start={}, end={}, len={})", + r.start, + r.end, + content.len() + ), + target, + )); + } + content.replace_range(s..e, value); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::transform::TextOutput; + + fn redaction(start: usize, end: usize, replacement: &str) -> TextRedaction { + TextRedaction::new(start, end, TextOutput::replace(replacement)) + } + + #[test] + fn single_replacement() { + let mut s = String::from("hello world"); + apply_text_redactions(&mut s, &[redaction(0, 5, "[X]")], "test").unwrap(); + assert_eq!(s, "[X] world"); + } + + #[test] + fn right_to_left_application() { + let mut s = String::from("aaa bbb ccc"); + let rs = vec![redaction(0, 3, "[A]"), redaction(8, 11, "[C]")]; + apply_text_redactions(&mut s, &rs, "test").unwrap(); + assert_eq!(s, "[A] bbb [C]"); + } + + #[test] + fn remove_output() { + let mut s = String::from("hello world"); + apply_text_redactions( + &mut s, + &[TextRedaction::new(5, 11, TextOutput::Remove)], + "test", + ) + .unwrap(); + assert_eq!(s, "hello"); + } + + #[test] + fn out_of_bounds_clipped() { + let mut s = String::from("short"); + apply_text_redactions(&mut s, &[redaction(0, 999, "[X]")], "test").unwrap(); + assert_eq!(s, "[X]"); + } + + #[test] + fn mid_character_rejected() { + let mut s = String::from("héllo"); // 'é' is 2 bytes + let err = 
apply_text_redactions(&mut s, &[redaction(0, 2, "[X]")], "test").unwrap_err(); + assert!(err.to_string().contains("mid-character")); + } +} diff --git a/crates/nvisy-codec/src/transform/text/mod.rs b/crates/nvisy-codec/src/transform/text/mod.rs index 8620b840..0fb821d0 100644 --- a/crates/nvisy-codec/src/transform/text/mod.rs +++ b/crates/nvisy-codec/src/transform/text/mod.rs @@ -1,5 +1,7 @@ //! Text redaction primitives. +mod apply; mod instruction; +pub(crate) use self::apply::apply_text_redactions; pub use self::instruction::{TextOutput, TextRedaction}; From 900a02035dfcd88212a9321ad2c1c0342658f6e1 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 19 May 2026 02:26:34 +0200 Subject: [PATCH 6/8] refactor(engine): switch to location/read/redact codec API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Engine operations now consume the new codec surface: - Document gains collect_text/image/audio_locations + read_text/image/audio + value_at(&Location); old text_spans/image_spans/audio_spans and the *_at typed accessors are gone. - EntityRecognitionOp and PatternRecognitionOp build their (TextLocation, TextData) work lists by walking locations() and calling read_text per item. - VisualExtractionOp builds (ContentSource, ImageData) pairs the same way for OCR batches and verification. - ValidationOp concatenates current text by reading each location. - RedactionApplicator reads each entity's text value via read_text instead of the old enum-typed value_at(&Location). - Document::apply_tabular_redactions is dropped — nothing in the engine drove it, and the (row, col) → byte-offset bridge it relied on (TabularTransform) is gone with the rest of the *Transform blanket impls. cargo check + clippy + tests all clean across the workspace. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../operation/detection/entity_recognition.rs | 26 ++-- .../detection/pattern_recognition.rs | 23 ++-- .../src/operation/envelope/document.rs | 119 +++++------------- .../src/operation/extraction/vision.rs | 61 +++++---- .../src/operation/redaction/apply.rs | 3 +- .../nvisy-engine/src/operation/validation.rs | 17 +-- 6 files changed, 105 insertions(+), 144 deletions(-) diff --git a/crates/nvisy-engine/src/operation/detection/entity_recognition.rs b/crates/nvisy-engine/src/operation/detection/entity_recognition.rs index 704a70c2..6ef7cff8 100644 --- a/crates/nvisy-engine/src/operation/detection/entity_recognition.rs +++ b/crates/nvisy-engine/src/operation/detection/entity_recognition.rs @@ -3,10 +3,9 @@ //! Runs at **phase 2**, after extraction. Drives language-model inference //! to identify and classify named entities within extracted text. -use nvisy_codec::Span; use nvisy_codec::handler::TextData; use nvisy_core::{Error, ErrorKind, Result}; -use nvisy_ontology::entity::Entity; +use nvisy_ontology::entity::{Entity, TextLocation}; use nvisy_ontology::workflow::NerDetection; use nvisy_provider::agent::{DetectionConfig, NerAgent}; use nvisy_provider::http::HttpClient; @@ -47,17 +46,14 @@ impl EntityRecognitionOp { Ok(Self { agent, config }) } - async fn detect( - &self, - spans: &[Span], - ) -> Result> { + async fn detect(&self, spans: &[(TextLocation, TextData)]) -> Result> { tracing::debug!(target: TARGET, span_count = spans.len(), "running NER"); let mut entities = Vec::new(); - for span in spans { + for (loc, data) in spans { let detected = self .agent - .detect_entities(span.data.as_str(), &self.config) + .detect_entities(data.as_str(), &self.config) .await .map_err(|e| Error::runtime(e.to_string(), "ner-agent", e.is_retryable()))?; @@ -65,9 +61,9 @@ impl EntityRecognitionOp { // Adjust entity's text location offsets to be relative // to the document (not the span) by adding the span's // start offset. 
- if let nvisy_ontology::entity::Location::Text(ref mut loc) = entity.location { - loc.start_offset += span.id.start_offset; - loc.end_offset += span.id.start_offset; + if let nvisy_ontology::entity::Location::Text(ref mut elem) = entity.location { + elem.start_offset += loc.start_offset; + elem.end_offset += loc.start_offset; } entities.push(entity); @@ -80,7 +76,13 @@ impl EntityRecognitionOp { impl Operation for EntityRecognitionOp { async fn execute(&self, envelope: &mut DocumentEnvelope) -> Result<()> { - let spans: Vec<_> = envelope.document.collect_text_spans().await; + let locations = envelope.document.collect_text_locations().await; + let mut spans: Vec<(TextLocation, TextData)> = Vec::with_capacity(locations.len()); + for located in locations { + if let Some(data) = envelope.document.read_text(&located.location).await { + spans.push((located.location, data)); + } + } if !spans.is_empty() { let detected = self.detect(&spans).await?; tracing::debug!( diff --git a/crates/nvisy-engine/src/operation/detection/pattern_recognition.rs b/crates/nvisy-engine/src/operation/detection/pattern_recognition.rs index ab78fa06..739226c4 100644 --- a/crates/nvisy-engine/src/operation/detection/pattern_recognition.rs +++ b/crates/nvisy-engine/src/operation/detection/pattern_recognition.rs @@ -8,10 +8,9 @@ use std::ops::Deref; -use nvisy_codec::Span; use nvisy_codec::handler::TextData; use nvisy_core::Result; -use nvisy_ontology::entity::Entity; +use nvisy_ontology::entity::{Entity, TextLocation}; use nvisy_ontology::workflow::PatternDetection; use crate::operation::{DocumentEnvelope, Operation}; @@ -73,18 +72,18 @@ impl PatternRecognitionOp { Self { engine } } - fn scan(&self, spans: &[Span]) -> Vec { + fn scan(&self, spans: &[(TextLocation, TextData)]) -> Vec { let scan_ctx = nvisy_pattern::ScanContext::default(); let mut entities = Vec::new(); - for span in spans { - let detected = self.engine.scan_entities(span.data.as_str(), &scan_ctx); + for (loc, data) in spans { + let 
detected = self.engine.scan_entities(data.as_str(), &scan_ctx); for mut entity in detected { // Adjust offsets to be document-relative. - if let nvisy_ontology::entity::Location::Text(ref mut loc) = entity.location { - loc.start_offset += span.id.start_offset; - loc.end_offset += span.id.start_offset; + if let nvisy_ontology::entity::Location::Text(ref mut elem) = entity.location { + elem.start_offset += loc.start_offset; + elem.end_offset += loc.start_offset; } entities.push(entity); @@ -104,7 +103,13 @@ impl PatternRecognitionOp { impl Operation for PatternRecognitionOp { async fn execute(&self, envelope: &mut DocumentEnvelope) -> Result<()> { - let spans: Vec<_> = envelope.document.collect_text_spans().await; + let locations = envelope.document.collect_text_locations().await; + let mut spans: Vec<(TextLocation, TextData)> = Vec::with_capacity(locations.len()); + for located in locations { + if let Some(data) = envelope.document.read_text(&located.location).await { + spans.push((located.location, data)); + } + } if !spans.is_empty() { let detected = self.scan(&spans); tracing::debug!( diff --git a/crates/nvisy-engine/src/operation/envelope/document.rs b/crates/nvisy-engine/src/operation/envelope/document.rs index 9bce0f1c..5ffb2df7 100644 --- a/crates/nvisy-engine/src/operation/envelope/document.rs +++ b/crates/nvisy-engine/src/operation/envelope/document.rs @@ -7,18 +7,17 @@ use std::fmt; +use futures::StreamExt; use nvisy_codec::handler::{AudioData, ImageData, TextData}; use nvisy_codec::transform::{ - AudioRedaction, ImageRedaction, Redactions, TabularRedaction, TextRedaction, + AudioRedaction, ImageRedaction, Redactions, TextRedaction, }; -use nvisy_codec::{ContentHandle, Span, SpanStream}; +use nvisy_codec::{ContentHandle, Located}; use nvisy_core::Error; use nvisy_core::content::{ContentData, ContentMetadata, ContentSource}; use nvisy_core::media::DocumentType; use nvisy_ontology::artifacts::ContentArtifacts; -use nvisy_ontology::entity::{ - AudioLocation, 
ImageLocation, Location, TabularLocation, TextLocation, -}; +use nvisy_ontology::entity::{AudioLocation, ImageLocation, Location, TextLocation}; /// Engine-level document combining content, metadata, and artifacts. /// @@ -35,9 +34,6 @@ pub struct Document { impl Document { /// Create a new document from a content handle and metadata. - /// - /// Artifacts are initialized to the appropriate empty variant based - /// on the handle's modality. pub fn new(handle: ContentHandle, metadata: ContentMetadata) -> Self { let artifacts = match &handle { ContentHandle::Text(_) => ContentArtifacts::text(), @@ -52,8 +48,6 @@ impl Document { } } - // -- Delegated handle methods -- - /// The document type of the underlying content. pub fn document_type(&self) -> DocumentType { self.handle.document_type() @@ -69,96 +63,59 @@ impl Document { self.handle.encode() } - /// Stream text spans from text or rich documents. - pub async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> { - self.handle.text_spans().await + /// Collect all text locations into a `Vec`. + pub async fn collect_text_locations(&self) -> Vec> { + self.handle.text_locations().collect().await } - /// Stream image spans from image or rich documents. - pub async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData> { - self.handle.image_spans().await + /// Collect all image locations into a `Vec`. + pub async fn collect_image_locations(&self) -> Vec> { + self.handle.image_locations().collect().await } - /// Stream audio spans from audio documents. - pub async fn audio_spans(&self) -> SpanStream<'_, AudioLocation, AudioData> { - self.handle.audio_spans().await + /// Collect all audio locations into a `Vec`. + pub async fn collect_audio_locations(&self) -> Vec> { + self.handle.audio_locations().collect().await } - /// Collect all text spans into a `Vec`. 
- pub async fn collect_text_spans(&self) -> Vec> { - self.handle.collect_text_spans().await + /// Read the text content at the given text location. + pub async fn read_text(&self, location: &TextLocation) -> Option { + self.handle.read_text(location).await } - /// Collect all image spans into a `Vec`. - pub async fn collect_image_spans(&self) -> Vec> { - self.handle.collect_image_spans().await + /// Read the image data at the given image location. + pub async fn read_image(&self, location: &ImageLocation) -> Option { + self.handle.read_image(location).await } - /// Collect all audio spans into a `Vec`. - pub async fn collect_audio_spans(&self) -> Vec> { - self.handle.collect_audio_spans().await + /// Read the audio data at the given audio location. + pub async fn read_audio(&self, location: &AudioLocation) -> Option { + self.handle.read_audio(location).await } - // -- Type-safe value access -- - - /// Extract the text value at the given text location. - pub async fn text_at(&self, location: &TextLocation) -> Option { - use nvisy_codec::handler::TextHandler; - match &self.handle { - ContentHandle::Text(h) => h.value_at(location).await, - ContentHandle::Rich(h) => TextHandler::value_at(h, location).await, - _ => None, - } - } - - /// Extract the image data at the given image location. - pub async fn image_at(&self, location: &ImageLocation) -> Option { - use nvisy_codec::handler::ImageHandler; - match &self.handle { - ContentHandle::Image(h) => h.value_at(location).await, - ContentHandle::Rich(h) => ImageHandler::value_at(h, location).await, - _ => None, - } - } - - /// Extract the audio data at the given audio location. - pub async fn audio_at(&self, location: &AudioLocation) -> Option { - use nvisy_codec::handler::AudioHandler; - match &self.handle { - ContentHandle::Audio(h) => h.value_at(location).await, - _ => None, - } - } - - /// Extract the cell value at the given tabular location. 
- /// - /// Currently returns `None` — tabular documents are handled as text - /// internally and don't have a dedicated `ContentHandle` variant yet. - pub async fn tabular_at(&self, _location: &TabularLocation) -> Option { - None - } - - /// Extract the text value at a [`Location`], dispatching by modality. + /// Resolve a [`Location`] to its text representation, dispatching by modality. /// - /// Returns the text content for Text/Tabular locations and the - /// extracted text (OCR/STT) for Image/Audio locations. + /// - Text and tabular locations: read from the underlying handler. + /// - Audio locations: read the transcription text from artifacts. + /// - Image locations: not yet implemented (OCR results are multi-region). pub async fn value_at(&self, location: &Location) -> Option { match location { - Location::Text(loc) => self.text_at(loc).await, - Location::Tabular(loc) => self.tabular_at(loc).await, + Location::Text(loc) => self.read_text(loc).await.map(TextData::into_inner), Location::Audio(_) => self .artifacts .as_audio() .and_then(|a| a.transcription.as_ref()) .map(|t| t.text()), - // Image OCR results are multi-region; location-specific + // Tabular locations are currently surfaced through the + // underlying TextHandler; per-cell extraction would need a + // (row, col) → byte-offset bridge that the engine no longer + // owns. Image OCR results are multi-region; per-location // lookup is not yet implemented. + Location::Tabular(_) | Location::Image(_) => None, _ => None, } } - // -- Redaction delegates -- - /// Apply a batch of text redactions to the document. pub async fn apply_text_redactions( &mut self, @@ -182,18 +139,6 @@ impl Document { ) -> Result<(), Error> { self.handle.apply_audio_redactions(redactions).await } - - /// Apply a batch of tabular redactions to the document. 
- pub async fn apply_tabular_redactions( - &mut self, - redactions: Redactions, - ) -> Result<(), Error> { - use nvisy_codec::transform::TabularTransform; - match &mut self.handle { - ContentHandle::Text(h) => h.redact_tabular(redactions).await, - _ => Ok(()), - } - } } impl fmt::Debug for Document { diff --git a/crates/nvisy-engine/src/operation/extraction/vision.rs b/crates/nvisy-engine/src/operation/extraction/vision.rs index cc7a415f..97546b93 100644 --- a/crates/nvisy-engine/src/operation/extraction/vision.rs +++ b/crates/nvisy-engine/src/operation/extraction/vision.rs @@ -4,8 +4,8 @@ //! optionally verifying detected entities against the source image, //! and optionally running computer vision. -use nvisy_codec::Span; use nvisy_codec::handler::ImageData; +use nvisy_core::content::ContentSource; use nvisy_core::{Error, ErrorKind, Result}; use nvisy_ontology::entity::Entities; use nvisy_ontology::workflow::VisualExtraction as VisualExtractionCfg; @@ -74,44 +74,37 @@ impl VisualExtractionOp { Ok(Self { agent }) } - /// Run OCR extraction on a batch of image spans. - async fn extract( - &self, - spans: &[Span], - ) -> Result> { - if spans.is_empty() { + /// Run OCR extraction on a batch of `(source, image)` pairs. + async fn extract(&self, images: &[(ContentSource, ImageData)]) -> Result> { + if images.is_empty() { return Ok(Vec::new()); } - let images = spans + let inputs = images .iter() - .map(|span| { - let png_bytes = span.data.encode_png()?; - Ok(ImageInput::with_source( - span.source, - png_bytes, - ImageFormat::Png, - )) + .map(|(source, data)| { + let png_bytes = data.encode_png()?; + Ok(ImageInput::with_source(*source, png_bytes, ImageFormat::Png)) }) .collect::>>()?; - self.agent.run_batch(&images).await + self.agent.run_batch(&inputs).await } /// Verify detected entities against the source images. 
async fn verify( &self, - image_spans: &[Span], + images: &[(ContentSource, ImageData)], entities: Entities, document: &crate::operation::Document, ) -> Result { - if entities.is_empty() || image_spans.is_empty() { + if entities.is_empty() || images.is_empty() { return Ok(entities); } use nvisy_provider::agent::VerificationCandidate; let mut verified = entities.into_inner(); - for span in image_spans { - let png_bytes = span.data.encode_png()?; - let image = ImageInput::with_source(span.source, png_bytes, ImageFormat::Png); + for (source, data) in images { + let png_bytes = data.encode_png()?; + let image = ImageInput::with_source(*source, png_bytes, ImageFormat::Png); let mut candidates = Vec::with_capacity(verified.len()); for entity in verified { let value = document @@ -128,22 +121,36 @@ impl VisualExtractionOp { } Ok(verified.into()) } + + /// Collect all `(source, image)` pairs from the document's image locations. + async fn collect_images( + document: &crate::operation::Document, + ) -> Vec<(ContentSource, ImageData)> { + let locations = document.collect_image_locations().await; + let mut images = Vec::with_capacity(locations.len()); + for located in locations { + if let Some(data) = document.read_image(&located.location).await { + images.push((located.source, data)); + } + } + images + } } impl Operation for VisualExtractionOp { async fn execute(&self, envelope: &mut DocumentEnvelope) -> Result<()> { - let image_spans = envelope.document.collect_image_spans().await; - if image_spans.is_empty() { + let images = Self::collect_images(&envelope.document).await; + if images.is_empty() { return Ok(()); } tracing::debug!( target: TARGET, - spans = image_spans.len(), + spans = images.len(), "running OCR extraction", ); - let ocr_output = self.extract(&image_spans).await?; + let ocr_output = self.extract(&images).await?; // Store OCR results in image artifacts. 
if let Some(image_artifacts) = envelope.document.artifacts.as_image_mut() { @@ -153,10 +160,10 @@ impl Operation for VisualExtractionOp { } if self.agent.has_verifier() && !envelope.audit.entities.is_empty() { - let verify_spans = envelope.document.collect_image_spans().await; + let verify_images = Self::collect_images(&envelope.document).await; match self .verify( - &verify_spans, + &verify_images, envelope.audit.entities.clone(), &envelope.document, ) diff --git a/crates/nvisy-engine/src/operation/redaction/apply.rs b/crates/nvisy-engine/src/operation/redaction/apply.rs index 7c7756de..38f76d21 100644 --- a/crates/nvisy-engine/src/operation/redaction/apply.rs +++ b/crates/nvisy-engine/src/operation/redaction/apply.rs @@ -80,8 +80,9 @@ impl<'a> RedactionApplicator<'a> { let value = self .envelope .document - .value_at(&entity.location) + .read_text(loc) .await + .map(|d| d.into_inner()) .unwrap_or_default(); let output = match &record.redaction.strategy { diff --git a/crates/nvisy-engine/src/operation/validation.rs b/crates/nvisy-engine/src/operation/validation.rs index 4de8e310..875e75b6 100644 --- a/crates/nvisy-engine/src/operation/validation.rs +++ b/crates/nvisy-engine/src/operation/validation.rs @@ -93,16 +93,17 @@ impl Operation for ValidationOp { async fn execute(&self, envelope: &mut DocumentEnvelope) -> Result<()> { tracing::debug!(target: TARGET, "running post-redaction validation"); - let text_spans: Vec<_> = envelope.document.collect_text_spans().await; - let redacted_text = if text_spans.is_empty() { + let locations = envelope.document.collect_text_locations().await; + let redacted_text = if locations.is_empty() { None } else { - Some( - text_spans - .iter() - .map(|s| s.data.as_str()) - .collect::(), - ) + let mut buf = String::new(); + for located in &locations { + if let Some(data) = envelope.document.read_text(&located.location).await { + buf.push_str(data.as_str()); + } + } + Some(buf) }; let result = Self::check( From 
9b892fe2355c7f2be03106af90ffa5dba64d5e24 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 19 May 2026 02:36:42 +0200 Subject: [PATCH 7/8] refactor(codec, engine): reintroduce Span for engine work lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Engine operations that enumerate a document's locations together with their content (NER detector, pattern scanner, OCR extractor) previously walked locations() + read_*() and pushed (Location, Data) tuples into a Vec. Tuples obscure intent and force destructuring at every call site. Span { source, location, data } lives in codec — same shape as the type we deleted at the start of the refactor, but intentionally *not* used on handler trait signatures. Handlers still expose only cheap identity via locations() plus on-demand read(); engine callers that want enumerate-with-content build Span::from_located in their read loops. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/nvisy-codec/src/document/mod.rs | 2 + crates/nvisy-codec/src/document/span.rs | 80 +++++++++++++++++++ crates/nvisy-codec/src/lib.rs | 2 +- .../operation/detection/entity_recognition.rs | 15 ++-- .../detection/pattern_recognition.rs | 15 ++-- .../src/operation/extraction/vision.rs | 59 ++++++++------ 6 files changed, 132 insertions(+), 41 deletions(-) create mode 100644 crates/nvisy-codec/src/document/span.rs diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index 3133c6b7..d87c7008 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -1,6 +1,7 @@ //! Type-erased content handle for all supported formats. 
mod located; +mod span; mod stream; use std::fmt; @@ -14,6 +15,7 @@ use nvisy_core::media::{ use nvisy_ontology::entity::{AudioLocation, ImageLocation, TextLocation}; pub use self::located::Located; +pub use self::span::Span; pub use self::stream::LocationStream; use crate::handler::{ AudioData, AudioHandler, BoxedAudioHandler, BoxedImageHandler, BoxedRichHandler, diff --git a/crates/nvisy-codec/src/document/span.rs b/crates/nvisy-codec/src/document/span.rs new file mode 100644 index 00000000..2bf82858 --- /dev/null +++ b/crates/nvisy-codec/src/document/span.rs @@ -0,0 +1,80 @@ +//! [`Span`]: a located piece of content with its production-time +//! provenance. + +use nvisy_core::content::ContentSource; + +use super::Located; + +/// A located piece of content paired with its data and provenance. +/// +/// Useful when a caller wants to enumerate all of a handler's +/// locations *with* the content attached — typically when feeding a +/// batch into a downstream service (LLM detector, OCR, validator). +/// +/// Not used on handler trait signatures: handlers expose only cheap +/// identity via [`locations`] plus on-demand [`read`]. Construct a +/// `Span` by walking the locations stream and calling `read`. +/// +/// [`locations`]: crate::handler::TextHandler::locations +/// [`read`]: crate::handler::TextHandler::read +#[derive(Debug, Clone, PartialEq)] +pub struct Span { + /// The handler-level source that produced this span. + pub source: ContentSource, + /// The structural location within the handler's data model. + pub location: L, + /// The content at the location. + pub data: D, +} + +impl Span { + /// Create a new span from its components. + pub fn new(source: ContentSource, location: L, data: D) -> Self { + Self { + source, + location, + data, + } + } + + /// Construct from a [`Located`] by attaching `data`. 
+ pub fn from_located(located: Located, data: D) -> Self { + Self { + source: located.source, + location: located.location, + data, + } + } + + /// Transform the data, keeping source and location unchanged. + pub fn map(self, f: impl FnOnce(D) -> T) -> Span { + Span { + source: self.source, + location: self.location, + data: f(self.data), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_located_preserves_source_and_location() { + let src = ContentSource::new(); + let located = Located::new(src, 7u32); + let span = Span::from_located(located, "data"); + assert_eq!(span.source, src); + assert_eq!(span.location, 7); + assert_eq!(span.data, "data"); + } + + #[test] + fn map_transforms_data() { + let span = Span::new(ContentSource::new(), 1u32, "hello"); + let mapped = span.map(|d| d.len()); + assert_eq!(mapped.location, 1); + assert_eq!(mapped.data, 5); + } +} diff --git a/crates/nvisy-codec/src/lib.rs b/crates/nvisy-codec/src/lib.rs index 02ff2810..5865a0fd 100644 --- a/crates/nvisy-codec/src/lib.rs +++ b/crates/nvisy-codec/src/lib.rs @@ -6,4 +6,4 @@ mod document; pub mod handler; pub mod transform; -pub use self::document::{ContentHandle, Located, LocationStream}; +pub use self::document::{ContentHandle, Located, LocationStream, Span}; diff --git a/crates/nvisy-engine/src/operation/detection/entity_recognition.rs b/crates/nvisy-engine/src/operation/detection/entity_recognition.rs index 6ef7cff8..c382bebc 100644 --- a/crates/nvisy-engine/src/operation/detection/entity_recognition.rs +++ b/crates/nvisy-engine/src/operation/detection/entity_recognition.rs @@ -3,6 +3,7 @@ //! Runs at **phase 2**, after extraction. Drives language-model inference //! to identify and classify named entities within extracted text. 
+use nvisy_codec::Span; use nvisy_codec::handler::TextData; use nvisy_core::{Error, ErrorKind, Result}; use nvisy_ontology::entity::{Entity, TextLocation}; @@ -46,14 +47,14 @@ impl EntityRecognitionOp { Ok(Self { agent, config }) } - async fn detect(&self, spans: &[(TextLocation, TextData)]) -> Result> { + async fn detect(&self, spans: &[Span]) -> Result> { tracing::debug!(target: TARGET, span_count = spans.len(), "running NER"); let mut entities = Vec::new(); - for (loc, data) in spans { + for span in spans { let detected = self .agent - .detect_entities(data.as_str(), &self.config) + .detect_entities(span.data.as_str(), &self.config) .await .map_err(|e| Error::runtime(e.to_string(), "ner-agent", e.is_retryable()))?; @@ -62,8 +63,8 @@ impl EntityRecognitionOp { // to the document (not the span) by adding the span's // start offset. if let nvisy_ontology::entity::Location::Text(ref mut elem) = entity.location { - elem.start_offset += loc.start_offset; - elem.end_offset += loc.start_offset; + elem.start_offset += span.location.start_offset; + elem.end_offset += span.location.start_offset; } entities.push(entity); @@ -77,10 +78,10 @@ impl EntityRecognitionOp { impl Operation for EntityRecognitionOp { async fn execute(&self, envelope: &mut DocumentEnvelope) -> Result<()> { let locations = envelope.document.collect_text_locations().await; - let mut spans: Vec<(TextLocation, TextData)> = Vec::with_capacity(locations.len()); + let mut spans: Vec> = Vec::with_capacity(locations.len()); for located in locations { if let Some(data) = envelope.document.read_text(&located.location).await { - spans.push((located.location, data)); + spans.push(Span::from_located(located, data)); } } if !spans.is_empty() { diff --git a/crates/nvisy-engine/src/operation/detection/pattern_recognition.rs b/crates/nvisy-engine/src/operation/detection/pattern_recognition.rs index 739226c4..7b3b0f06 100644 --- a/crates/nvisy-engine/src/operation/detection/pattern_recognition.rs +++ 
b/crates/nvisy-engine/src/operation/detection/pattern_recognition.rs @@ -8,6 +8,7 @@ use std::ops::Deref; +use nvisy_codec::Span; use nvisy_codec::handler::TextData; use nvisy_core::Result; use nvisy_ontology::entity::{Entity, TextLocation}; @@ -72,18 +73,18 @@ impl PatternRecognitionOp { Self { engine } } - fn scan(&self, spans: &[(TextLocation, TextData)]) -> Vec { + fn scan(&self, spans: &[Span]) -> Vec { let scan_ctx = nvisy_pattern::ScanContext::default(); let mut entities = Vec::new(); - for (loc, data) in spans { - let detected = self.engine.scan_entities(data.as_str(), &scan_ctx); + for span in spans { + let detected = self.engine.scan_entities(span.data.as_str(), &scan_ctx); for mut entity in detected { // Adjust offsets to be document-relative. if let nvisy_ontology::entity::Location::Text(ref mut elem) = entity.location { - elem.start_offset += loc.start_offset; - elem.end_offset += loc.start_offset; + elem.start_offset += span.location.start_offset; + elem.end_offset += span.location.start_offset; } entities.push(entity); @@ -104,10 +105,10 @@ impl PatternRecognitionOp { impl Operation for PatternRecognitionOp { async fn execute(&self, envelope: &mut DocumentEnvelope) -> Result<()> { let locations = envelope.document.collect_text_locations().await; - let mut spans: Vec<(TextLocation, TextData)> = Vec::with_capacity(locations.len()); + let mut spans: Vec> = Vec::with_capacity(locations.len()); for located in locations { if let Some(data) = envelope.document.read_text(&located.location).await { - spans.push((located.location, data)); + spans.push(Span::from_located(located, data)); } } if !spans.is_empty() { diff --git a/crates/nvisy-engine/src/operation/extraction/vision.rs b/crates/nvisy-engine/src/operation/extraction/vision.rs index 97546b93..fe081a63 100644 --- a/crates/nvisy-engine/src/operation/extraction/vision.rs +++ b/crates/nvisy-engine/src/operation/extraction/vision.rs @@ -4,10 +4,10 @@ //! 
optionally verifying detected entities against the source image, //! and optionally running computer vision. +use nvisy_codec::Span; use nvisy_codec::handler::ImageData; -use nvisy_core::content::ContentSource; use nvisy_core::{Error, ErrorKind, Result}; -use nvisy_ontology::entity::Entities; +use nvisy_ontology::entity::{Entities, ImageLocation}; use nvisy_ontology::workflow::VisualExtraction as VisualExtractionCfg; use nvisy_provider::agent::{ImageFormat, ImageInput, ImageOutput, OcrAgent}; use nvisy_provider::http::HttpClient; @@ -74,16 +74,23 @@ impl VisualExtractionOp { Ok(Self { agent }) } - /// Run OCR extraction on a batch of `(source, image)` pairs. - async fn extract(&self, images: &[(ContentSource, ImageData)]) -> Result> { - if images.is_empty() { + /// Run OCR extraction on a batch of image spans. + async fn extract( + &self, + spans: &[Span], + ) -> Result> { + if spans.is_empty() { return Ok(Vec::new()); } - let inputs = images + let inputs = spans .iter() - .map(|(source, data)| { - let png_bytes = data.encode_png()?; - Ok(ImageInput::with_source(*source, png_bytes, ImageFormat::Png)) + .map(|span| { + let png_bytes = span.data.encode_png()?; + Ok(ImageInput::with_source( + span.source, + png_bytes, + ImageFormat::Png, + )) }) .collect::>>()?; self.agent.run_batch(&inputs).await @@ -92,19 +99,19 @@ impl VisualExtractionOp { /// Verify detected entities against the source images. 
async fn verify( &self, - images: &[(ContentSource, ImageData)], + spans: &[Span], entities: Entities, document: &crate::operation::Document, ) -> Result { - if entities.is_empty() || images.is_empty() { + if entities.is_empty() || spans.is_empty() { return Ok(entities); } use nvisy_provider::agent::VerificationCandidate; let mut verified = entities.into_inner(); - for (source, data) in images { - let png_bytes = data.encode_png()?; - let image = ImageInput::with_source(*source, png_bytes, ImageFormat::Png); + for span in spans { + let png_bytes = span.data.encode_png()?; + let image = ImageInput::with_source(span.source, png_bytes, ImageFormat::Png); let mut candidates = Vec::with_capacity(verified.len()); for entity in verified { let value = document @@ -122,35 +129,35 @@ impl VisualExtractionOp { Ok(verified.into()) } - /// Collect all `(source, image)` pairs from the document's image locations. - async fn collect_images( + /// Collect all image spans from the document's image locations. 
+ async fn collect_spans( document: &crate::operation::Document, - ) -> Vec<(ContentSource, ImageData)> { + ) -> Vec> { let locations = document.collect_image_locations().await; - let mut images = Vec::with_capacity(locations.len()); + let mut spans = Vec::with_capacity(locations.len()); for located in locations { if let Some(data) = document.read_image(&located.location).await { - images.push((located.source, data)); + spans.push(Span::from_located(located, data)); } } - images + spans } } impl Operation for VisualExtractionOp { async fn execute(&self, envelope: &mut DocumentEnvelope) -> Result<()> { - let images = Self::collect_images(&envelope.document).await; - if images.is_empty() { + let spans = Self::collect_spans(&envelope.document).await; + if spans.is_empty() { return Ok(()); } tracing::debug!( target: TARGET, - spans = images.len(), + spans = spans.len(), "running OCR extraction", ); - let ocr_output = self.extract(&images).await?; + let ocr_output = self.extract(&spans).await?; // Store OCR results in image artifacts. if let Some(image_artifacts) = envelope.document.artifacts.as_image_mut() { @@ -160,10 +167,10 @@ impl Operation for VisualExtractionOp { } if self.agent.has_verifier() && !envelope.audit.entities.is_empty() { - let verify_images = Self::collect_images(&envelope.document).await; + let verify_spans = Self::collect_spans(&envelope.document).await; match self .verify( - &verify_images, + &verify_spans, envelope.audit.entities.clone(), &envelope.document, ) From 8bff45ace5640811ad948f1a00f9ac81fc15069a Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 19 May 2026 02:55:21 +0200 Subject: [PATCH 8/8] style(workspace): rustdoc refs at bottom, no inline imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two parallel sweeps: 1. 
Inline rustdoc reference-style links — [`Foo`](crate::path::Foo) — moved to bottom-of-docblock references: [`Foo`] + `[`Foo`]: crate::path::Foo` after a blank /// separator. 83 conversions across 51 files. 2. Inline `use` statements inside function bodies, impl scopes, and non-test inner blocks hoisted to top of file. Cfg-gated inline uses preserved their cfg with a wrapping #[cfg(...)] on the hoisted form. Macro-body uses and test-module-top uses left as-is. Tests, clippy, and rustdoc all clean across the workspace. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/nvisy-codec/src/document/mod.rs | 19 +++++----- .../src/handler/audio/audio_data.rs | 6 ++- .../src/handler/audio/mp3_handler.rs | 7 +++- .../src/handler/audio/wav_handler.rs | 7 +++- .../src/handler/image/jpeg_handler.rs | 10 +++-- .../src/handler/image/png_handler.rs | 10 +++-- .../src/handler/image/tiff_handler.rs | 4 +- .../src/handler/rich/pdf_handler.rs | 4 +- .../src/handler/rich/pdf_loader.rs | 3 +- .../nvisy-codec/src/handler/text/text_data.rs | 4 +- crates/nvisy-core/src/content/bundle.rs | 5 +-- crates/nvisy-core/src/content/content_data.rs | 15 ++++++-- crates/nvisy-core/src/error.rs | 4 +- crates/nvisy-core/src/media/document_type.rs | 4 +- .../src/operation/deduplication/span_size.rs | 4 +- .../src/operation/envelope/document.rs | 4 +- .../src/operation/envelope/mod.rs | 12 ++++-- .../src/operation/envelope/shared.rs | 4 +- .../nvisy-engine/src/operation/export_file.rs | 2 +- .../src/operation/extraction/vision.rs | 3 +- .../src/pipeline/config/subsystem.rs | 4 +- .../src/pipeline/config/validate.rs | 38 +++++-------------- .../src/pipeline/runs/analytics.rs | 7 +++- crates/nvisy-engine/src/pipeline/runs/mod.rs | 4 +- .../nvisy-engine/src/pipeline/runs/state.rs | 8 +++- .../src/utility/encryption/provider.rs | 3 +- .../src/utility/encryption/service.rs | 10 ++--- crates/nvisy-ontology/src/context/entry.rs | 4 +- .../src/context/temporal/datetime.rs | 4 +- 
crates/nvisy-ontology/src/entity/category.rs | 11 ++++-- crates/nvisy-ontology/src/entity/kind.rs | 4 +- .../nvisy-ontology/src/primitive/time_span.rs | 5 ++- crates/nvisy-ontology/src/provenance/entry.rs | 7 +++- crates/nvisy-ontology/src/provenance/mod.rs | 7 +++- .../nvisy-ontology/src/provenance/review.rs | 4 +- .../src/dictionaries/dictionary_registry.rs | 4 +- crates/nvisy-pattern/src/engine/allow_list.rs | 4 +- crates/nvisy-pattern/src/engine/builder.rs | 8 +++- crates/nvisy-pattern/src/engine/error.rs | 4 +- .../nvisy-pattern/src/engine/scan_context.rs | 4 +- .../src/patterns/json_pattern.rs | 4 +- crates/nvisy-pattern/src/patterns/pattern.rs | 4 +- .../src/patterns/pattern_registry.rs | 4 +- crates/nvisy-pattern/src/validators/mod.rs | 7 +++- .../nvisy-provider/src/agent/base/builder.rs | 8 +++- crates/nvisy-provider/src/agent/cv/mod.rs | 8 +++- .../nvisy-provider/src/agent/generate/mod.rs | 8 +++- .../nvisy-provider/src/agent/ner/context.rs | 7 +++- crates/nvisy-provider/src/agent/ner/mod.rs | 12 ++++-- crates/nvisy-provider/src/agent/ocr/input.rs | 4 +- crates/nvisy-provider/src/agent/ocr/mod.rs | 12 ++++-- .../nvisy-provider/src/audio/stt/provider.rs | 4 +- .../nvisy-provider/src/audio/tts/provider.rs | 4 +- crates/nvisy-provider/src/ocr/params.rs | 4 +- crates/nvisy-python/src/bridge/mod.rs | 5 +-- crates/nvisy-server/src/extract/actor.rs | 4 +- crates/nvisy-server/src/extract/json.rs | 4 +- crates/nvisy-server/src/extract/path.rs | 4 +- .../nvisy-server/src/handler/request/mod.rs | 7 +++- .../nvisy-server/src/handler/response/mod.rs | 7 +++- .../nvisy-server/src/middleware/recovery.rs | 14 ++++--- 61 files changed, 264 insertions(+), 146 deletions(-) diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index d87c7008..f192d6b5 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -17,6 +17,12 @@ use nvisy_ontology::entity::{AudioLocation, ImageLocation, 
TextLocation}; pub use self::located::Located; pub use self::span::Span; pub use self::stream::LocationStream; +#[cfg(feature = "docx")] +use crate::handler::{DocxLoader, DocxParams}; +#[cfg(feature = "html")] +use crate::handler::{HtmlLoader, HtmlParams}; +#[cfg(feature = "pdf")] +use crate::handler::{PdfLoader, PdfParams}; use crate::handler::{ AudioData, AudioHandler, BoxedAudioHandler, BoxedImageHandler, BoxedRichHandler, BoxedTextHandler, CsvLoader, CsvParams, Handler, ImageData, ImageHandler, JpegLoader, @@ -208,13 +214,10 @@ impl ContentHandle { .await? .into(), #[cfg(feature = "html")] - DocumentType::Html => { - use crate::handler::{HtmlLoader, HtmlParams}; - HtmlLoader - .decode(content, &HtmlParams::default()) - .await? - .into() - } + DocumentType::Html => HtmlLoader + .decode(content, &HtmlParams::default()) + .await? + .into(), DocumentType::Spreadsheet(SpreadsheetFormat::Csv) => CsvLoader .decode(content, &CsvParams::default()) .await? @@ -277,7 +280,6 @@ impl ContentHandle { DocumentType::Pdf => { #[cfg(feature = "pdf")] { - use crate::handler::{PdfLoader, PdfParams}; let handler = PdfLoader.decode(content, &PdfParams::default()).await?; Ok(Self::from(BoxedRichHandler::from(handler))) } @@ -292,7 +294,6 @@ impl ContentHandle { DocumentType::Word(WordFormat::Docx) => { #[cfg(feature = "docx")] { - use crate::handler::{DocxLoader, DocxParams}; let handler = DocxLoader.decode(content, &DocxParams).await?; Ok(Self::from(BoxedRichHandler::from(handler))) } diff --git a/crates/nvisy-codec/src/handler/audio/audio_data.rs b/crates/nvisy-codec/src/handler/audio/audio_data.rs index cbb48e77..fe3aaea5 100644 --- a/crates/nvisy-codec/src/handler/audio/audio_data.rs +++ b/crates/nvisy-codec/src/handler/audio/audio_data.rs @@ -5,10 +5,12 @@ use derive_more::{AsRef, From, Into}; /// Opaque wrapper around raw audio bytes. 
/// -/// Mirrors [`ImageData`](crate::handler::ImageData) and -/// [`TextData`](crate::handler::TextData) for audio-bearing handlers, +/// Mirrors [`ImageData`] and [`TextData`] for audio-bearing handlers, /// providing a consistent type boundary at the `AudioHandler` trait /// level. +/// +/// [`ImageData`]: crate::handler::ImageData +/// [`TextData`]: crate::handler::TextData #[derive(Debug, Clone, From, Into, AsRef)] pub struct AudioData(Bytes); diff --git a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs index cb843619..d3e0cd2e 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs @@ -1,10 +1,11 @@ //! MP3 handler: holds raw MP3 audio bytes and provides location-based -//! access via [`AudioHandler`](crate::handler::AudioHandler). +//! access via [`AudioHandler`]. //! //! [`AudioHandler::locations`] yields a single full-duration //! [`AudioLocation`]; [`AudioHandler::read`] returns the underlying //! bytes as [`AudioData`]. Redaction is currently a no-op. //! +//! [`AudioHandler`]: crate::handler::AudioHandler //! [`AudioHandler::locations`]: crate::handler::AudioHandler::locations //! [`AudioHandler::read`]: crate::handler::AudioHandler::read //! [`AudioLocation`]: nvisy_ontology::entity::AudioLocation @@ -16,7 +17,9 @@ use super::impl_audio_handler; /// Handler for loaded MP3 content. /// /// Stores the raw audio bytes directly. The bytes can be produced -/// on demand via [`Handler::encode`](crate::handler::Handler::encode). +/// on demand via [`Handler::encode`]. 
+/// +/// [`Handler::encode`]: crate::handler::Handler::encode #[derive(Debug)] pub struct Mp3Handler { source: ContentSource, diff --git a/crates/nvisy-codec/src/handler/audio/wav_handler.rs b/crates/nvisy-codec/src/handler/audio/wav_handler.rs index 3af1f89a..399f9da2 100644 --- a/crates/nvisy-codec/src/handler/audio/wav_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/wav_handler.rs @@ -1,10 +1,11 @@ //! WAV handler: holds raw WAV audio bytes and provides location-based -//! access via [`AudioHandler`](crate::handler::AudioHandler). +//! access via [`AudioHandler`]. //! //! [`AudioHandler::locations`] yields a single full-duration //! [`AudioLocation`]; [`AudioHandler::read`] returns the underlying //! bytes as [`AudioData`]. Redaction is currently a no-op. //! +//! [`AudioHandler`]: crate::handler::AudioHandler //! [`AudioHandler::locations`]: crate::handler::AudioHandler::locations //! [`AudioHandler::read`]: crate::handler::AudioHandler::read //! [`AudioLocation`]: nvisy_ontology::entity::AudioLocation @@ -16,7 +17,9 @@ use super::impl_audio_handler; /// Handler for loaded WAV content. /// /// Stores the raw audio bytes directly. The bytes can be produced -/// on demand via [`Handler::encode`](crate::handler::Handler::encode). +/// on demand via [`Handler::encode`]. +/// +/// [`Handler::encode`]: crate::handler::Handler::encode #[derive(Debug)] pub struct WavHandler { source: ContentSource, diff --git a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs index 413dc1e0..c19f3c90 100644 --- a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs +++ b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs @@ -3,10 +3,11 @@ //! //! [`ImageHandler::locations`] yields exactly one full-image //! [`ImageLocation`]; [`ImageHandler::read`] returns the current -//! [`DynamicImage`](image::DynamicImage) cropped to the location's +//! [`DynamicImage`] cropped to the location's //! 
bounding box; [`ImageHandler::redact`] applies bounding-box //! redactions in place. //! +//! [`DynamicImage`]: image::DynamicImage //! [`ImageHandler`]: crate::handler::ImageHandler //! [`ImageHandler::locations`]: crate::handler::ImageHandler::locations //! [`ImageHandler::read`]: crate::handler::ImageHandler::read @@ -19,9 +20,12 @@ use super::impl_image_handler; /// Handler for loaded JPEG content. /// -/// Stores the decoded [`DynamicImage`](image::DynamicImage) directly. +/// Stores the decoded [`DynamicImage`] directly. /// The raw JPEG bytes can be produced on demand via -/// [`Handler::encode`](crate::handler::Handler::encode). +/// [`Handler::encode`]. +/// +/// [`DynamicImage`]: image::DynamicImage +/// [`Handler::encode`]: crate::handler::Handler::encode #[derive(Debug)] pub struct JpegHandler { source: ContentSource, diff --git a/crates/nvisy-codec/src/handler/image/png_handler.rs b/crates/nvisy-codec/src/handler/image/png_handler.rs index 6b07d30a..69cf5909 100644 --- a/crates/nvisy-codec/src/handler/image/png_handler.rs +++ b/crates/nvisy-codec/src/handler/image/png_handler.rs @@ -3,10 +3,11 @@ //! //! [`ImageHandler::locations`] yields exactly one full-image //! [`ImageLocation`]; [`ImageHandler::read`] returns the current -//! [`DynamicImage`](image::DynamicImage) (cropped to the location's +//! [`DynamicImage`] (cropped to the location's //! bounding box); [`ImageHandler::redact`] applies bounding-box //! redactions in place. //! +//! [`DynamicImage`]: image::DynamicImage //! [`ImageHandler`]: crate::handler::ImageHandler //! [`ImageHandler::locations`]: crate::handler::ImageHandler::locations //! [`ImageHandler::read`]: crate::handler::ImageHandler::read @@ -19,9 +20,12 @@ use super::impl_image_handler; /// Handler for loaded PNG content. /// -/// Stores the decoded [`DynamicImage`](image::DynamicImage) directly. +/// Stores the decoded [`DynamicImage`] directly. 
/// The raw PNG bytes can be produced on demand via -/// [`Handler::encode`](crate::handler::Handler::encode). +/// [`Handler::encode`]. +/// +/// [`DynamicImage`]: image::DynamicImage +/// [`Handler::encode`]: crate::handler::Handler::encode #[derive(Debug)] pub struct PngHandler { source: ContentSource, diff --git a/crates/nvisy-codec/src/handler/image/tiff_handler.rs b/crates/nvisy-codec/src/handler/image/tiff_handler.rs index 3bd643fc..7a9d065a 100644 --- a/crates/nvisy-codec/src/handler/image/tiff_handler.rs +++ b/crates/nvisy-codec/src/handler/image/tiff_handler.rs @@ -1,5 +1,7 @@ //! TIFF handler: holds a decoded image and provides single-location -//! access via [`ImageHandler`](crate::handler::ImageHandler). +//! access via [`ImageHandler`]. +//! +//! [`ImageHandler`]: crate::handler::ImageHandler use nvisy_core::content::ContentSource; diff --git a/crates/nvisy-codec/src/handler/rich/pdf_handler.rs b/crates/nvisy-codec/src/handler/rich/pdf_handler.rs index 69dba3c7..636ad31b 100644 --- a/crates/nvisy-codec/src/handler/rich/pdf_handler.rs +++ b/crates/nvisy-codec/src/handler/rich/pdf_handler.rs @@ -109,7 +109,9 @@ impl RichTextHandler { &self.raw } - /// Total number of pages (alias for [`page_count`](Self::page_count)). + /// Total number of pages (alias for [`page_count`]). + /// + /// [`page_count`]: Self::page_count pub fn len(&self) -> usize { self.pages.len() } diff --git a/crates/nvisy-codec/src/handler/rich/pdf_loader.rs b/crates/nvisy-codec/src/handler/rich/pdf_loader.rs index fe5e71af..885a49a6 100644 --- a/crates/nvisy-codec/src/handler/rich/pdf_loader.rs +++ b/crates/nvisy-codec/src/handler/rich/pdf_loader.rs @@ -50,6 +50,7 @@ impl Loader for PdfLoader { mod tests { use bytes::Bytes; use futures::StreamExt; + use lopdf::{Dictionary, Document, Object, Stream, dictionary}; use nvisy_core::content::ContentSource; use nvisy_core::media::DocumentType; @@ -62,8 +63,6 @@ mod tests { /// Build a minimal valid PDF with one blank page using lopdf. 
fn minimal_pdf() -> Vec { - use lopdf::{Dictionary, Document, Object, Stream, dictionary}; - let mut doc = Document::with_version("1.5"); let pages_id = doc.new_object_id(); diff --git a/crates/nvisy-codec/src/handler/text/text_data.rs b/crates/nvisy-codec/src/handler/text/text_data.rs index 8a12599a..8b26ec57 100644 --- a/crates/nvisy-codec/src/handler/text/text_data.rs +++ b/crates/nvisy-codec/src/handler/text/text_data.rs @@ -5,11 +5,13 @@ use hipstr::HipStr; /// Opaque wrapper around a text span's content. /// -/// Mirrors [`ImageData`](crate::handler::ImageData) for text-bearing +/// Mirrors [`ImageData`] for text-bearing /// handlers, providing a consistent type boundary at the `Handler` /// trait level. /// /// Internally backed by [`HipStr`] for cheap cloning. +/// +/// [`ImageData`]: crate::handler::ImageData #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Display, From, AsRef)] #[as_ref(forward)] diff --git a/crates/nvisy-core/src/content/bundle.rs b/crates/nvisy-core/src/content/bundle.rs index 22731cd5..57bbebe1 100644 --- a/crates/nvisy-core/src/content/bundle.rs +++ b/crates/nvisy-core/src/content/bundle.rs @@ -147,6 +147,7 @@ impl Content { #[cfg(test)] mod tests { use super::*; + use crate::media::{ImageFormat, TextFormat}; #[test] fn content_with_metadata() { @@ -186,8 +187,6 @@ mod tests { #[test] fn infer_document_type_from_metadata() { - use crate::media::TextFormat; - let data = ContentData::from("plain text"); let metadata = ContentMetadata::new().with_content_type("text/plain"); let content = Content::with_metadata(data, metadata); @@ -200,8 +199,6 @@ mod tests { #[test] fn infer_document_type_from_magic_bytes() { - use crate::media::ImageFormat; - let png = vec![ 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, diff --git a/crates/nvisy-core/src/content/content_data.rs b/crates/nvisy-core/src/content/content_data.rs index 232e8255..98833394 100644 --- 
a/crates/nvisy-core/src/content/content_data.rs +++ b/crates/nvisy-core/src/content/content_data.rs @@ -3,7 +3,9 @@ //! [`ContentData`] is the pure data half of the content model. It holds //! the raw bytes and a [`ContentSource`] identifier. All descriptive //! attributes (MIME type, filename, arbitrary metadata) live on -//! [`ContentMetadata`](super::ContentMetadata). +//! [`ContentMetadata`]. +//! +//! [`ContentMetadata`]: super::ContentMetadata use std::{fmt, str}; @@ -19,8 +21,11 @@ use crate::error::{Error, ErrorKind, Result}; /// /// This is the data-only half of the content model — it does not carry /// MIME type, filename, or other descriptive metadata. Pair with -/// [`ContentMetadata`](super::ContentMetadata) via -/// [`Content`](super::Content) for a complete representation. +/// [`ContentMetadata`] via +/// [`Content`] for a complete representation. +/// +/// [`ContentMetadata`]: super::ContentMetadata +/// [`Content`]: super::Content #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ContentData { /// Unique identifier for the content source. @@ -197,7 +202,9 @@ impl ContentData { /// /// Returns `None` for content with no recognizable magic bytes /// (e.g. plain text). Use this to populate - /// [`ContentMetadata::detected_content_type`](super::ContentMetadata::detected_content_type). + /// [`ContentMetadata::detected_content_type`]. + /// + /// [`ContentMetadata::detected_content_type`]: super::ContentMetadata::detected_content_type #[must_use] pub fn detect_mime(&self) -> Option<String> { infer::get(&self.data).map(|t| t.mime_type().to_owned()) diff --git a/crates/nvisy-core/src/error.rs b/crates/nvisy-core/src/error.rs index 6370748c..ace184fb 100644 --- a/crates/nvisy-core/src/error.rs +++ b/crates/nvisy-core/src/error.rs @@ -36,8 +36,10 @@ pub enum ErrorKind { /// Unified error type for the nvisy platform. 
/// -/// Carries a [`kind`](ErrorKind), a human-readable message, an optional +/// Carries a [`kind`], a human-readable message, an optional /// source component name, a retryable flag, and an optional wrapped cause. +/// +/// [`kind`]: ErrorKind #[derive(Debug, thiserror::Error)] #[error("{kind}: {message}")] pub struct Error { diff --git a/crates/nvisy-core/src/media/document_type.rs b/crates/nvisy-core/src/media/document_type.rs index 54dd91e7..3aa22835 100644 --- a/crates/nvisy-core/src/media/document_type.rs +++ b/crates/nvisy-core/src/media/document_type.rs @@ -318,6 +318,8 @@ impl DocumentType { #[cfg(test)] mod tests { + use std::ffi::OsStr; + use super::*; #[test] @@ -337,8 +339,6 @@ mod tests { #[test] fn from_extension_common_formats() { - use std::ffi::OsStr; - assert_eq!( DocumentType::from_extension(OsStr::new("png")), Some(DocumentType::Image(ImageFormat::Png)), diff --git a/crates/nvisy-engine/src/operation/deduplication/span_size.rs b/crates/nvisy-engine/src/operation/deduplication/span_size.rs index deb133a7..d0dc706b 100644 --- a/crates/nvisy-engine/src/operation/deduplication/span_size.rs +++ b/crates/nvisy-engine/src/operation/deduplication/span_size.rs @@ -15,7 +15,7 @@ use nvisy_ontology::entity::Location; /// over "John", or the larger bounding box). /// /// Returns `None` for cross-modality comparisons (meaningless). -/// Returns `Some(cmp)` with a standard [`Ordering`](std::cmp::Ordering) +/// Returns `Some(cmp)` with a standard [`Ordering`] /// for same-modality pairs. /// /// Size metric per modality: @@ -23,6 +23,8 @@ use nvisy_ontology::entity::Location; /// - **Image**: bounding box area (`width * height`). /// - **Audio**: time span duration. /// - **Tabular**: cell text length. +/// +/// [`Ordering`]: std::cmp::Ordering pub(crate) trait SpanSize { /// Compare the extent of two locations. 
/// diff --git a/crates/nvisy-engine/src/operation/envelope/document.rs b/crates/nvisy-engine/src/operation/envelope/document.rs index 5ffb2df7..cb114457 100644 --- a/crates/nvisy-engine/src/operation/envelope/document.rs +++ b/crates/nvisy-engine/src/operation/envelope/document.rs @@ -22,7 +22,9 @@ use nvisy_ontology::entity::{AudioLocation, ImageLocation, Location, TextLocatio /// Engine-level document combining content, metadata, and artifacts. /// /// Created during import and carried through the entire pipeline via -/// [`DocumentEnvelope`](crate::operation::DocumentEnvelope). +/// [`DocumentEnvelope`]. +/// +/// [`DocumentEnvelope`]: crate::operation::DocumentEnvelope pub struct Document { /// The decoded content handle (text, image, audio, or rich). pub handle: ContentHandle, diff --git a/crates/nvisy-engine/src/operation/envelope/mod.rs b/crates/nvisy-engine/src/operation/envelope/mod.rs index 51e44afe..798896c6 100644 --- a/crates/nvisy-engine/src/operation/envelope/mod.rs +++ b/crates/nvisy-engine/src/operation/envelope/mod.rs @@ -18,8 +18,9 @@ //! //! Each operation receives `&mut DocumentEnvelope` and reads/writes //! fields directly. Run-wide shared state (policies, registry, key -//! provider) is available via the [`shared`](DocumentEnvelope::shared) -//! field. +//! provider) is available via the [`shared`] field. +//! +//! [`shared`]: DocumentEnvelope::shared use std::fmt; use std::sync::Arc; @@ -41,10 +42,13 @@ pub use self::shared::SharedData; /// Created by import from a decoded [`ContentHandle`], then progressively /// enriched by detection, policy, and redaction operations. Operations /// receive `&mut DocumentEnvelope` and access run-wide shared state -/// via the [`shared`](DocumentEnvelope::shared) field. +/// via the [`shared`] field. /// -/// Detected entities live on [`audit.entities`](Audit::entities), +/// Detected entities live on [`audit.entities`], /// not as a top-level field. 
+/// +/// [`shared`]: DocumentEnvelope::shared +/// [`audit.entities`]: Audit::entities pub struct DocumentEnvelope { /// The document: content handle + metadata + artifacts. /// diff --git a/crates/nvisy-engine/src/operation/envelope/shared.rs b/crates/nvisy-engine/src/operation/envelope/shared.rs index f9e8d6ec..a12700e8 100644 --- a/crates/nvisy-engine/src/operation/envelope/shared.rs +++ b/crates/nvisy-engine/src/operation/envelope/shared.rs @@ -16,7 +16,9 @@ use crate::utility::encryption::SharedKeyProvider; /// Immutable run-wide state shared across all envelopes via `Arc`. /// /// Constructed once at the start of a pipeline run and stored on each -/// [`DocumentEnvelope`](crate::operation::DocumentEnvelope). +/// [`DocumentEnvelope`]. +/// +/// [`DocumentEnvelope`]: crate::operation::DocumentEnvelope #[derive(Clone)] pub struct SharedData { /// Unique identifier for this pipeline run. diff --git a/crates/nvisy-engine/src/operation/export_file.rs b/crates/nvisy-engine/src/operation/export_file.rs index dc498971..10c28b2f 100644 --- a/crates/nvisy-engine/src/operation/export_file.rs +++ b/crates/nvisy-engine/src/operation/export_file.rs @@ -11,6 +11,7 @@ //! 2. 
**Compression** — compress for storage or transfer (if format specified) use nvisy_core::Result; +use nvisy_core::content::{Content, ContentData, ContentSource}; use nvisy_ontology::workflow::{CompressionAlgorithm, EncryptionConfig}; use uuid::Uuid; @@ -77,7 +78,6 @@ impl ExportFileOp { } for &content_id in &self.content_ids { - use nvisy_core::content::{Content, ContentData, ContentSource}; let source = ContentSource::from_uuid(content_id); let data = ContentData::new(source, output_bytes.clone()); let content = Content::new(data); diff --git a/crates/nvisy-engine/src/operation/extraction/vision.rs b/crates/nvisy-engine/src/operation/extraction/vision.rs index fe081a63..8134ec33 100644 --- a/crates/nvisy-engine/src/operation/extraction/vision.rs +++ b/crates/nvisy-engine/src/operation/extraction/vision.rs @@ -9,7 +9,7 @@ use nvisy_codec::handler::ImageData; use nvisy_core::{Error, ErrorKind, Result}; use nvisy_ontology::entity::{Entities, ImageLocation}; use nvisy_ontology::workflow::VisualExtraction as VisualExtractionCfg; -use nvisy_provider::agent::{ImageFormat, ImageInput, ImageOutput, OcrAgent}; +use nvisy_provider::agent::{ImageFormat, ImageInput, ImageOutput, OcrAgent, VerificationCandidate}; use nvisy_provider::http::HttpClient; use crate::operation::{DocumentEnvelope, Operation}; @@ -106,7 +106,6 @@ impl VisualExtractionOp { if entities.is_empty() || spans.is_empty() { return Ok(entities); } - use nvisy_provider::agent::VerificationCandidate; let mut verified = entities.into_inner(); for span in spans { diff --git a/crates/nvisy-engine/src/pipeline/config/subsystem.rs b/crates/nvisy-engine/src/pipeline/config/subsystem.rs index 97970f0a..33ce7929 100644 --- a/crates/nvisy-engine/src/pipeline/config/subsystem.rs +++ b/crates/nvisy-engine/src/pipeline/config/subsystem.rs @@ -30,8 +30,10 @@ pub struct LlmSection { /// Speech-to-text subsystem configuration. 
/// -/// Controls the STT provider used by [`Extraction`](nvisy_ontology::workflow::Extraction) +/// Controls the STT provider used by [`Extraction`] /// nodes for audio transcription. +/// +/// [`Extraction`]: nvisy_ontology::workflow::Extraction #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct SttSection { /// STT provider selection and connection settings. diff --git a/crates/nvisy-engine/src/pipeline/config/validate.rs b/crates/nvisy-engine/src/pipeline/config/validate.rs index f9bb9d51..44e2f246 100644 --- a/crates/nvisy-engine/src/pipeline/config/validate.rs +++ b/crates/nvisy-engine/src/pipeline/config/validate.rs @@ -3,6 +3,8 @@ use std::env; use nvisy_core::Error; +use nvisy_provider::agent::{AgentProvider, OcrProvider}; +use nvisy_provider::audio::{SttProvider, TtsProvider}; use validator::Validate; use super::RuntimeConfig; @@ -55,13 +57,8 @@ impl RuntimeConfig { } } -#[allow(unused_variables, unused_imports)] -fn resolve_agent_provider_key( - provider: &mut Option<nvisy_provider::agent::AgentProvider>, - env_var: &str, -) { - use nvisy_provider::agent::AgentProvider; - +#[allow(unused_variables)] +fn resolve_agent_provider_key(provider: &mut Option<AgentProvider>, env_var: &str) { let Some(p) = provider else { return }; match p { #[cfg(feature = "openai")] @@ -74,13 +71,8 @@ fn resolve_agent_provider_key( } } -#[allow(unused_variables, unused_imports)] -fn resolve_stt_provider_key( - provider: &mut Option<nvisy_provider::audio::SttProvider>, - env_var: &str, -) { - use nvisy_provider::audio::SttProvider; - +#[allow(unused_variables)] +fn resolve_stt_provider_key(provider: &mut Option<SttProvider>, env_var: &str) { let Some(p) = provider else { return }; match p { #[cfg(feature = "openai")] @@ -89,13 +81,8 @@ fn resolve_stt_provider_key( } } -#[allow(unused_variables, unused_imports)] -fn resolve_tts_provider_key( - provider: &mut Option<nvisy_provider::audio::TtsProvider>, - env_var: &str, -) { - use nvisy_provider::audio::TtsProvider; - +#[allow(unused_variables)] +fn resolve_tts_provider_key(provider: &mut Option<TtsProvider>, env_var: &str) { let Some(p) = provider else { return
}; match p { #[cfg(feature = "openai")] @@ -104,13 +91,8 @@ fn resolve_tts_provider_key( } } -#[allow(unused_variables, unused_imports)] -fn resolve_ocr_provider_key( - provider: &mut Option<nvisy_provider::agent::OcrProvider>, - env_var: &str, -) { - use nvisy_provider::agent::OcrProvider; - +#[allow(unused_variables)] +fn resolve_ocr_provider_key(provider: &mut Option<OcrProvider>, env_var: &str) { let Some(p) = provider else { return }; match p { #[cfg(feature = "google")] diff --git a/crates/nvisy-engine/src/pipeline/runs/analytics.rs b/crates/nvisy-engine/src/pipeline/runs/analytics.rs index 859b24e4..ee1d2ecb 100644 --- a/crates/nvisy-engine/src/pipeline/runs/analytics.rs +++ b/crates/nvisy-engine/src/pipeline/runs/analytics.rs @@ -2,8 +2,11 @@ //! //! [`AnalyticsSnapshot`] captures status counts, actor count, and //! duration stats. The snapshot is computed on demand from the -//! in-memory [`RunState`](super::runs::state::RunState) and exposed -//! via [`Engine::snapshot`](super::Engine::snapshot). +//! in-memory [`RunState`] and exposed +//! via [`Engine::snapshot`]. +//! +//! [`RunState`]: super::state::RunState +//! [`Engine::snapshot`]: super::super::Engine::snapshot use jiff::Timestamp; use schemars::JsonSchema; diff --git a/crates/nvisy-engine/src/pipeline/runs/mod.rs b/crates/nvisy-engine/src/pipeline/runs/mod.rs index 66d829c5..d9c1f9fb 100644 --- a/crates/nvisy-engine/src/pipeline/runs/mod.rs +++ b/crates/nvisy-engine/src/pipeline/runs/mod.rs @@ -12,7 +12,9 @@ //! detail (`GET /runs`). //! //! The [`state`] submodule contains the volatile in-memory storage -//! ([`RunState`](state::RunState)) backing all run queries and mutations. +//! ([`RunState`]) backing all run queries and mutations. +//! +//! 
[`RunState`]: state::RunState mod analytics; pub(crate) mod state; diff --git a/crates/nvisy-engine/src/pipeline/runs/state.rs b/crates/nvisy-engine/src/pipeline/runs/state.rs index d3c08901..82941c70 100644 --- a/crates/nvisy-engine/src/pipeline/runs/state.rs +++ b/crates/nvisy-engine/src/pipeline/runs/state.rs @@ -2,9 +2,11 @@ //! //! [`RunState`] wraps an `Arc>>` providing //! concurrent read/write access to run records. It is cheaply clonable -//! (single `Arc` bump) and shared between the [`Engine`](super::super::Engine) +//! (single `Arc` bump) and shared between the [`Engine`] //! and the [orchestrator](super::super::orchestrator). //! +//! [`Engine`]: super::super::Engine +//! //! All queries are scoped by `actor_id` — an actor can only see and //! mutate their own runs. Finalization forces any still-pending or //! still-running nodes into `Failed` status to ensure every node reaches @@ -146,13 +148,15 @@ impl RunState { /// Transition a run to its final status and record aggregate counters. /// - /// If the run was already cancelled (via [`cancel_run`](Self::cancel_run)), + /// If the run was already cancelled (via [`cancel_run`]), /// the `Cancelled` status is preserved rather than being overwritten by /// the orchestrator's computed status. /// /// Any nodes still in `Pending` or `Running` state are forced to /// `Failed` with an explanatory error message, ensuring all nodes /// reach a terminal status. 
+ /// + /// [`cancel_run`]: Self::cancel_run pub async fn finalize( &self, run_id: Uuid, diff --git a/crates/nvisy-engine/src/utility/encryption/provider.rs b/crates/nvisy-engine/src/utility/encryption/provider.rs index 36e47dea..ce7eaeef 100644 --- a/crates/nvisy-engine/src/utility/encryption/provider.rs +++ b/crates/nvisy-engine/src/utility/encryption/provider.rs @@ -4,6 +4,7 @@ use std::collections::HashMap; use std::fmt; use std::sync::Arc; +use base64::Engine; use bytes::Bytes; use nvisy_core::{Error, Result}; @@ -69,8 +70,6 @@ impl StaticKeyProvider { /// Expects an object whose keys are key IDs and values are /// base64-encoded key bytes. Any other shape returns an empty provider. pub fn from_json(value: &serde_json::Value) -> Result<Self> { - use base64::Engine; - let Some(map) = value.as_object() else { return Ok(Self::default()); }; diff --git a/crates/nvisy-engine/src/utility/encryption/service.rs b/crates/nvisy-engine/src/utility/encryption/service.rs index 383eebfb..306124b6 100644 --- a/crates/nvisy-engine/src/utility/encryption/service.rs +++ b/crates/nvisy-engine/src/utility/encryption/service.rs @@ -4,11 +4,12 @@ use aes_gcm::aead::Aead; use aes_gcm::{Aes256Gcm, KeyInit, Nonce}; use bytes::Bytes; use nvisy_core::content::ContentData; -use nvisy_core::{Error, Result}; +use nvisy_core::{Error, ErrorKind, Result}; use nvisy_ontology::workflow::EncryptionAlgorithm; +use rand::RngExt; use super::provider::{KeyProvider, SharedKeyProvider}; -use super::wire::{EncryptedContent, WireEnvelope}; +use super::wire::{EncryptedContent, NONCE_SIZE, WireEnvelope}; use crate::operation::DocumentEnvelope; const TARGET: &str = "nvisy_engine::op::encryption"; @@ -30,11 +31,6 @@ impl CryptoService { /// Encrypt a [`DocumentEnvelope`] into an [`EncryptedContent`] blob. 
pub async fn encrypt(&self, envelope: &DocumentEnvelope) -> Result<EncryptedContent> { - use nvisy_core::ErrorKind; - use rand::RngExt; - - use super::wire::NONCE_SIZE; - let content_data = envelope.document.encode()?; let source = content_data.content_source; let plaintext = content_data.as_bytes(); diff --git a/crates/nvisy-ontology/src/context/entry.rs b/crates/nvisy-ontology/src/context/entry.rs index 50d74231..bc0fc9fa 100644 --- a/crates/nvisy-ontology/src/context/entry.rs +++ b/crates/nvisy-ontology/src/context/entry.rs @@ -35,7 +35,9 @@ pub enum ContextEntryData { Document(DocumentVariant), } -/// A single reference-data entry within a [`Context`](super::Context). +/// A single reference-data entry within a [`Context`]. +/// +/// [`Context`]: super::Context #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct ContextEntry { diff --git a/crates/nvisy-ontology/src/context/temporal/datetime.rs b/crates/nvisy-ontology/src/context/temporal/datetime.rs index e57e66b0..01cc7751 100644 --- a/crates/nvisy-ontology/src/context/temporal/datetime.rs +++ b/crates/nvisy-ontology/src/context/temporal/datetime.rs @@ -8,7 +8,9 @@ use serde::{Deserialize, Serialize}; /// /// Uses naive (timezone-unaware) date-times from [`jiff::civil::DateTime`]. /// For timezone-aware timestamps, use the entry-level `created_at` / -/// `expires_at` fields on [`ContextEntry`](crate::context::ContextEntry). +/// `expires_at` fields on [`ContextEntry`]. +/// +/// [`ContextEntry`]: crate::context::ContextEntry #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct DateTimeData { diff --git a/crates/nvisy-ontology/src/entity/category.rs b/crates/nvisy-ontology/src/entity/category.rs index c285b1a1..0e0258b5 100644 --- a/crates/nvisy-ontology/src/entity/category.rs +++ b/crates/nvisy-ontology/src/entity/category.rs @@ -1,9 +1,11 @@ //! Broad entity category classification. //! -//! 
[`EntityCategory`] groups related [`EntityKind`](super::EntityKind) +//! [`EntityCategory`] groups related [`EntityKind`] //! variants into policy-addressable buckets. Policy selectors can //! target an entire category (e.g. "redact all financial data") without //! enumerating individual kinds. +//! +//! [`EntityKind`]: super::EntityKind use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -11,8 +13,11 @@ use strum::{Display, EnumString}; /// Broad category of sensitive data. /// -/// Each [`EntityKind`](super::EntityKind) maps to exactly one category -/// via [`EntityKind::category()`](super::EntityKind::category). +/// Each [`EntityKind`] maps to exactly one category +/// via [`EntityKind::category()`]. +/// +/// [`EntityKind`]: super::EntityKind +/// [`EntityKind::category()`]: super::EntityKind::category #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] diff --git a/crates/nvisy-ontology/src/entity/kind.rs b/crates/nvisy-ontology/src/entity/kind.rs index 67304329..a5477627 100644 --- a/crates/nvisy-ontology/src/entity/kind.rs +++ b/crates/nvisy-ontology/src/entity/kind.rs @@ -109,7 +109,9 @@ pub enum EntityKind { Voiceprint, /// Retina or iris scan data. RetinaScan, - /// Facial geometry or face embedding (not a photo: see [`Face`](Self::Face)). + /// Facial geometry or face embedding (not a photo: see [`Face`]). + /// + /// [`Face`]: Self::Face FacialGeometry, // Credentials diff --git a/crates/nvisy-ontology/src/primitive/time_span.rs b/crates/nvisy-ontology/src/primitive/time_span.rs index d5fd8935..f8953311 100644 --- a/crates/nvisy-ontology/src/primitive/time_span.rs +++ b/crates/nvisy-ontology/src/primitive/time_span.rs @@ -12,8 +12,11 @@ const US_PER_SEC: i64 = 1_000_000; /// point rounding. At 48kHz sample rate, one sample is ~20.8μs — well /// within the 1μs resolution. 
/// -/// Use [`from_secs`](Self::from_secs) and [`start_secs`](Self::start_secs) +/// Use [`from_secs`] and [`start_secs`] /// for ergonomic conversion to/from seconds. +/// +/// [`from_secs`]: Self::from_secs +/// [`start_secs`]: Self::start_secs #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Serialize, Deserialize, JsonSchema)] pub struct TimeSpan { diff --git a/crates/nvisy-ontology/src/provenance/entry.rs b/crates/nvisy-ontology/src/provenance/entry.rs index 43d15618..4a48f525 100644 --- a/crates/nvisy-ontology/src/provenance/entry.rs +++ b/crates/nvisy-ontology/src/provenance/entry.rs @@ -34,8 +34,11 @@ pub enum AuditEntryStatus { /// the applicator with the replacement value and `is_applied` flag. /// /// Location and confidence are not stored here: they live on the -/// corresponding [`Entity`](crate::entity::Entity) in -/// [`Audit::entities`](super::Audit::entities), linked by `entity_id`. +/// corresponding [`Entity`] in +/// [`Audit::entities`], linked by `entity_id`. +/// +/// [`Entity`]: crate::entity::Entity +/// [`Audit::entities`]: super::Audit::entities #[derive(Debug, Clone, Builder, Serialize, Deserialize, JsonSchema)] #[builder( name = "AuditEntryBuilder", diff --git a/crates/nvisy-ontology/src/provenance/mod.rs b/crates/nvisy-ontology/src/provenance/mod.rs index 77d45cae..1b1c2e00 100644 --- a/crates/nvisy-ontology/src/provenance/mod.rs +++ b/crates/nvisy-ontology/src/provenance/mod.rs @@ -26,8 +26,11 @@ use crate::entity::{ContentSource, Entities}; /// A per-document audit trail: detected entities and redaction entries. /// /// `Audit` is the single compliance artifact for a document. 
It tracks: -/// - **What was found**: via [`entities`](Self::entities) -/// - **What was redacted and how**: via [`entries`](Self::entries) +/// - **What was found**: via [`entities`] +/// - **What was redacted and how**: via [`entries`] +/// +/// [`entities`]: Self::entities +/// [`entries`]: Self::entries #[derive(Debug, Clone, Builder, Serialize, Deserialize, JsonSchema)] #[builder( name = "AuditBuilder", diff --git a/crates/nvisy-ontology/src/provenance/review.rs b/crates/nvisy-ontology/src/provenance/review.rs index 37ef8a07..e68e5bf9 100644 --- a/crates/nvisy-ontology/src/provenance/review.rs +++ b/crates/nvisy-ontology/src/provenance/review.rs @@ -25,9 +25,11 @@ pub enum ReviewStatus { /// A review decision recorded against a redaction, including versioning. /// -/// Present on an [`AuditEntry`](super::AuditEntry) only when the +/// Present on an [`AuditEntry`] only when the /// redaction has been reviewed (or is pending review). Absent for /// entries that have not entered the review workflow. +/// +/// [`AuditEntry`]: super::AuditEntry #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct ReviewDecision { diff --git a/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs b/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs index 9654ff8d..396d5d49 100644 --- a/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs +++ b/crates/nvisy-pattern/src/dictionaries/dictionary_registry.rs @@ -173,13 +173,15 @@ impl DictionaryRegistry { /// /// Files with unrecognised extensions are logged as warnings and /// skipped. Loaded dictionaries are inserted into `self`, so this - /// can be called after [`load_builtins`](Self::load_builtins) to + /// can be called after [`load_builtins`] to /// layer user-provided dictionaries on top of the built-ins. /// /// # Errors /// /// Returns [`nvisy_core::Error`] if the directory cannot be read, /// a file cannot be read, or a CSV file fails to parse. 
+ /// + /// [`load_builtins`]: Self::load_builtins #[tracing::instrument(target = TARGET, name = "dictionaries.load_dir", skip_all, fields(path = %dir.as_ref().display(), count))] pub fn load_dir(&mut self, dir: impl AsRef<Path>) -> nvisy_core::Result<()> { let dir = dir.as_ref(); diff --git a/crates/nvisy-pattern/src/engine/allow_list.rs b/crates/nvisy-pattern/src/engine/allow_list.rs index 7604401e..58388ccf 100644 --- a/crates/nvisy-pattern/src/engine/allow_list.rs +++ b/crates/nvisy-pattern/src/engine/allow_list.rs @@ -5,7 +5,7 @@ use std::collections::HashSet; /// Exact-match allow list for suppressing known false positives. /// /// Values that appear in the allow list are silently dropped from -/// [`PatternEngine::scan_entities`](super::PatternEngine::scan_entities) results. +/// [`PatternEngine::scan_entities`] results. /// /// # Examples /// @@ -14,6 +14,8 @@ use std::collections::HashSet; /// .with("123-45-6789") /// .with("000-00-0000"); /// ``` +/// +/// [`PatternEngine::scan_entities`]: super::PatternEngine::scan_entities #[derive(Debug, Clone, Default)] pub struct AllowList { pub(crate) values: HashSet, diff --git a/crates/nvisy-pattern/src/engine/builder.rs b/crates/nvisy-pattern/src/engine/builder.rs index dd0efb2d..1a5f9d95 100644 --- a/crates/nvisy-pattern/src/engine/builder.rs +++ b/crates/nvisy-pattern/src/engine/builder.rs @@ -11,7 +11,9 @@ use crate::validators::ValidatorResolver; /// Builder for [`PatternEngine`]. /// /// By default all built-in patterns are included. Use -/// [`with_patterns`](Self::with_patterns) to restrict to a subset. +/// [`with_patterns`] to restrict to a subset. +/// +/// [`with_patterns`]: Self::with_patterns #[derive(Default)] pub struct PatternEngineBuilder { pattern_names: Option>, @@ -33,7 +35,9 @@ impl PatternEngineBuilder { /// Set the minimum confidence score for matches. /// /// Matches with confidence below this value are discarded during - /// [`scan_entities`](PatternEngine::scan_entities). Defaults to `0.0`. 
+ /// [`scan_entities`]. Defaults to `0.0`. + /// + /// [`scan_entities`]: PatternEngine::scan_entities pub fn with_confidence_threshold(mut self, threshold: f64) -> Self { self.confidence_threshold = threshold; self diff --git a/crates/nvisy-pattern/src/engine/error.rs b/crates/nvisy-pattern/src/engine/error.rs index 337a18b0..ad801eb5 100644 --- a/crates/nvisy-pattern/src/engine/error.rs +++ b/crates/nvisy-pattern/src/engine/error.rs @@ -4,7 +4,9 @@ use nvisy_core::{Error, ErrorKind}; -/// Errors that can occur while building a [`PatternEngine`](super::PatternEngine). +/// Errors that can occur while building a [`PatternEngine`]. +/// +/// [`PatternEngine`]: super::PatternEngine #[derive(Debug, thiserror::Error)] pub(crate) enum PatternEngineError { /// A regex pattern string failed to compile. diff --git a/crates/nvisy-pattern/src/engine/scan_context.rs b/crates/nvisy-pattern/src/engine/scan_context.rs index 8469f5ed..48b86d6f 100644 --- a/crates/nvisy-pattern/src/engine/scan_context.rs +++ b/crates/nvisy-pattern/src/engine/scan_context.rs @@ -5,7 +5,7 @@ use super::deny_list::DenyList; /// Per-scan configuration for allow and deny lists. /// -/// Passed to [`PatternEngine::scan_entities`](super::PatternEngine::scan_entities) +/// Passed to [`PatternEngine::scan_entities`] /// to control per-invocation suppression and forced detection without /// rebuilding the engine. /// @@ -24,6 +24,8 @@ use super::deny_list::DenyList; /// })); /// let matches = PatternEngine::instance().scan_entities("text", &ctx); /// ``` +/// +/// [`PatternEngine::scan_entities`]: super::PatternEngine::scan_entities #[derive(Debug, Clone, Default)] pub struct ScanContext { pub(super) allow: AllowList, diff --git a/crates/nvisy-pattern/src/patterns/json_pattern.rs b/crates/nvisy-pattern/src/patterns/json_pattern.rs index e9a17b55..765feef5 100644 --- a/crates/nvisy-pattern/src/patterns/json_pattern.rs +++ b/crates/nvisy-pattern/src/patterns/json_pattern.rs @@ -1,9 +1,11 @@ //! 
JSON-backed [`JsonPattern`] implementation. //! //! Each JSON file under `assets/patterns/` is deserialized into a -//! [`JsonPattern`] via [`from_bytes`](JsonPattern::from_bytes). The method +//! [`JsonPattern`] via [`from_bytes`]. The method //! returns the validated pattern together with any non-fatal //! [`JsonPatternWarning`]s so the caller can decide how to surface them. +//! +//! [`from_bytes`]: JsonPattern::from_bytes use nvisy_ontology::entity::{EntityCategory, EntityKind}; use serde::Deserialize; diff --git a/crates/nvisy-pattern/src/patterns/pattern.rs b/crates/nvisy-pattern/src/patterns/pattern.rs index 7d0640e4..9fa70cf4 100644 --- a/crates/nvisy-pattern/src/patterns/pattern.rs +++ b/crates/nvisy-pattern/src/patterns/pattern.rs @@ -32,7 +32,9 @@ pub struct RegexPattern { impl RegexPattern { /// Return the regex string ready for compilation. /// - /// Prepends `(?i)` when [`case_sensitive`](Self::case_sensitive) is `false`. + /// Prepends `(?i)` when [`case_sensitive`] is `false`. + /// + /// [`case_sensitive`]: Self::case_sensitive pub fn effective_regex(&self) -> String { if self.case_sensitive { self.regex.clone() diff --git a/crates/nvisy-pattern/src/patterns/pattern_registry.rs b/crates/nvisy-pattern/src/patterns/pattern_registry.rs index a11f00e0..1b7d6842 100644 --- a/crates/nvisy-pattern/src/patterns/pattern_registry.rs +++ b/crates/nvisy-pattern/src/patterns/pattern_registry.rs @@ -184,13 +184,15 @@ impl PatternRegistry { /// /// Non-`.json` files are logged as warnings and skipped. Loaded /// patterns are inserted into `self`, so this can be called after - /// [`load_builtins`](Self::load_builtins) to layer user-provided + /// [`load_builtins`] to layer user-provided /// patterns on top of the built-ins. /// /// # Errors /// /// Returns [`nvisy_core::Error`] if the directory cannot be read, /// a file cannot be read, or a JSON file fails to parse. 
+ /// + /// [`load_builtins`]: Self::load_builtins #[tracing::instrument(target = TARGET, name = "patterns.load_dir", skip_all, fields(path = %dir.as_ref().display(), count))] pub fn load_dir(&mut self, dir: impl AsRef<Path>) -> nvisy_core::Result<()> { let dir = dir.as_ref(); diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index 955916cd..c1ffd132 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ b/crates/nvisy-pattern/src/validators/mod.rs @@ -24,9 +24,12 @@ pub type ValidatorFn = fn(&str) -> bool; /// Maps validator names to [`ValidatorFn`]s. /// -/// Created with the built-in validators via [`builtins`](Self::builtins) +/// Created with the built-in validators via [`builtins`] /// (or [`Default`]), then optionally extended with -/// [`register`](Self::register) for custom validators. +/// [`register`] for custom validators. +/// +/// [`builtins`]: Self::builtins +/// [`register`]: Self::register #[derive(Debug, Clone)] pub struct ValidatorResolver { table: HashMap<&'static str, ValidatorFn>, diff --git a/crates/nvisy-provider/src/agent/base/builder.rs b/crates/nvisy-provider/src/agent/base/builder.rs index 738cda96..42170fee 100644 --- a/crates/nvisy-provider/src/agent/base/builder.rs +++ b/crates/nvisy-provider/src/agent/base/builder.rs @@ -1,4 +1,6 @@ -//! Builder for [`BaseAgent`](super::BaseAgent). +//! Builder for [`BaseAgent`]. +//! +//! [`BaseAgent`]: super::BaseAgent use rig::agent::{Agent, AgentBuilder}; use rig::client::CompletionClient; @@ -16,7 +18,9 @@ use crate::http::{HttpClient, HttpConfig}; /// /// Created via [`BaseAgent::builder`]. Collects a provider reference, config, /// and optional tools, then constructs the concrete rig-core agent on -/// [`build`](Self::build). +/// [`build`]. 
+/// +/// [`build`]: Self::build pub(crate) struct BaseAgentBuilder { provider: AgentProvider, config: AgentConfig, diff --git a/crates/nvisy-provider/src/agent/cv/mod.rs b/crates/nvisy-provider/src/agent/cv/mod.rs index 6731aa01..46708e6c 100644 --- a/crates/nvisy-provider/src/agent/cv/mod.rs +++ b/crates/nvisy-provider/src/agent/cv/mod.rs @@ -1,9 +1,11 @@ //! Computer vision agent for face, license plate, and signature detection. //! -//! [`CvAgent`] wraps a [`BaseAgent`](crate::backend::BaseAgent) with a +//! [`CvAgent`] wraps a [`BaseAgent`] with a //! [`CvProvider`]-backed tool. It encodes an image as base64, prompts the //! VLM to call the CV tool, and returns classified entities with bounding //! boxes. +//! +//! [`BaseAgent`]: crate::backend::BaseAgent mod output; mod prompt; @@ -54,13 +56,15 @@ pub trait CvProvider: Send + Sync { /// /// # Workflow /// -/// 1. Caller passes raw image bytes to [`detect`](Self::detect). +/// 1. Caller passes raw image bytes to [`detect`]. /// 2. The agent base64-encodes the image and builds a user prompt via /// `CvPromptBuilder`. /// 3. The VLM is instructed to call the `cv_detect_objects` tool (backed /// by the [`CvProvider`]) and then classify each detection into an /// entity category and type. /// 4. Structured output is parsed into a `Vec`. +/// +/// [`detect`]: Self::detect pub struct CvAgent { base: BaseAgent, } diff --git a/crates/nvisy-provider/src/agent/generate/mod.rs b/crates/nvisy-provider/src/agent/generate/mod.rs index 15ba04e7..a4ea67c2 100644 --- a/crates/nvisy-provider/src/agent/generate/mod.rs +++ b/crates/nvisy-provider/src/agent/generate/mod.rs @@ -1,8 +1,10 @@ //! Text generation agent for generating synthetic replacement values. //! -//! [`GenAgent`] wraps a [`BaseAgent`](super::BaseAgent) with +//! [`GenAgent`] wraps a [`BaseAgent`] with //! generation-specific prompts. It is a pure LLM agent (no tools) that //! generates realistic fake values to replace detected PII/entities. +//! +//! 
[`BaseAgent`]: super::BaseAgent mod output; mod prompt; @@ -35,9 +37,11 @@ pub struct GenRequest { /// # Workflow /// /// 1. Caller passes a batch of [`GenRequest`]s to -/// [`generate`](Self::generate). +/// [`generate`]. /// 2. The agent builds a user prompt via `GenPromptBuilder`. /// 3. Structured output is parsed into `Vec`. +/// +/// [`generate`]: Self::generate pub struct GenAgent { base: BaseAgent, } diff --git a/crates/nvisy-provider/src/agent/ner/context.rs b/crates/nvisy-provider/src/agent/ner/context.rs index c4f1c456..eb0dbcb5 100644 --- a/crates/nvisy-provider/src/agent/ner/context.rs +++ b/crates/nvisy-provider/src/agent/ner/context.rs @@ -8,9 +8,12 @@ use super::{KnownNerEntity, NerEntity}; /// entities so the LLM can assign consistent `entity_id` values across /// chunks or sequential calls. /// -/// Use [`merge`](Self::merge) to accumulate entities from successive -/// detection calls, then update the text with [`set_text`](Self::set_text) +/// Use [`merge`] to accumulate entities from successive +/// detection calls, then update the text with [`set_text`] /// before the next call. +/// +/// [`merge`]: Self::merge +/// [`set_text`]: Self::set_text pub struct NerContext<'a> { /// The text to analyse. pub text: &'a str, diff --git a/crates/nvisy-provider/src/agent/ner/mod.rs b/crates/nvisy-provider/src/agent/ner/mod.rs index c5a3ea40..178680bd 100644 --- a/crates/nvisy-provider/src/agent/ner/mod.rs +++ b/crates/nvisy-provider/src/agent/ner/mod.rs @@ -1,8 +1,10 @@ //! Named Entity Recognition (NER) agent for textual PII/entity detection. //! -//! [`NerAgent`] wraps a [`BaseAgent`](crate::backend::BaseAgent) with +//! [`NerAgent`] wraps a [`BaseAgent`] with //! NER-specific prompts. It is a pure LLM agent (no tools) that analyses //! text and returns structured entity detections. +//! +//! [`BaseAgent`]: crate::backend::BaseAgent mod context; mod output; @@ -31,10 +33,12 @@ const TARGET: &str = "nvisy_provider::agent::ner"; /// # Workflow /// /// 1. 
Caller passes a [`NerContext`] and a [`DetectionConfig`] to -/// [`detect`](Self::detect). +/// [`detect`]. /// 2. The agent builds a user prompt via `NerPromptBuilder` that /// specifies entity types, confidence thresholds, and known entities. /// 3. Structured output is parsed into `Vec`. +/// +/// [`detect`]: Self::detect pub struct NerAgent { base: BaseAgent, state: Mutex>, @@ -124,11 +128,13 @@ impl NerAgent { /// /// Manages coreference state internally: previously detected entities /// are carried forward so the LLM can assign consistent `entity_id` - /// values across successive calls. Call [`reset`](Self::reset) to + /// values across successive calls. Call [`reset`] to /// clear the state between documents. /// /// The caller is responsible for adjusting byte offsets to be /// document-relative after this call. + /// + /// [`reset`]: Self::reset #[tracing::instrument( target = "nvisy_provider::agent::ner", skip_all, diff --git a/crates/nvisy-provider/src/agent/ocr/input.rs b/crates/nvisy-provider/src/agent/ocr/input.rs index c50efb3d..1770d82b 100644 --- a/crates/nvisy-provider/src/agent/ocr/input.rs +++ b/crates/nvisy-provider/src/agent/ocr/input.rs @@ -17,7 +17,9 @@ pub struct VerificationCandidate { /// An entity proposed by NER that the VLM should verify against the image. #[derive(Debug, Clone, PartialEq, Deserialize, Serialize, JsonSchema)] pub struct ProposedEntity { - /// Index used to correlate with [`VerifiedEntity::id`](super::VerifiedEntity::id). + /// Index used to correlate with [`VerifiedEntity::id`]. + /// + /// [`VerifiedEntity::id`]: super::VerifiedEntity::id pub id: usize, /// Broad classification. pub category: EntityCategory, diff --git a/crates/nvisy-provider/src/agent/ocr/mod.rs b/crates/nvisy-provider/src/agent/ocr/mod.rs index 61722f5f..2517fb72 100644 --- a/crates/nvisy-provider/src/agent/ocr/mod.rs +++ b/crates/nvisy-provider/src/agent/ocr/mod.rs @@ -1,8 +1,10 @@ //! 
Unified OCR agent: extraction + optional LLM-based verification. //! -//! [`OcrAgent`] wraps an [`OcrEngine`](crate::ocr::OcrEngine) with an +//! [`OcrAgent`] wraps an [`OcrEngine`] with an //! optional LLM verifier. It is the single public entry point for all //! OCR operations. +//! +//! [`OcrEngine`]: crate::ocr::OcrEngine mod input; mod output; @@ -70,7 +72,9 @@ impl OcrAgent { /// Run OCR on a single image. /// /// Uses the params provided at construction. To override, use - /// [`run_with_params`](Self::run_with_params). + /// [`run_with_params`]. + /// + /// [`run_with_params`]: Self::run_with_params #[tracing::instrument( target = TARGET, skip_all, @@ -97,7 +101,9 @@ impl OcrAgent { /// Run OCR on multiple images. /// /// Uses the params provided at construction. To override, use - /// [`run_batch_with_params`](Self::run_batch_with_params). + /// [`run_batch_with_params`]. + /// + /// [`run_batch_with_params`]: Self::run_batch_with_params #[tracing::instrument(target = TARGET, skip_all, fields(count = images.len()))] pub async fn run_batch(&self, images: &[ImageInput]) -> Result> { self.engine.run_batch(images, &self.params).await diff --git a/crates/nvisy-provider/src/audio/stt/provider.rs b/crates/nvisy-provider/src/audio/stt/provider.rs index 3d8e0f92..c73658b7 100644 --- a/crates/nvisy-provider/src/audio/stt/provider.rs +++ b/crates/nvisy-provider/src/audio/stt/provider.rs @@ -51,7 +51,9 @@ impl SttProvider { /// Create a local transcription provider with a custom base URL. /// - /// **Not yet implemented** — see [`local`](Self::local). + /// **Not yet implemented** — see [`local`]. 
+ /// + /// [`local`]: Self::local pub fn local_with_url(model: &str, url: &str) -> Self { Self::Local(UnauthenticatedProvider { model: model.to_owned(), diff --git a/crates/nvisy-provider/src/audio/tts/provider.rs b/crates/nvisy-provider/src/audio/tts/provider.rs index d8b002ab..b5f8eefc 100644 --- a/crates/nvisy-provider/src/audio/tts/provider.rs +++ b/crates/nvisy-provider/src/audio/tts/provider.rs @@ -51,7 +51,9 @@ impl TtsProvider { /// Create a local TTS provider with a custom base URL. /// - /// **Not yet implemented** — see [`local`](Self::local). + /// **Not yet implemented** — see [`local`]. + /// + /// [`local`]: Self::local pub fn local_with_url(model: &str, url: &str) -> Self { Self::Local(UnauthenticatedProvider { model: model.to_owned(), diff --git a/crates/nvisy-provider/src/ocr/params.rs b/crates/nvisy-provider/src/ocr/params.rs index ec731f13..f3c2c079 100644 --- a/crates/nvisy-provider/src/ocr/params.rs +++ b/crates/nvisy-provider/src/ocr/params.rs @@ -13,8 +13,10 @@ use crate::ocr::provider::{PaddleXBackend, PaddleXParams, SuryaBackend, SuryaPar /// Union of all provider parameter types. /// /// Each variant holds the configuration needed to construct one OCR backend. -/// Use [`into_engine`](OcrProvider::into_engine) to build a ready-to-use +/// Use [`into_engine`] to build a ready-to-use /// `OcrEngine` from any variant. 
+/// +/// [`into_engine`]: OcrProvider::into_engine #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind", rename_all = "kebab-case")] pub enum OcrProvider { diff --git a/crates/nvisy-python/src/bridge/mod.rs b/crates/nvisy-python/src/bridge/mod.rs index 95a1d23a..6992ed98 100644 --- a/crates/nvisy-python/src/bridge/mod.rs +++ b/crates/nvisy-python/src/bridge/mod.rs @@ -7,6 +7,8 @@ mod error; use std::fmt; +use std::future::Future; +use std::pin::Pin; use hipstr::HipStr; use nvisy_core::Error; @@ -139,9 +141,6 @@ impl PythonBridge { where F: FnOnce(Python<'_>) -> Result, Error> + Send + 'static, { - use std::future::Future; - use std::pin::Pin; - tracing::Span::current().record("method", method); let future: Pin>> + Send>> = diff --git a/crates/nvisy-server/src/extract/actor.rs b/crates/nvisy-server/src/extract/actor.rs index 871cb3ef..87c7bcf8 100644 --- a/crates/nvisy-server/src/extract/actor.rs +++ b/crates/nvisy-server/src/extract/actor.rs @@ -2,7 +2,9 @@ //! //! Wraps the raw header value into a typed [`ActorId`] newtype, //! rejecting requests that omit the header or supply an invalid UUID -//! with our standard [`ErrorResponse`](crate::handler::response::ErrorResponse). +//! with our standard [`ErrorResponse`]. +//! +//! [`ErrorResponse`]: crate::handler::response::ErrorResponse use aide::OperationInput; use axum::extract::FromRequestParts; diff --git a/crates/nvisy-server/src/extract/json.rs b/crates/nvisy-server/src/extract/json.rs index e4b370b5..e2f13930 100644 --- a/crates/nvisy-server/src/extract/json.rs +++ b/crates/nvisy-server/src/extract/json.rs @@ -1,8 +1,10 @@ //! Custom `Json` extractor that converts rejections into [`Error`]. //! //! Wraps [`axum::Json`] so that malformed JSON bodies produce our -//! standard [`ErrorResponse`](crate::handler::response::ErrorResponse) +//! standard [`ErrorResponse`] //! instead of axum's default plain-text rejection. +//! +//! 
[`ErrorResponse`]: crate::handler::response::ErrorResponse use aide::OperationInput; use axum::extract::rejection::JsonRejection; diff --git a/crates/nvisy-server/src/extract/path.rs b/crates/nvisy-server/src/extract/path.rs index cefe8e1a..0175bd13 100644 --- a/crates/nvisy-server/src/extract/path.rs +++ b/crates/nvisy-server/src/extract/path.rs @@ -2,8 +2,10 @@ //! //! Wraps [`axum::extract::Path`] so that invalid path parameters //! (e.g. a malformed UUID) produce our standard -//! [`ErrorResponse`](crate::handler::response::ErrorResponse) +//! [`ErrorResponse`] //! instead of axum's default plain-text rejection. +//! +//! [`ErrorResponse`]: crate::handler::response::ErrorResponse use aide::OperationInput; use axum::extract::FromRequestParts; diff --git a/crates/nvisy-server/src/handler/request/mod.rs b/crates/nvisy-server/src/handler/request/mod.rs index 7f9932b0..2d6011e2 100644 --- a/crates/nvisy-server/src/handler/request/mod.rs +++ b/crates/nvisy-server/src/handler/request/mod.rs @@ -1,8 +1,11 @@ //! Typed request bodies and path parameters for API endpoints. //! -//! Each struct derives [`Deserialize`](serde::Deserialize) and -//! [`JsonSchema`](schemars::JsonSchema) for automatic OpenAPI schema +//! Each struct derives [`Deserialize`] and +//! [`JsonSchema`] for automatic OpenAPI schema //! generation via aide. +//! +//! [`Deserialize`]: serde::Deserialize +//! [`JsonSchema`]: schemars::JsonSchema mod contexts; mod files; diff --git a/crates/nvisy-server/src/handler/response/mod.rs b/crates/nvisy-server/src/handler/response/mod.rs index 77ce92fc..6c254ebc 100644 --- a/crates/nvisy-server/src/handler/response/mod.rs +++ b/crates/nvisy-server/src/handler/response/mod.rs @@ -1,8 +1,11 @@ //! Typed response bodies for API endpoints. //! -//! Each struct derives [`Serialize`](serde::Serialize) and -//! [`JsonSchema`](schemars::JsonSchema) for automatic OpenAPI schema +//! Each struct derives [`Serialize`] and +//! 
[`JsonSchema`] for automatic OpenAPI schema //! generation via aide. +//! +//! [`Serialize`]: serde::Serialize +//! [`JsonSchema`]: schemars::JsonSchema mod check; mod contexts; diff --git a/crates/nvisy-server/src/middleware/recovery.rs b/crates/nvisy-server/src/middleware/recovery.rs index 60d09be7..2672c6a1 100644 --- a/crates/nvisy-server/src/middleware/recovery.rs +++ b/crates/nvisy-server/src/middleware/recovery.rs @@ -26,6 +26,7 @@ use serde::{Deserialize, Serialize}; use serde_with::{DurationSeconds, serde_as}; use tower::ServiceBuilder; use tower::timeout::TimeoutLayer; +use tower::timeout::error::Elapsed; use tower_http::catch_panic::CatchPanicLayer; use super::constants::DEFAULT_REQUEST_TIMEOUT_SECS; @@ -111,11 +112,11 @@ where /// Converts a Tower service error into an appropriate HTTP error response. /// -/// Distinguishes timeouts ([`Elapsed`](tower::timeout::error::Elapsed)) +/// Distinguishes timeouts ([`Elapsed`]) /// from other middleware errors and logs accordingly. +/// +/// [`Elapsed`]: tower::timeout::error::Elapsed pub(crate) fn handle_error(err: tower::BoxError) -> ResponseFut { - use tower::timeout::error::Elapsed; - if err.downcast_ref::().is_some() { tracing::error!( target: TRACING_TARGET_ERROR, @@ -141,9 +142,12 @@ pub(crate) fn handle_error(err: tower::BoxError) -> ResponseFut { /// Converts a panic payload into a `500 Internal Server Error` response. /// /// Returns `Response` directly (not a future) because -/// [`ResponseForPanic`](tower_http::catch_panic::ResponseForPanic) requires +/// [`ResponseForPanic`] requires /// a synchronous return, unlike [`handle_error`] which returns a -/// [`BoxFuture`](futures::future::BoxFuture). +/// [`BoxFuture`]. +/// +/// [`ResponseForPanic`]: tower_http::catch_panic::ResponseForPanic +/// [`BoxFuture`]: futures::future::BoxFuture fn catch_panic(err: Panic) -> Response { let message = err .downcast_ref::()