diff --git a/Cargo.lock b/Cargo.lock index 00a7a003..44addfdc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1897,6 +1897,12 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "hound" +version = "3.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f" + [[package]] name = "html5ever" version = "0.39.0" @@ -3003,6 +3009,7 @@ dependencies = [ "derive_more", "futures", "hipstr", + "hound", "image", "imageproc", "lopdf", diff --git a/Cargo.toml b/Cargo.toml index 73154764..24bed152 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -113,6 +113,9 @@ quick-xml = { version = "0.39", features = [] } image = { version = "0.25", default-features = false, features = ["png", "jpeg", "tiff"] } imageproc = { version = "0.26", features = [] } +# Audio processing +hound = { version = "3.5", features = [] } + # Python interop pyo3 = { version = "0.28", features = [] } pyo3-async-runtimes = { version = "0.28", features = ["tokio-runtime"] } diff --git a/crates/nvisy-codec/Cargo.toml b/crates/nvisy-codec/Cargo.toml index 00142816..253414f5 100644 --- a/crates/nvisy-codec/Cargo.toml +++ b/crates/nvisy-codec/Cargo.toml @@ -56,13 +56,16 @@ uuid = { workspace = true, features = [] } hipstr = { workspace = true, features = [] } # Derive macros and error handling -derive_more = { workspace = true, features = ["as_ref", "deref", "deref_mut", "display", "from"] } +derive_more = { workspace = true, features = ["as_ref", "deref", "deref_mut", "display", "from", "into_iterator"] } thiserror = { workspace = true, features = [] } # Image processing image = { workspace = true, features = [] } imageproc = { workspace = true, features = [] } +# Audio processing +hound = { workspace = true, features = [] } + # PDF processing (feature-gated) lopdf = { workspace = true, optional = true, features = [] } pdfium-render = { workspace = true, optional = true, features = [] } diff --git 
a/crates/nvisy-codec/src/document/located.rs b/crates/nvisy-codec/src/document/located.rs index 8fbedcdc..2ba9076b 100644 --- a/crates/nvisy-codec/src/document/located.rs +++ b/crates/nvisy-codec/src/document/located.rs @@ -11,7 +11,7 @@ use nvisy_core::content::ContentSource; /// [`Redactions`] — the source is metadata about how the location /// was produced, not part of its identity. /// -/// [`Redactions`]: crate::transform::Redactions +/// [`Redactions`]: crate::handler::Redactions #[derive(Debug, Clone, PartialEq)] pub struct Located { /// The handler-level source that produced this location. diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index 60410d52..03cbc923 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -18,11 +18,12 @@ pub use self::located::Located; pub use self::span::Span; pub use self::stream::LocationStream; use crate::handler::{ - AudioData, AudioHandler, BoxedAudioHandler, BoxedImageHandler, BoxedRichHandler, - BoxedTabularHandler, BoxedTextHandler, CsvLoader, CsvParams, Handler, ImageData, ImageHandler, - JpegLoader, JpegParams, JsonLoader, JsonParams, Loader, MarkdownLoader, MarkdownParams, - Mp3Loader, Mp3Params, PngLoader, PngParams, TabularHandler, TextData, TextHandler, TiffLoader, - TiffParams, TxtLoader, TxtParams, WavLoader, WavParams, XlsxLoader, XlsxParams, + AudioData, AudioHandler, AudioRedaction, BoxedAudioHandler, BoxedImageHandler, + BoxedRichHandler, BoxedTabularHandler, BoxedTextHandler, CsvLoader, CsvParams, Handler, + ImageData, ImageHandler, ImageRedaction, JpegLoader, JpegParams, JsonLoader, JsonParams, + Loader, MarkdownLoader, MarkdownParams, Mp3Loader, Mp3Params, PngLoader, PngParams, Redactions, + TabularHandler, TabularRedaction, TextData, TextHandler, TextRedaction, TiffLoader, TiffParams, + TxtLoader, TxtParams, WavLoader, WavParams, XlsxLoader, XlsxParams, }; #[cfg(feature = "docx")] use crate::handler::{DocxLoader, 
DocxParams}; @@ -30,9 +31,6 @@ use crate::handler::{DocxLoader, DocxParams}; use crate::handler::{HtmlLoader, HtmlParams}; #[cfg(feature = "pdf")] use crate::handler::{PdfLoader, PdfParams}; -use crate::transform::{ - AudioRedaction, ImageRedaction, Redactions, TabularRedaction, TextRedaction, -}; /// A fully type-erased document that can hold any supported format. /// @@ -170,7 +168,7 @@ impl ContentHandle { redactions: Redactions, ) -> Result<(), Error> { match self { - Self::Text(h) => h.redact(redactions).await, + Self::Text(h) => TextHandler::redact(h, redactions).await, Self::Rich(h) => TextHandler::redact(h, redactions).await, Self::Tabular(_) | Self::Image(_) | Self::Audio(_) => Ok(()), } @@ -193,7 +191,7 @@ impl ContentHandle { redactions: Redactions, ) -> Result<(), Error> { match self { - Self::Image(h) => h.redact(redactions).await, + Self::Image(h) => ImageHandler::redact(h, redactions).await, Self::Rich(h) => ImageHandler::redact(h, redactions).await, Self::Text(_) | Self::Tabular(_) | Self::Audio(_) => Ok(()), } diff --git a/crates/nvisy-codec/src/handler/audio/apply.rs b/crates/nvisy-codec/src/handler/audio/apply.rs new file mode 100644 index 00000000..bf0c6118 --- /dev/null +++ b/crates/nvisy-codec/src/handler/audio/apply.rs @@ -0,0 +1,184 @@ +//! Helper for applying a single [`AudioRedaction`] to a typed sample +//! buffer in place. + +use nvisy_ontology::primitive::TimeSpan; + +use crate::handler::{AudioOutput, AudioRedaction}; + +const TARGET: &str = "nvisy_codec::handler::audio"; + +/// Apply a single redaction to `samples` in place. +/// +/// `samples` is a flat, channel-interleaved buffer of `S`. `channels` +/// is the number of channels (1 for mono, 2 for stereo). `sample_rate` +/// is the sample rate in Hz. The redaction expresses its range as a +/// [`TimeSpan`] supplied separately by the caller — under the +/// `(location, redaction)` shape the time span lives on the +/// [`AudioLocation`], not the redaction. 
+/// +/// Ordering across multiple redactions is the caller's +/// responsibility: an [`AudioOutput::Remove`] shrinks the buffer, so +/// later time spans must be applied first to keep earlier ones' +/// indices valid. See [`AudioHandler::redact`]. +/// +/// [`AudioLocation`]: nvisy_ontology::entity::AudioLocation +/// [`AudioHandler::redact`]: crate::handler::AudioHandler::redact +pub(crate) fn apply_audio_redaction( + samples: &mut Vec, + time_span: TimeSpan, + redaction: &AudioRedaction, + sample_rate: u32, + channels: u16, +) where + S: Default + Clone, +{ + let (start_sample, end_sample) = + samples_for_time_span(time_span.start_us, time_span.end_us, sample_rate, channels); + let start = start_sample.min(samples.len()); + let end = end_sample.min(samples.len()); + if start >= end { + return; + } + match &redaction.output { + AudioOutput::Silence => { + for s in &mut samples[start..end] { + *s = S::default(); + } + } + AudioOutput::Remove => { + samples.drain(start..end); + } + AudioOutput::Replace { .. } => { + tracing::warn!( + target: TARGET, + start_us = time_span.start_us, + end_us = time_span.end_us, + "AudioOutput::Replace is not yet implemented, skipping", + ); + } + } +} + +/// Convert a `[start_us, end_us)` time span to a `[start_sample, +/// end_sample)` index range into a channel-interleaved sample buffer. +/// +/// Rounds half-up at the frame boundary, then multiplies by `channels` +/// so the returned indices land on frame boundaries (no stereo channel +/// swap on [`AudioOutput::Remove`]). 
/// Convert a `[start_us, end_us)` time span to a `[start_sample,
/// end_sample)` index range into a channel-interleaved sample buffer.
///
/// Each endpoint is rounded half-up to the nearest frame and then
/// multiplied by `channels`, so both indices land on frame boundaries
/// (removing the range never swaps channels in interleaved audio).
///
/// Negative timestamps map to index 0. Values too large for `usize`
/// saturate at `usize::MAX` instead of wrapping; callers should clamp
/// the result to the buffer length.
fn samples_for_time_span(
    start_us: i64,
    end_us: i64,
    sample_rate: u32,
    channels: u16,
) -> (usize, usize) {
    let samples_per_frame = usize::from(channels);
    let to_index = |us: i64| us_to_frame(us, sample_rate).saturating_mul(samples_per_frame);
    (to_index(start_us), to_index(end_us))
}

/// Convert a microsecond offset to a frame index at `sample_rate` Hz,
/// rounding half-up. Non-positive offsets yield 0.
fn us_to_frame(us: i64, sample_rate: u32) -> usize {
    if us <= 0 {
        return 0;
    }
    // u64 * u32 plus the rounding bias always fits in u128.
    let scaled = u128::from(us.unsigned_abs()) * u128::from(sample_rate) + 500_000;
    // Saturate rather than truncate: a wrapped index could fall below
    // the start index and cause the whole redaction to be skipped.
    usize::try_from(scaled / 1_000_000).unwrap_or(usize::MAX)
}
apply_audio_redaction( + &mut samples, + span(0, 999_999_000), + &AudioRedaction::new(AudioOutput::Silence), + 1000, + 1, + ); + assert_eq!(samples, vec![0, 0, 0, 0, 0]); + } + + #[test] + fn replace_is_warned_and_skipped() { + let mut samples: Vec = (1..=5).collect(); + apply_audio_redaction( + &mut samples, + span(0, 3_000), + &AudioRedaction::new(AudioOutput::Replace { data: vec![] }), + 1000, + 1, + ); + assert_eq!(samples, vec![1, 2, 3, 4, 5]); + } +} diff --git a/crates/nvisy-codec/src/handler/audio/audio_handler.rs b/crates/nvisy-codec/src/handler/audio/audio_handler.rs index ee79dc11..080f7424 100644 --- a/crates/nvisy-codec/src/handler/audio/audio_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/audio_handler.rs @@ -9,8 +9,7 @@ use nvisy_ontology::entity::AudioLocation; use super::{AudioData, Mp3Handler, WavHandler}; use crate::document::LocationStream; -use crate::handler::{AudioHandler, Handler}; -use crate::transform::{AudioRedaction, Redactions}; +use crate::handler::{AudioHandler, AudioRedaction, Handler}; /// A type-erased audio handler backed by a boxed trait object. pub struct BoxedAudioHandler(Box); @@ -66,10 +65,11 @@ impl AudioHandler for BoxedAudioHandler { self.0.read(location).await } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &AudioLocation, + redaction: AudioRedaction, ) -> Result<(), Error> { - self.0.redact(redactions).await + self.0.redact_at(location, redaction).await } } diff --git a/crates/nvisy-codec/src/handler/audio/audio_handler_macro.rs b/crates/nvisy-codec/src/handler/audio/audio_handler_macro.rs deleted file mode 100644 index fe61b310..00000000 --- a/crates/nvisy-codec/src/handler/audio/audio_handler_macro.rs +++ /dev/null @@ -1,104 +0,0 @@ -//! [`impl_audio_handler!`]: shared macro for audio handler structs. - -/// Implement [`Handler`] + [`AudioHandler`] + inherent methods for an -/// audio handler struct that holds raw bytes. 
-/// -/// [`Handler`]: crate::handler::Handler -/// [`AudioHandler`]: crate::handler::AudioHandler -macro_rules! impl_audio_handler { - ($handler:ident, $doc_type:expr, $origin:literal, $encode_name:literal) => { - impl crate::handler::Handler for $handler { - fn document_type(&self) -> nvisy_core::media::DocumentType { - $doc_type - } - - fn source(&self) -> nvisy_core::content::ContentSource { - self.source - } - - #[tracing::instrument(name = $encode_name, skip_all, fields(output_bytes))] - fn encode(&self) -> Result { - tracing::Span::current().record("output_bytes", self.bytes.len()); - let source = nvisy_core::content::ContentSource::new().with_parent(&self.source); - Ok(nvisy_core::content::ContentData::new( - source, - self.bytes.clone(), - )) - } - } - - #[async_trait::async_trait] - impl crate::handler::AudioHandler for $handler { - fn locations( - &self, - ) -> crate::document::LocationStream<'_, nvisy_ontology::entity::AudioLocation> - { - use ::std::iter; - - // Single-track audio: the entire audio as one location - // with a time span covering the full duration. Duration - // is unknown without decoding — use 0..0 as a - // placeholder. The actual time span is set by the STT - // extraction operation after transcription. - let location = nvisy_ontology::entity::AudioLocation { - time_span: nvisy_ontology::primitive::TimeSpan { - start_us: 0, - end_us: 0, - }, - speaker_id: None, - audio_id: None, - }; - crate::document::LocationStream::new(futures::stream::iter(iter::once( - crate::document::Located::new(self.source, location), - ))) - } - - async fn read( - &self, - _location: &nvisy_ontology::entity::AudioLocation, - ) -> Option { - // Full audio segment: extracting a sub-segment by - // time span requires decoding, which we don't do here. 
- Some(crate::handler::AudioData::new(self.bytes.clone())) - } - - async fn redact( - &mut self, - _redactions: crate::transform::Redactions< - nvisy_ontology::entity::AudioLocation, - crate::transform::AudioRedaction, - >, - ) -> Result<(), nvisy_core::Error> { - // TODO: implement audio redaction (silence/remove time ranges) - tracing::warn!( - target: $origin, - "audio redaction is not yet implemented" - ); - Ok(()) - } - } - - impl $handler { - /// Create a handler from raw audio bytes. - pub fn new(bytes: bytes::Bytes) -> Self { - Self { - source: nvisy_core::content::ContentSource::new(), - bytes, - } - } - - /// Set the content source for lineage tracking. - pub fn with_source(mut self, source: nvisy_core::content::ContentSource) -> Self { - self.source = source; - self - } - - /// Reference to the raw audio bytes. - pub fn bytes(&self) -> &bytes::Bytes { - &self.bytes - } - } - }; -} - -pub(crate) use impl_audio_handler; diff --git a/crates/nvisy-codec/src/handler/audio/instruction.rs b/crates/nvisy-codec/src/handler/audio/instruction.rs new file mode 100644 index 00000000..e486bf91 --- /dev/null +++ b/crates/nvisy-codec/src/handler/audio/instruction.rs @@ -0,0 +1,46 @@ +//! Audio redaction instruction types. + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::handler::Mergeable; + +/// An audio redaction: the *how*. The *where* (time span, speaker, +/// audio id) lives on the containing [`AudioLocation`] via +/// [`Redactions`]'s `(S, R)` pairs. +/// +/// [`AudioLocation`]: nvisy_ontology::entity::AudioLocation +/// [`Redactions`]: crate::handler::Redactions +#[derive(Debug, Clone, PartialEq)] +pub struct AudioRedaction { + /// The redaction output that determines the rendering method. + pub(crate) output: AudioOutput, +} + +impl AudioRedaction { + /// Create a new audio redaction. + pub fn new(output: AudioOutput) -> Self { + Self { output } + } +} + +/// Audio redaction output — records the method used. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum AudioOutput { + /// Segment replaced with silence. + Silence, + /// Segment removed entirely. + Remove, + /// Segment replaced with provided audio data. + Replace { data: Vec }, +} + +impl Mergeable for AudioRedaction { + /// Combine two redactions that target overlapping locations. + /// Returns `Some` only when the outputs match; conflicting methods + /// (e.g. silence vs remove) cannot be reconciled. + fn try_merge(self, other: Self) -> Option { + (self.output == other.output).then_some(self) + } +} diff --git a/crates/nvisy-codec/src/handler/audio/mod.rs b/crates/nvisy-codec/src/handler/audio/mod.rs index 56885b8f..000f6874 100644 --- a/crates/nvisy-codec/src/handler/audio/mod.rs +++ b/crates/nvisy-codec/src/handler/audio/mod.rs @@ -1,23 +1,27 @@ //! Audio format handlers and loaders. +use std::cmp::Reverse; + use nvisy_core::Error; use nvisy_ontology::entity::AudioLocation; use super::Handler; use crate::document::LocationStream; -use crate::transform::{AudioRedaction, Redactions}; +use crate::handler::Redactions; +mod apply; mod audio_data; mod audio_handler; -mod audio_handler_macro; +mod instruction; mod mp3_handler; mod mp3_loader; mod wav_handler; mod wav_loader; +pub(crate) use self::apply::apply_audio_redaction; pub use self::audio_data::AudioData; pub use self::audio_handler::BoxedAudioHandler; -use self::audio_handler_macro::impl_audio_handler; +pub use self::instruction::{AudioOutput, AudioRedaction}; pub use self::mp3_handler::Mp3Handler; pub use self::mp3_loader::{Mp3Loader, Mp3Params}; pub use self::wav_handler::WavHandler; @@ -25,10 +29,22 @@ pub use self::wav_loader::{WavLoader, WavParams}; /// Capability trait for handlers that expose audio content. 
/// -/// Handlers expose audio content as a stream of [`AudioLocation`]s -/// (cheap, identity-only), with explicit `read` calls to fetch the -/// payload for any given location, and a `redact` call that applies a -/// batch of [`AudioRedaction`]s grouped by location. +/// Handlers implement three narrow operations: +/// - [`locations`]: cheap, identity-only stream of [`AudioLocation`]s. +/// - [`read`]: fetch the payload for the time range identified by a +/// location. +/// - [`redact_at`]: apply a single redaction at a single time range. +/// +/// Batched redaction is provided by [`redact`], which overrides the +/// default loop ordering to apply later time spans first — an +/// [`AudioOutput::Remove`] shrinks the buffer and shifts every later +/// sample index, so right-to-left order keeps earlier indices valid. +/// +/// [`locations`]: AudioHandler::locations +/// [`read`]: AudioHandler::read +/// [`redact_at`]: AudioHandler::redact_at +/// [`redact`]: AudioHandler::redact +/// [`AudioOutput::Remove`]: crate::handler::AudioOutput::Remove #[async_trait::async_trait] pub trait AudioHandler: Handler { /// Async stream of [`AudioLocation`]s for this document, each @@ -42,9 +58,26 @@ pub trait AudioHandler: Handler { /// Returns `None` if the location is out of bounds. async fn read(&self, location: &AudioLocation) -> Option; - /// Apply a batch of redactions grouped by [`AudioLocation`]. + /// Apply a single redaction to the time range identified by + /// `location`, mutating in place. + async fn redact_at( + &mut self, + location: &AudioLocation, + redaction: AudioRedaction, + ) -> Result<(), Error>; + + /// Apply every `(location, redaction)` pair in `redactions` to the + /// handler, sorted right-to-left by `time_span.start_us`. The first + /// error aborts the batch. 
async fn redact( &mut self, redactions: Redactions, - ) -> Result<(), Error>; + ) -> Result<(), Error> { + let mut items: Vec<_> = redactions.into_iter().collect(); + items.sort_by_key(|(loc, _)| Reverse(loc.time_span.start_us)); + for (location, redaction) in items { + self.redact_at(&location, redaction).await?; + } + Ok(()) + } } diff --git a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs index 78006e5c..f170acb7 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs @@ -1,34 +1,139 @@ //! MP3 handler: holds raw MP3 audio bytes and provides location-based //! access via [`AudioHandler`]. //! -//! [`AudioHandler::locations`] yields a single full-duration -//! [`AudioLocation`]; [`AudioHandler::read`] returns the underlying -//! bytes as [`AudioData`]. Redaction is currently a no-op. +//! Redaction is **not supported**: no pure-Rust MP3 encoder exists and +//! pulling a C dependency (libmp3lame) is out of scope here. Callers +//! get an explicit error from [`AudioHandler::redact_at`]; under +//! [`AudioHandler::redact`] this aborts the document's pipeline at the +//! first redaction. Convert to WAV upstream if audio redaction is +//! required. //! //! [`AudioHandler`]: crate::handler::AudioHandler -//! [`AudioHandler::locations`]: crate::handler::AudioHandler::locations -//! [`AudioHandler::read`]: crate::handler::AudioHandler::read -//! [`AudioLocation`]: nvisy_ontology::entity::AudioLocation +//! [`AudioHandler::redact_at`]: crate::handler::AudioHandler::redact_at +//! 
[`AudioHandler::redact`]: crate::handler::AudioHandler::redact -use nvisy_core::content::ContentSource; +use bytes::Bytes; +use nvisy_core::Error; +use nvisy_core::content::{ContentData, ContentSource}; +use nvisy_core::media::{AudioFormat, DocumentType}; +use nvisy_ontology::entity::AudioLocation; +use nvisy_ontology::primitive::TimeSpan; -use super::impl_audio_handler; +use crate::document::{Located, LocationStream}; +use crate::handler::{AudioData, AudioHandler, AudioRedaction, Handler}; + +const TARGET: &str = "mp3-handler"; /// Handler for loaded MP3 content. -/// -/// Stores the raw audio bytes directly. The bytes can be produced -/// on demand via [`Handler::encode`]. -/// -/// [`Handler::encode`]: crate::handler::Handler::encode #[derive(Debug)] pub struct Mp3Handler { source: ContentSource, - bytes: bytes::Bytes, + bytes: Bytes, +} + +impl Mp3Handler { + /// Create a handler from raw MP3 bytes. + pub fn new(bytes: Bytes) -> Self { + Self { + source: ContentSource::new(), + bytes, + } + } + + /// Set the content source for lineage tracking. + pub fn with_source(mut self, source: ContentSource) -> Self { + self.source = source; + self + } + + /// Reference to the raw audio bytes. 
+ pub fn bytes(&self) -> &Bytes { + &self.bytes + } +} + +impl Handler for Mp3Handler { + fn document_type(&self) -> DocumentType { + DocumentType::Audio(AudioFormat::Mp3) + } + + fn source(&self) -> ContentSource { + self.source + } + + #[tracing::instrument(name = "mp3.encode", skip_all, fields(output_bytes))] + fn encode(&self) -> Result { + tracing::Span::current().record("output_bytes", self.bytes.len()); + let source = ContentSource::new().with_parent(&self.source); + Ok(ContentData::new(source, self.bytes.clone())) + } } -impl_audio_handler!( - Mp3Handler, - nvisy_core::media::DocumentType::Audio(nvisy_core::media::AudioFormat::Mp3), - "mp3-handler", - "mp3.encode" -); +#[async_trait::async_trait] +impl AudioHandler for Mp3Handler { + fn locations(&self) -> LocationStream<'_, AudioLocation> { + let location = AudioLocation { + time_span: TimeSpan { + start_us: 0, + end_us: 0, + }, + speaker_id: None, + audio_id: None, + }; + LocationStream::new(futures::stream::iter(std::iter::once(Located::new( + self.source, + location, + )))) + } + + async fn read(&self, _location: &AudioLocation) -> Option { + Some(AudioData::new(self.bytes.clone())) + } + + async fn redact_at( + &mut self, + _location: &AudioLocation, + _redaction: AudioRedaction, + ) -> Result<(), Error> { + Err(Error::validation( + "MP3 redaction is not yet supported — convert audio to WAV before redaction", + TARGET, + )) + } +} + +#[cfg(test)] +mod tests { + use nvisy_ontology::primitive::TimeSpan; + + use super::*; + use crate::handler::{AudioHandler, AudioOutput, ConflictPolicy, Redactions}; + + #[tokio::test] + async fn redact_with_entries_errors() { + let mut handler = Mp3Handler::new(Bytes::from_static(b"fake mp3")); + let location = AudioLocation { + time_span: TimeSpan { + start_us: 0, + end_us: 1_000, + }, + speaker_id: None, + audio_id: None, + }; + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert(location, AudioRedaction::new(AudioOutput::Silence)) + .unwrap(); + let 
err = handler.redact(rs).await.unwrap_err(); + assert!( + err.to_string() + .contains("MP3 redaction is not yet supported") + ); + } + + #[tokio::test] + async fn empty_redactions_is_noop() { + let mut handler = Mp3Handler::new(Bytes::from_static(b"fake mp3")); + let rs: Redactions = Redactions::default(); + handler.redact(rs).await.unwrap(); + } +} diff --git a/crates/nvisy-codec/src/handler/audio/wav_handler.rs b/crates/nvisy-codec/src/handler/audio/wav_handler.rs index 49611ca8..4bbc9db5 100644 --- a/crates/nvisy-codec/src/handler/audio/wav_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/wav_handler.rs @@ -1,34 +1,315 @@ //! WAV handler: holds raw WAV audio bytes and provides location-based //! access via [`AudioHandler`]. //! -//! [`AudioHandler::locations`] yields a single full-duration -//! [`AudioLocation`]; [`AudioHandler::read`] returns the underlying -//! bytes as [`AudioData`]. Redaction is currently a no-op. +//! Redaction decodes the WAV via [`hound`], applies a single +//! sample-level mutation, and re-encodes back to bytes. +//! Supported formats are `i8` / `i16` / `i32` PCM and `f32` IEEE +//! float; other bit depths surface a clear error. +//! +//! Batched redaction goes through [`AudioHandler::redact`], which +//! sorts right-to-left by `time_span.start_us` so +//! [`AudioOutput::Remove`] operations don't shift the indices of +//! pending redactions. //! //! [`AudioHandler`]: crate::handler::AudioHandler -//! [`AudioHandler::locations`]: crate::handler::AudioHandler::locations -//! [`AudioHandler::read`]: crate::handler::AudioHandler::read -//! [`AudioLocation`]: nvisy_ontology::entity::AudioLocation +//! [`AudioHandler::redact`]: crate::handler::AudioHandler::redact +//! 
[`AudioOutput::Remove`]: crate::handler::AudioOutput::Remove + +use std::io::Cursor; + +use bytes::Bytes; +use hound::{Sample, SampleFormat, WavReader, WavSpec, WavWriter}; +use nvisy_core::Error; +use nvisy_core::content::{ContentData, ContentSource}; +use nvisy_core::media::{AudioFormat, DocumentType}; +use nvisy_ontology::entity::AudioLocation; +use nvisy_ontology::primitive::TimeSpan; -use nvisy_core::content::ContentSource; +use super::{AudioRedaction, apply_audio_redaction}; +use crate::document::{Located, LocationStream}; +use crate::handler::{AudioData, AudioHandler, Handler}; -use super::impl_audio_handler; +const TARGET: &str = "wav-handler"; /// Handler for loaded WAV content. -/// -/// Stores the raw audio bytes directly. The bytes can be produced -/// on demand via [`Handler::encode`]. -/// -/// [`Handler::encode`]: crate::handler::Handler::encode #[derive(Debug)] pub struct WavHandler { source: ContentSource, - bytes: bytes::Bytes, + bytes: Bytes, +} + +impl WavHandler { + /// Create a handler from raw WAV bytes. + pub fn new(bytes: Bytes) -> Self { + Self { + source: ContentSource::new(), + bytes, + } + } + + /// Set the content source for lineage tracking. + pub fn with_source(mut self, source: ContentSource) -> Self { + self.source = source; + self + } + + /// Reference to the raw audio bytes. 
+ pub fn bytes(&self) -> &Bytes { + &self.bytes + } +} + +impl Handler for WavHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Audio(AudioFormat::Wav) + } + + fn source(&self) -> ContentSource { + self.source + } + + #[tracing::instrument(name = "wav.encode", skip_all, fields(output_bytes))] + fn encode(&self) -> Result { + tracing::Span::current().record("output_bytes", self.bytes.len()); + let source = ContentSource::new().with_parent(&self.source); + Ok(ContentData::new(source, self.bytes.clone())) + } +} + +#[async_trait::async_trait] +impl AudioHandler for WavHandler { + fn locations(&self) -> LocationStream<'_, AudioLocation> { + // Single-track audio: the entire audio as one location with a + // time span covering the full duration. Duration is unknown + // without decoding — use 0..0 as a placeholder. The actual + // time span is set by the STT extraction operation after + // transcription. + let location = AudioLocation { + time_span: TimeSpan { + start_us: 0, + end_us: 0, + }, + speaker_id: None, + audio_id: None, + }; + LocationStream::new(futures::stream::iter(std::iter::once(Located::new( + self.source, + location, + )))) + } + + async fn read(&self, _location: &AudioLocation) -> Option { + // Full audio segment: extracting a sub-segment by time span + // requires decoding, which we don't do here. + Some(AudioData::new(self.bytes.clone())) + } + + async fn redact_at( + &mut self, + location: &AudioLocation, + redaction: AudioRedaction, + ) -> Result<(), Error> { + let spec = read_spec(&self.bytes)?; + let new_bytes = match (spec.sample_format, spec.bits_per_sample) { + (SampleFormat::Int, 8) => { + redact_typed::(&self.bytes, spec, location.time_span, &redaction)? + } + (SampleFormat::Int, 16) => { + redact_typed::(&self.bytes, spec, location.time_span, &redaction)? + } + (SampleFormat::Int, 24 | 32) => { + redact_typed::(&self.bytes, spec, location.time_span, &redaction)? 
+ } + (SampleFormat::Float, 32) => { + redact_typed::(&self.bytes, spec, location.time_span, &redaction)? + } + _ => { + return Err(Error::validation( + format!( + "WAV format not yet supported: {:?}/{} bits", + spec.sample_format, spec.bits_per_sample + ), + TARGET, + )); + } + }; + self.bytes = Bytes::from(new_bytes); + Ok(()) + } +} + +/// Read just the WAV header to discover the sample format. +fn read_spec(bytes: &Bytes) -> Result { + let reader = WavReader::new(Cursor::new(bytes.as_ref())) + .map_err(|e| Error::validation(format!("invalid WAV: {e}"), TARGET))?; + Ok(reader.spec()) +} + +/// Decode → redact → re-encode for a specific sample type. +fn redact_typed( + bytes: &Bytes, + spec: WavSpec, + time_span: TimeSpan, + redaction: &AudioRedaction, +) -> Result, Error> +where + S: Sample + Default + Clone, +{ + let mut reader = WavReader::new(Cursor::new(bytes.as_ref())) + .map_err(|e| Error::validation(format!("invalid WAV: {e}"), TARGET))?; + let mut samples: Vec = reader + .samples::() + .collect::, _>>() + .map_err(|e| Error::validation(format!("WAV sample decode error: {e}"), TARGET))?; + + apply_audio_redaction( + &mut samples, + time_span, + redaction, + spec.sample_rate, + spec.channels, + ); + + let mut buf = Cursor::new(Vec::::new()); + { + let mut writer = WavWriter::new(&mut buf, spec) + .map_err(|e| Error::validation(format!("WAV writer init error: {e}"), TARGET))?; + for sample in samples { + writer + .write_sample(sample) + .map_err(|e| Error::validation(format!("WAV sample write error: {e}"), TARGET))?; + } + writer + .finalize() + .map_err(|e| Error::validation(format!("WAV finalize error: {e}"), TARGET))?; + } + Ok(buf.into_inner()) } -impl_audio_handler!( - WavHandler, - nvisy_core::media::DocumentType::Audio(nvisy_core::media::AudioFormat::Wav), - "wav-handler", - "wav.encode" -); +#[cfg(test)] +mod tests { + use hound::SampleFormat; + + use super::*; + use crate::handler::{AudioHandler, AudioOutput, ConflictPolicy, Redactions}; + + /// 
Encode a mono i16 PCM WAV with the given samples at 1 kHz. + fn encode_wav_mono_i16(samples: &[i16]) -> Bytes { + let spec = WavSpec { + channels: 1, + sample_rate: 1000, + bits_per_sample: 16, + sample_format: SampleFormat::Int, + }; + let mut buf = Cursor::new(Vec::::new()); + { + let mut writer = WavWriter::new(&mut buf, spec).unwrap(); + for &s in samples { + writer.write_sample(s).unwrap(); + } + writer.finalize().unwrap(); + } + Bytes::from(buf.into_inner()) + } + + fn decode_wav_mono_i16(bytes: &Bytes) -> Vec { + let mut reader = WavReader::new(Cursor::new(bytes.as_ref())).unwrap(); + reader.samples::().map(Result::unwrap).collect() + } + + fn location(start_us: i64, end_us: i64) -> AudioLocation { + AudioLocation { + time_span: TimeSpan { start_us, end_us }, + speaker_id: None, + audio_id: None, + } + } + + #[tokio::test] + async fn silence_zeros_samples_in_range() { + // 10 samples at 1 kHz = 10 ms. + let bytes = encode_wav_mono_i16(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + let mut handler = WavHandler::new(bytes); + + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + location(3_000, 6_000), + AudioRedaction::new(AudioOutput::Silence), + ) + .unwrap(); + handler.redact(rs).await.unwrap(); + + let samples = decode_wav_mono_i16(handler.bytes()); + assert_eq!(samples, vec![1, 2, 3, 0, 0, 0, 7, 8, 9, 10]); + } + + #[tokio::test] + async fn remove_shortens_file() { + let bytes = encode_wav_mono_i16(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + let mut handler = WavHandler::new(bytes); + + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + location(3_000, 6_000), + AudioRedaction::new(AudioOutput::Remove), + ) + .unwrap(); + handler.redact(rs).await.unwrap(); + + let samples = decode_wav_mono_i16(handler.bytes()); + assert_eq!(samples, vec![1, 2, 3, 7, 8, 9, 10]); + } + + #[tokio::test] + async fn multiple_removes_apply_right_to_left() { + // Two non-overlapping Remove redactions: + // [1..3) removes samples 1..3 (values 2, 3) + // 
[6..8) removes samples 6..8 (values 7, 8) + // Both time spans measured against original 10-sample buffer. + let bytes = encode_wav_mono_i16(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + let mut handler = WavHandler::new(bytes); + + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + location(1_000, 3_000), + AudioRedaction::new(AudioOutput::Remove), + ) + .unwrap(); + rs.try_insert( + location(6_000, 8_000), + AudioRedaction::new(AudioOutput::Remove), + ) + .unwrap(); + handler.redact(rs).await.unwrap(); + + let samples = decode_wav_mono_i16(handler.bytes()); + // After right-to-left: remove 7,8 first → [1,2,3,4,5,6,9,10], + // then remove 2,3 → [1,4,5,6,9,10]. + assert_eq!(samples, vec![1, 4, 5, 6, 9, 10]); + } + + #[tokio::test] + async fn empty_redactions_is_noop() { + let bytes = encode_wav_mono_i16(&[1, 2, 3]); + let original = bytes.clone(); + let mut handler = WavHandler::new(bytes); + + let rs: Redactions = Redactions::default(); + handler.redact(rs).await.unwrap(); + assert_eq!(handler.bytes(), &original); + } + + #[tokio::test] + async fn unsupported_format_returns_error() { + // Bogus bytes — not a real WAV. read_spec fails. + let mut handler = WavHandler::new(Bytes::from_static(b"not-a-wav")); + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + location(0, 1_000), + AudioRedaction::new(AudioOutput::Silence), + ) + .unwrap(); + let err = handler.redact(rs).await.unwrap_err(); + assert!(err.to_string().contains("invalid WAV")); + } +} diff --git a/crates/nvisy-codec/src/handler/image/apply.rs b/crates/nvisy-codec/src/handler/image/apply.rs new file mode 100644 index 00000000..22414eac --- /dev/null +++ b/crates/nvisy-codec/src/handler/image/apply.rs @@ -0,0 +1,58 @@ +//! Helper for applying a single [`ImageRedaction`] to a +//! [`DynamicImage`] in place. 
+ +use image::DynamicImage; +use nvisy_ontology::primitive::BoundingBox; + +use super::ops::ImageOps; +use crate::handler::{ImageOutput, ImageRedaction}; + +const TARGET: &str = "nvisy_codec::handler::image"; + +/// Apply a single redaction to `img` in place at the given bounding +/// box. The bounding box comes from the redaction's containing +/// [`ImageLocation`] under the `(location, redaction)` shape — not from +/// the redaction itself. +/// +/// Replace outputs whose embedded image data fails to decode are +/// skipped with a warning. +/// +/// [`ImageLocation`]: nvisy_ontology::entity::ImageLocation +pub(crate) fn apply_image_redaction( + img: &mut DynamicImage, + redaction: &ImageRedaction, + bounding_box: BoundingBox, +) { + let region = bounding_box.to_pixel(); + match &redaction.output { + ImageOutput::Blur { sigma } => { + img.apply_gaussian_blur(®ion, *sigma); + } + ImageOutput::Block { color } => { + img.apply_block_overlay(®ion, *color); + } + ImageOutput::Pixelate { block_size } => { + img.apply_pixelate(®ion, *block_size); + } + ImageOutput::Replace { data } => { + let replacement = match image::load_from_memory(data) { + Ok(r) => r, + Err(e) => { + tracing::warn!( + target: TARGET, + region = ?region, + error = %e, + "failed to decode replacement image data, skipping region" + ); + return; + } + }; + let resized = replacement.resize_exact( + region.width, + region.height, + image::imageops::FilterType::Lanczos3, + ); + image::imageops::overlay(img, &resized, region.x as i64, region.y as i64); + } + } +} diff --git a/crates/nvisy-codec/src/handler/image/image_handler.rs b/crates/nvisy-codec/src/handler/image/image_handler.rs index b1db2149..2a194865 100644 --- a/crates/nvisy-codec/src/handler/image/image_handler.rs +++ b/crates/nvisy-codec/src/handler/image/image_handler.rs @@ -9,8 +9,7 @@ use nvisy_ontology::entity::ImageLocation; use super::{ImageData, JpegHandler, PngHandler, TiffHandler}; use crate::document::LocationStream; -use 
crate::handler::{Handler, ImageHandler}; -use crate::transform::{ImageRedaction, Redactions}; +use crate::handler::{Handler, ImageHandler, ImageRedaction}; /// A type-erased image handler backed by a boxed trait object. pub struct BoxedImageHandler(Box); @@ -72,10 +71,11 @@ impl ImageHandler for BoxedImageHandler { self.0.read(location).await } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &ImageLocation, + redaction: ImageRedaction, ) -> Result<(), Error> { - self.0.redact(redactions).await + self.0.redact_at(location, redaction).await } } diff --git a/crates/nvisy-codec/src/handler/image/image_handler_macro.rs b/crates/nvisy-codec/src/handler/image/image_handler_macro.rs index 88521d74..ea843381 100644 --- a/crates/nvisy-codec/src/handler/image/image_handler_macro.rs +++ b/crates/nvisy-codec/src/handler/image/image_handler_macro.rs @@ -70,21 +70,16 @@ macro_rules! impl_image_handler { Some(crate::handler::ImageData::from(cropped)) } - async fn redact( + async fn redact_at( &mut self, - redactions: crate::transform::Redactions< - nvisy_ontology::entity::ImageLocation, - crate::transform::ImageRedaction, - >, + location: &nvisy_ontology::entity::ImageLocation, + redaction: crate::handler::ImageRedaction, ) -> Result<(), nvisy_core::Error> { - if redactions.is_empty() { - return Ok(()); - } - // Image handlers expose a single full-image location; apply - // every redaction in the collection to the single image. - for (_loc, items) in redactions { - crate::transform::apply_image_redactions(&mut self.image, &items); - } + crate::handler::image::apply_image_redaction( + &mut self.image, + &redaction, + location.bounding_box, + ); Ok(()) } } diff --git a/crates/nvisy-codec/src/handler/image/instruction.rs b/crates/nvisy-codec/src/handler/image/instruction.rs new file mode 100644 index 00000000..412e69ec --- /dev/null +++ b/crates/nvisy-codec/src/handler/image/instruction.rs @@ -0,0 +1,49 @@ +//! 
Image redaction instruction types. + +use nvisy_ontology::primitive::Color; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::handler::Mergeable; + +/// An image redaction: the *how*. The *where* (bounding box, page +/// number, image id) lives on the containing [`ImageLocation`] via +/// [`Redactions`]'s `(S, R)` pairs. +/// +/// [`ImageLocation`]: nvisy_ontology::entity::ImageLocation +/// [`Redactions`]: crate::handler::Redactions +#[derive(Debug, Clone, PartialEq)] +pub struct ImageRedaction { + /// The redaction output that determines the rendering method. + pub(crate) output: ImageOutput, +} + +impl ImageRedaction { + /// Create a new image redaction with the given output. + pub fn new(output: ImageOutput) -> Self { + Self { output } + } +} + +/// Image redaction output: records the method used and its parameters. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum ImageOutput { + /// Gaussian blur applied to the region. + Blur { sigma: f32 }, + /// Opaque block overlay on the region. + Block { color: Color }, + /// Pixelation (mosaic) applied to the region. + Pixelate { block_size: u32 }, + /// Region replaced with provided image data. + Replace { data: Vec }, +} + +impl Mergeable for ImageRedaction { + /// Combine two redactions that target overlapping locations. + /// Returns `Some` only when the outputs match (method *and* + /// parameters); a Blur and a Pixelate cannot be reconciled. 
+ fn try_merge(self, other: Self) -> Option { + (self.output == other.output).then_some(self) + } +} diff --git a/crates/nvisy-codec/src/handler/image/mod.rs b/crates/nvisy-codec/src/handler/image/mod.rs index 6c8989a6..290f6429 100644 --- a/crates/nvisy-codec/src/handler/image/mod.rs +++ b/crates/nvisy-codec/src/handler/image/mod.rs @@ -5,11 +5,14 @@ use nvisy_ontology::entity::ImageLocation; use super::Handler; use crate::document::LocationStream; -use crate::transform::{ImageRedaction, Redactions}; +use crate::handler::Redactions; +mod apply; mod image_data; mod image_handler; mod image_handler_macro; +mod instruction; +mod ops; mod jpeg_handler; mod jpeg_loader; @@ -20,9 +23,11 @@ mod png_loader; mod tiff_handler; mod tiff_loader; +pub(crate) use self::apply::apply_image_redaction; pub use self::image_data::ImageData; pub use self::image_handler::BoxedImageHandler; pub(crate) use self::image_handler_macro::impl_image_handler; +pub use self::instruction::{ImageOutput, ImageRedaction}; pub use self::jpeg_handler::JpegHandler; pub use self::jpeg_loader::{JpegLoader, JpegParams}; pub use self::png_handler::PngHandler; @@ -32,10 +37,19 @@ pub use self::tiff_loader::{TiffLoader, TiffParams}; /// Capability trait for handlers that expose image content. /// -/// Handlers expose image content as a stream of [`ImageLocation`]s -/// (cheap, identity-only), with explicit `read` calls to fetch the -/// payload for any given location, and a `redact` call that applies a -/// batch of [`ImageRedaction`]s grouped by location. +/// Handlers implement three narrow operations: +/// - [`locations`]: cheap, identity-only stream of [`ImageLocation`]s. +/// - [`read`]: fetch the payload at a given location (cropped to +/// the location's bounding box). +/// - [`redact_at`]: apply a single redaction to a single location. +/// +/// Batched redaction is provided by [`redact`], which loops +/// [`redact_at`] in insertion order. 
+/// +/// [`locations`]: ImageHandler::locations +/// [`read`]: ImageHandler::read +/// [`redact_at`]: ImageHandler::redact_at +/// [`redact`]: ImageHandler::redact #[async_trait::async_trait] pub trait ImageHandler: Handler { /// Async stream of [`ImageLocation`]s for this document, each @@ -49,9 +63,29 @@ pub trait ImageHandler: Handler { /// Returns `None` if the location is out of bounds. async fn read(&self, location: &ImageLocation) -> Option; - /// Apply a batch of redactions grouped by [`ImageLocation`]. + /// Apply a single redaction at the bounding box identified by + /// `location`, mutating in place. + async fn redact_at( + &mut self, + location: &ImageLocation, + redaction: ImageRedaction, + ) -> Result<(), Error>; + + /// Apply every `(location, redaction)` pair in `redactions` to the + /// handler in insertion order. The first error aborts the batch. + /// + /// The default loops [`redact_at`] in [`Redactions`] insertion + /// order; handlers with ordering constraints override this + /// default. + /// + /// [`redact_at`]: ImageHandler::redact_at async fn redact( &mut self, redactions: Redactions, - ) -> Result<(), Error>; + ) -> Result<(), Error> { + for (location, redaction) in redactions { + self.redact_at(&location, redaction).await?; + } + Ok(()) + } } diff --git a/crates/nvisy-codec/src/transform/image/ops.rs b/crates/nvisy-codec/src/handler/image/ops.rs similarity index 99% rename from crates/nvisy-codec/src/transform/image/ops.rs rename to crates/nvisy-codec/src/handler/image/ops.rs index 7c5b1a07..50f43024 100644 --- a/crates/nvisy-codec/src/transform/image/ops.rs +++ b/crates/nvisy-codec/src/handler/image/ops.rs @@ -9,7 +9,7 @@ use imageproc::filter::gaussian_blur_f32; use nvisy_ontology::primitive::{BoundingBoxPixel, Color}; /// Mutating image-transform operations on individual bounding-box regions. -pub trait ImageOps { +pub(super) trait ImageOps { /// Apply a gaussian blur to `region` with the given `sigma`. 
fn apply_gaussian_blur(&mut self, region: &BoundingBoxPixel, sigma: f32); diff --git a/crates/nvisy-codec/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs index a5229f62..c8d09af1 100644 --- a/crates/nvisy-codec/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -14,14 +14,19 @@ use nvisy_core::media::DocumentType; mod audio; mod image; +mod policy; +mod redactions; mod rich; mod tabular; mod text; use nvisy_core::content::ContentSource; +pub use nvisy_ontology::entity::Mergeable; pub use self::audio::*; pub use self::image::*; +pub use self::policy::{ConflictPolicy, InsertError}; +pub use self::redactions::Redactions; pub use self::rich::*; pub use self::tabular::*; pub use self::text::*; diff --git a/crates/nvisy-codec/src/transform/policy.rs b/crates/nvisy-codec/src/handler/policy.rs similarity index 83% rename from crates/nvisy-codec/src/transform/policy.rs rename to crates/nvisy-codec/src/handler/policy.rs index 2ab2ace4..f4704211 100644 --- a/crates/nvisy-codec/src/transform/policy.rs +++ b/crates/nvisy-codec/src/handler/policy.rs @@ -1,12 +1,12 @@ //! Conflict resolution policy for [`Redactions`]. //! -//! [`Redactions`]: crate::transform::Redactions +//! [`Redactions`]: crate::handler::Redactions use thiserror::Error; /// How [`Redactions`] resolves overlapping insertions within a span. /// -/// [`Redactions`]: crate::transform::Redactions +/// [`Redactions`]: crate::handler::Redactions #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] pub enum ConflictPolicy { /// Refuse to insert a redaction that overlaps with an existing one. @@ -18,7 +18,7 @@ pub enum ConflictPolicy { /// When `try_merge` returns `None`, insertion fails with /// [`InsertError::NotMergeable`]. /// - /// [`Mergeable::try_merge`]: crate::transform::Mergeable::try_merge + /// [`Mergeable::try_merge`]: crate::handler::Mergeable::try_merge Merge, /// Drop the existing redaction and replace it with the new one. 
Replace, @@ -27,7 +27,7 @@ pub enum ConflictPolicy { /// Error returned by [`Redactions::try_insert`] when a conflict cannot /// be resolved under the configured [`ConflictPolicy`]. /// -/// [`Redactions::try_insert`]: crate::transform::Redactions::try_insert +/// [`Redactions::try_insert`]: crate::handler::Redactions::try_insert #[derive(Debug, Error)] pub enum InsertError { /// [`ConflictPolicy::Reject`] is active and the new redaction diff --git a/crates/nvisy-codec/src/handler/redactions.rs b/crates/nvisy-codec/src/handler/redactions.rs new file mode 100644 index 00000000..891d86e3 --- /dev/null +++ b/crates/nvisy-codec/src/handler/redactions.rs @@ -0,0 +1,227 @@ +//! Generic [`Redactions`] collection of `(location, redaction)` pairs +//! with overlap detection on insert. +//! +//! The collection is a flat `Vec<(S, R)>` ordered by insertion. On +//! [`try_insert`], `S::overlaps` checks for a collision with any +//! existing entry; under [`ConflictPolicy::Merge`], both +//! `S::try_merge` and `R::try_merge` must succeed to fuse the entries. +//! +//! Callers consume the collection via [`IntoIterator`] yielding `(S, R)`. +//! +//! [`Redactions`]: crate::handler::Redactions +//! [`ConflictPolicy::Merge`]: crate::handler::ConflictPolicy::Merge +//! [`try_insert`]: Redactions::try_insert + +use std::fmt; + +use derive_more::IntoIterator; +use nvisy_ontology::entity::{Mergeable, Overlap}; + +use super::policy::{ConflictPolicy, InsertError}; + +/// A set of `(location, redaction)` pairs with overlap detection on +/// insert. +/// +/// `S` is the location key. It must implement [`Overlap`] (for +/// collision detection) and [`Mergeable`] (for the [`Merge`] policy). +/// +/// `R` is the redaction payload. It must implement [`Mergeable`] — +/// the collection asks both `S` and `R` whether they can be merged +/// before fusing two colliding entries. +/// +/// Internally backed by a `Vec<(S, R)>`. 
Entry counts are typically +/// small (per-document), so linear scans are cheap. +/// +/// [`Merge`]: ConflictPolicy::Merge +#[derive(IntoIterator)] +pub struct Redactions { + policy: ConflictPolicy, + #[into_iterator(owned)] + items: Vec<(S, R)>, +} + +impl Redactions { + /// Create an empty collection with the given conflict policy. + pub fn new(policy: ConflictPolicy) -> Self { + Self { + policy, + items: Vec::new(), + } + } + + /// The conflict policy in effect. + pub fn policy(&self) -> ConflictPolicy { + self.policy + } + + /// Total number of redactions. + pub fn len(&self) -> usize { + self.items.len() + } + + /// Returns `true` if the collection holds no redactions. + pub fn is_empty(&self) -> bool { + self.items.is_empty() + } +} + +impl Redactions +where + S: Overlap + Mergeable, + R: Mergeable, +{ + /// Insert a `(location, redaction)` pair. + /// + /// If `location` overlaps any existing entry's location, behavior + /// is determined by the configured [`ConflictPolicy`]: + /// + /// - [`Reject`]: returns [`InsertError::OverlapRejected`]. + /// - [`Merge`]: attempts to merge both location and redaction; + /// returns [`InsertError::NotMergeable`] if either rejects. + /// - [`Replace`]: drops the existing overlapping entry and + /// inserts the new one. 
+ /// + /// [`Reject`]: ConflictPolicy::Reject + /// [`Merge`]: ConflictPolicy::Merge + /// [`Replace`]: ConflictPolicy::Replace + pub fn try_insert(&mut self, location: S, redaction: R) -> Result<(), InsertError> { + let overlap_idx = self.items.iter().position(|(s, _)| s.overlaps(&location)); + let Some(idx) = overlap_idx else { + self.items.push((location, redaction)); + return Ok(()); + }; + + match self.policy { + ConflictPolicy::Reject => Err(InsertError::OverlapRejected), + ConflictPolicy::Replace => { + self.items[idx] = (location, redaction); + Ok(()) + } + ConflictPolicy::Merge => { + let (existing_s, existing_r) = self.items.remove(idx); + match ( + existing_s.try_merge(location), + existing_r.try_merge(redaction), + ) { + (Some(merged_s), Some(merged_r)) => { + self.items.push((merged_s, merged_r)); + Ok(()) + } + _ => Err(InsertError::NotMergeable), + } + } + } + } +} + +impl Default for Redactions { + fn default() -> Self { + Self::new(ConflictPolicy::default()) + } +} + +impl fmt::Debug for Redactions { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Redactions") + .field("policy", &self.policy) + .field("redactions", &self.len()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Debug, Clone, PartialEq)] + struct S { + start: usize, + end: usize, + } + + impl S { + fn new(start: usize, end: usize) -> Self { + Self { start, end } + } + } + + impl Overlap for S { + fn overlaps(&self, other: &Self) -> bool { + self.start < other.end && other.start < self.end + } + } + + impl Mergeable for S { + fn try_merge(self, other: Self) -> Option { + Some(Self { + start: self.start.min(other.start), + end: self.end.max(other.end), + }) + } + } + + #[derive(Debug, Clone, PartialEq)] + struct R(&'static str); + + impl Mergeable for R { + fn try_merge(self, other: Self) -> Option { + (self.0 == other.0).then_some(self) + } + } + + #[test] + fn insert_non_overlapping_keeps_both() { + let mut rs = 
Redactions::::new(ConflictPolicy::Reject); + rs.try_insert(S::new(0, 5), R("x")).unwrap(); + rs.try_insert(S::new(10, 15), R("y")).unwrap(); + assert_eq!(rs.len(), 2); + } + + #[test] + fn reject_policy_errors_on_overlap() { + let mut rs = Redactions::::new(ConflictPolicy::Reject); + rs.try_insert(S::new(0, 5), R("x")).unwrap(); + let err = rs.try_insert(S::new(3, 8), R("y")).unwrap_err(); + assert!(matches!(err, InsertError::OverlapRejected)); + assert_eq!(rs.len(), 1); + } + + #[test] + fn replace_policy_overwrites_overlap() { + let mut rs = Redactions::::new(ConflictPolicy::Replace); + rs.try_insert(S::new(0, 5), R("x")).unwrap(); + rs.try_insert(S::new(3, 8), R("y")).unwrap(); + assert_eq!(rs.len(), 1); + let (s, r) = rs.into_iter().next().unwrap(); + assert_eq!((s.start, s.end), (3, 8)); + assert_eq!(r.0, "y"); + } + + #[test] + fn merge_policy_combines_same_payload() { + let mut rs = Redactions::::new(ConflictPolicy::Merge); + rs.try_insert(S::new(0, 5), R("x")).unwrap(); + rs.try_insert(S::new(3, 8), R("x")).unwrap(); + assert_eq!(rs.len(), 1); + let (s, _) = rs.into_iter().next().unwrap(); + assert_eq!((s.start, s.end), (0, 8)); + } + + #[test] + fn merge_policy_errors_when_payload_differs() { + let mut rs = Redactions::::new(ConflictPolicy::Merge); + rs.try_insert(S::new(0, 5), R("x")).unwrap(); + let err = rs.try_insert(S::new(3, 8), R("y")).unwrap_err(); + assert!(matches!(err, InsertError::NotMergeable)); + } + + #[test] + fn into_iter_preserves_insertion_order() { + let mut rs = Redactions::::new(ConflictPolicy::Reject); + rs.try_insert(S::new(20, 25), R("a")).unwrap(); + rs.try_insert(S::new(0, 5), R("b")).unwrap(); + rs.try_insert(S::new(10, 15), R("c")).unwrap(); + let starts: Vec = rs.into_iter().map(|(s, _)| s.start).collect(); + assert_eq!(starts, vec![20, 0, 10]); + } +} diff --git a/crates/nvisy-codec/src/handler/rich/pdf_handler.rs b/crates/nvisy-codec/src/handler/rich/pdf_handler.rs index 61234363..4511ff07 100644 --- 
a/crates/nvisy-codec/src/handler/rich/pdf_handler.rs +++ b/crates/nvisy-codec/src/handler/rich/pdf_handler.rs @@ -21,9 +21,8 @@ use nvisy_ontology::primitive::Dpi; use super::pdf_render::PdfRenderer; use crate::document::{Located, LocationStream}; use crate::handler::image::ImageData; -use crate::handler::text::TextData; -use crate::handler::{Handler, ImageHandler, TextHandler}; -use crate::transform::{ImageRedaction, Redactions, TextRedaction, apply_text_redactions}; +use crate::handler::text::{TextData, apply_text_redaction}; +use crate::handler::{Handler, ImageHandler, ImageRedaction, TextHandler, TextRedaction}; const TARGET: &str = "rich-text-handler"; @@ -187,30 +186,24 @@ impl TextHandler for RichTextHandler { self.pages.get(page_idx).cloned().map(TextData::from) } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &TextLocation, + redaction: TextRedaction, ) -> Result<(), Error> { - if redactions.is_empty() { - return Ok(()); - } let offsets = self.page_offsets(); + let Some(page_idx) = offsets.iter().position(|&(start, end, _)| { + location.start_offset >= start && location.end_offset <= end + }) else { + return Ok(()); + }; + let page_start = offsets[page_idx].0; + let start = location.start_offset - page_start; + let end = location.end_offset - page_start; - // Compute new page texts by applying redactions to current values. - let mut page_updates: Vec<(usize, String)> = Vec::new(); - for (loc, items) in redactions { - let Some(page_idx) = offsets - .iter() - .position(|&(start, _, _)| start == loc.start_offset) - else { - continue; - }; - let mut content = self.pages[page_idx].clone(); - apply_text_redactions(&mut content, &items, TARGET)?; - page_updates.push((page_idx, content)); - } + let mut content = self.pages[page_idx].clone(); + apply_text_redaction(&mut content, &redaction, start, end, TARGET)?; - // PDF-specific: bake replacements into content streams. 
if self.document_type == DocumentType::Pdf { let mut doc = lopdf::Document::load_mem(&self.raw).map_err(|e| { Error::runtime( @@ -219,14 +212,10 @@ impl TextHandler for RichTextHandler { false, ) })?; - - for (idx, new_text) in &page_updates { - let old_text = &self.pages[*idx]; - if !old_text.is_empty() && old_text != new_text { - let _ = doc.replace_text((*idx as u32) + 1, old_text, new_text, None); - } + let old_text = &self.pages[page_idx]; + if !old_text.is_empty() && old_text != &content { + let _ = doc.replace_text((page_idx as u32) + 1, old_text, &content, None); } - let mut buf = Vec::new(); doc.save_to(&mut buf).map_err(|e| { Error::runtime(format!("failed to save edited PDF: {e}"), TARGET, false) @@ -234,10 +223,7 @@ impl TextHandler for RichTextHandler { self.raw = Bytes::from(buf); } - for (idx, new_text) in page_updates { - self.pages[idx] = new_text; - } - + self.pages[page_idx] = content; Ok(()) } } @@ -280,9 +266,10 @@ impl ImageHandler for RichTextHandler { None } - async fn redact( + async fn redact_at( &mut self, - _redactions: Redactions, + _location: &ImageLocation, + _redaction: ImageRedaction, ) -> Result<(), Error> { Ok(()) } diff --git a/crates/nvisy-codec/src/handler/rich/rich_handler.rs b/crates/nvisy-codec/src/handler/rich/rich_handler.rs index 23ad7450..2c43bca9 100644 --- a/crates/nvisy-codec/src/handler/rich/rich_handler.rs +++ b/crates/nvisy-codec/src/handler/rich/rich_handler.rs @@ -12,8 +12,7 @@ use super::RichTextHandler; use crate::document::LocationStream; use crate::handler::image::ImageData; use crate::handler::text::TextData; -use crate::handler::{Handler, ImageHandler, TextHandler}; -use crate::transform::{ImageRedaction, Redactions, TextRedaction}; +use crate::handler::{Handler, ImageHandler, ImageRedaction, TextHandler, TextRedaction}; /// A type-erased rich-document handler backed by a boxed trait object. 
pub struct BoxedRichHandler(Box); @@ -23,16 +22,18 @@ pub struct BoxedRichHandler(Box); pub(crate) trait RichHandler: Handler + Send + Sync { fn text_locations(&self) -> LocationStream<'_, TextLocation>; async fn read_text(&self, location: &TextLocation) -> Option; - async fn redact_text( + async fn redact_text_at( &mut self, - redactions: Redactions, + location: &TextLocation, + redaction: TextRedaction, ) -> Result<(), Error>; fn image_locations(&self) -> LocationStream<'_, ImageLocation>; async fn read_image(&self, location: &ImageLocation) -> Option; - async fn redact_images( + async fn redact_image_at( &mut self, - redactions: Redactions, + location: &ImageLocation, + redaction: ImageRedaction, ) -> Result<(), Error>; } @@ -47,11 +48,12 @@ impl RichHandler for RichTextHandler { TextHandler::read(self, location).await } - async fn redact_text( + async fn redact_text_at( &mut self, - redactions: Redactions, + location: &TextLocation, + redaction: TextRedaction, ) -> Result<(), Error> { - TextHandler::redact(self, redactions).await + TextHandler::redact_at(self, location, redaction).await } fn image_locations(&self) -> LocationStream<'_, ImageLocation> { @@ -62,11 +64,12 @@ impl RichHandler for RichTextHandler { ImageHandler::read(self, location).await } - async fn redact_images( + async fn redact_image_at( &mut self, - redactions: Redactions, + location: &ImageLocation, + redaction: ImageRedaction, ) -> Result<(), Error> { - ImageHandler::redact(self, redactions).await + ImageHandler::redact_at(self, location, redaction).await } } @@ -115,11 +118,12 @@ impl TextHandler for BoxedRichHandler { self.0.read_text(location).await } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &TextLocation, + redaction: TextRedaction, ) -> Result<(), Error> { - self.0.redact_text(redactions).await + self.0.redact_text_at(location, redaction).await } } @@ -133,10 +137,11 @@ impl ImageHandler for BoxedRichHandler { 
self.0.read_image(location).await } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &ImageLocation, + redaction: ImageRedaction, ) -> Result<(), Error> { - self.0.redact_images(redactions).await + self.0.redact_image_at(location, redaction).await } } diff --git a/crates/nvisy-codec/src/handler/tabular/apply.rs b/crates/nvisy-codec/src/handler/tabular/apply.rs new file mode 100644 index 00000000..177d1e0a --- /dev/null +++ b/crates/nvisy-codec/src/handler/tabular/apply.rs @@ -0,0 +1,42 @@ +//! Helper for applying a single [`TabularRedaction`] to one cell's +//! value in place. + +use nvisy_core::Error; + +use crate::handler::TabularRedaction; + +/// Apply a single redaction to `cell` in place, restricted to byte +/// range `start..end` (clamped to the cell's length). Returns an +/// error if either offset falls mid-character. +/// +/// The intra-cell byte range comes from the redaction's containing +/// [`TabularLocation`] under the `(location, redaction)` shape — not +/// from the redaction itself. 
+/// +/// [`TabularLocation`]: nvisy_ontology::entity::TabularLocation +pub(crate) fn apply_tabular_redaction( + cell: &mut String, + redaction: &TabularRedaction, + start: usize, + end: usize, + target: &'static str, +) -> Result<(), Error> { + let value = redaction.output.replacement_value().unwrap_or_default(); + let s = start.min(cell.len()); + let e = end.min(cell.len()); + if s >= e { + return Ok(()); + } + if !cell.is_char_boundary(s) || !cell.is_char_boundary(e) { + return Err(Error::validation( + format!( + "redaction offset falls mid-character \ + (start={start}, end={end}, len={})", + cell.len() + ), + target, + )); + } + cell.replace_range(s..e, value); + Ok(()) +} diff --git a/crates/nvisy-codec/src/handler/tabular/csv_handler.rs b/crates/nvisy-codec/src/handler/tabular/csv_handler.rs index 5d916b4f..02c700e2 100644 --- a/crates/nvisy-codec/src/handler/tabular/csv_handler.rs +++ b/crates/nvisy-codec/src/handler/tabular/csv_handler.rs @@ -7,7 +7,7 @@ //! else row `0` is the first data row. [`TabularHandler::read`] //! returns the cell's value as [`TextData`]. //! [`TabularHandler::redact`] mutates cells by coordinate, applying -//! intra-cell byte-offset replacements via [`apply_tabular_redactions`]. +//! intra-cell byte-offset replacements via [`apply_tabular_redaction`]. //! //! 
[`TabularLocation`]: nvisy_ontology::entity::TabularLocation @@ -16,10 +16,10 @@ use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::{DocumentType, SpreadsheetFormat}; use nvisy_ontology::entity::TabularLocation; +use super::{TabularRedaction, apply_tabular_redaction}; use crate::document::{Located, LocationStream}; use crate::handler::text::TextData; use crate::handler::{Handler, TabularHandler}; -use crate::transform::{Redactions, TabularRedaction, apply_tabular_redactions}; const TARGET: &str = "csv-handler"; @@ -115,38 +115,36 @@ impl TabularHandler for CsvHandler { Some(TextData::from(cell.clone())) } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &TabularLocation, + redaction: TabularRedaction, ) -> Result<(), Error> { - if redactions.is_empty() { + let Some((is_header, data_row)) = self.resolve_row(location.row_index) else { return Ok(()); - } - - for (loc, items) in redactions { - let Some((is_header, data_row)) = self.resolve_row(loc.row_index) else { - continue; + }; + let cell = if is_header { + let Some(headers) = self.data.headers.as_mut() else { + return Ok(()); }; - let cell = if is_header { - let Some(headers) = self.data.headers.as_mut() else { - continue; - }; - let Some(cell) = headers.get_mut(loc.column_index) else { - continue; - }; - cell - } else { - let Some(row) = self.data.rows.get_mut(data_row) else { - continue; - }; - let Some(cell) = row.get_mut(loc.column_index) else { - continue; - }; - cell + let Some(cell) = headers.get_mut(location.column_index) else { + return Ok(()); }; - apply_tabular_redactions(cell, &items, TARGET)?; - } - Ok(()) + cell + } else { + let Some(row) = self.data.rows.get_mut(data_row) else { + return Ok(()); + }; + let Some(cell) = row.get_mut(location.column_index) else { + return Ok(()); + }; + cell + }; + // Intra-cell byte range comes from the location; omitted means + // redact the whole cell. 
+ let start = location.start_offset.unwrap_or(0); + let end = location.end_offset.unwrap_or(cell.len()); + apply_tabular_redaction(cell, &redaction, start, end, TARGET) } } @@ -274,7 +272,7 @@ mod tests { use nvisy_core::Error; use super::*; - use crate::transform::{ConflictPolicy, TextOutput}; + use crate::handler::{ConflictPolicy, Redactions, TabularHandler, TextOutput}; fn handler_with_headers(headers: Vec<&str>, rows: Vec>) -> CsvHandler { CsvHandler::new(CsvData { @@ -308,6 +306,14 @@ mod tests { .unwrap() } + fn cell_range(row: usize, col: usize, start: usize, end: usize) -> TabularLocation { + TabularLocation { + start_offset: Some(start), + end_offset: Some(end), + ..cell_loc(row, col) + } + } + #[tokio::test] async fn locations_yield_header_then_rows() { let h = handler_with_headers(vec!["name", "age"], vec![vec!["Alice", "30"]]); @@ -350,8 +356,8 @@ mod tests { let mut h = handler_with_headers(vec!["ssn"], vec![vec!["123-45-6789"]]); let mut rs = Redactions::new(ConflictPolicy::Reject); rs.try_insert( - cell_loc(1, 0), - TabularRedaction::new(0, 11, TextOutput::replace("[REDACTED]")), + cell_range(1, 0, 0, 11), + TabularRedaction::new(TextOutput::replace("[REDACTED]")), ) .unwrap(); h.redact(rs).await?; @@ -364,8 +370,8 @@ mod tests { let mut h = handler_with_headers(vec!["bio"], vec![vec!["Alice Smith"]]); let mut rs = Redactions::new(ConflictPolicy::Reject); rs.try_insert( - cell_loc(1, 0), - TabularRedaction::new(0, 5, TextOutput::replace("[NAME]")), + cell_range(1, 0, 0, 5), + TabularRedaction::new(TextOutput::replace("[NAME]")), ) .unwrap(); h.redact(rs).await?; @@ -378,8 +384,8 @@ mod tests { let mut h = handler_with_headers(vec!["secret_field"], vec![vec!["v"]]); let mut rs = Redactions::new(ConflictPolicy::Reject); rs.try_insert( - cell_loc(0, 0), - TabularRedaction::new(0, 12, TextOutput::replace("redacted")), + cell_range(0, 0, 0, 12), + TabularRedaction::new(TextOutput::replace("redacted")), ) .unwrap(); h.redact(rs).await?; @@ -392,8 +398,8 @@ 
mod tests { let mut h = handler_with_headers(vec!["a"], vec![vec!["one"]]); let mut rs = Redactions::new(ConflictPolicy::Reject); rs.try_insert( - cell_loc(99, 0), - TabularRedaction::new(0, 1, TextOutput::replace("X")), + cell_range(99, 0, 0, 1), + TabularRedaction::new(TextOutput::replace("X")), ) .unwrap(); h.redact(rs).await?; diff --git a/crates/nvisy-codec/src/handler/tabular/instruction.rs b/crates/nvisy-codec/src/handler/tabular/instruction.rs new file mode 100644 index 00000000..6d6b0582 --- /dev/null +++ b/crates/nvisy-codec/src/handler/tabular/instruction.rs @@ -0,0 +1,37 @@ +//! Tabular redaction instruction types. + +use crate::handler::{Mergeable, TextOutput}; + +/// A tabular redaction: the *how*. The *where* — cell coordinates +/// (`row_index`, `column_index`) and optional intra-cell byte offsets +/// — lives on the containing [`TabularLocation`] via [`Redactions`]'s +/// `(S, R)` pairs. +/// +/// This is the tabular counterpart of [`TextRedaction`]: instead of +/// being grouped by a line-level text span, it is grouped by a cell +/// coordinate. +/// +/// [`Redactions`]: crate::handler::Redactions +/// [`TextRedaction`]: crate::handler::TextRedaction +/// [`TabularLocation`]: nvisy_ontology::entity::TabularLocation +#[derive(Debug, Clone, PartialEq)] +pub struct TabularRedaction { + /// The redaction output that carries the replacement value. + pub(crate) output: TextOutput, +} + +impl TabularRedaction { + /// Create a new tabular redaction with the given output. + pub fn new(output: TextOutput) -> Self { + Self { output } + } +} + +impl Mergeable for TabularRedaction { + /// Combine two redactions that target overlapping cells. Returns + /// `Some` only when the outputs match; different replacement + /// strings cannot be reconciled. 
+ fn try_merge(self, other: Self) -> Option<Self> { + (self.output == other.output).then_some(self) + } +} diff --git a/crates/nvisy-codec/src/handler/tabular/mod.rs b/crates/nvisy-codec/src/handler/tabular/mod.rs index 95712cf8..6c7b323b 100644 --- a/crates/nvisy-codec/src/handler/tabular/mod.rs +++ b/crates/nvisy-codec/src/handler/tabular/mod.rs @@ -17,28 +17,39 @@ use nvisy_ontology::entity::TabularLocation; use super::Handler; use crate::document::LocationStream; -use crate::handler::TextData; -use crate::transform::{Redactions, TabularRedaction}; +use crate::handler::{Redactions, TextData}; +mod apply; mod csv_handler; mod csv_loader; +mod instruction; mod tabular_handler; mod xlsx_handler; mod xlsx_loader; +pub(crate) use self::apply::apply_tabular_redaction; pub use self::csv_handler::{CsvData, CsvHandler}; pub use self::csv_loader::{CsvLoader, CsvParams}; +pub use self::instruction::TabularRedaction; pub use self::tabular_handler::BoxedTabularHandler; pub use self::xlsx_handler::XlsxHandler; pub use self::xlsx_loader::{XlsxLoader, XlsxParams}; /// Capability trait for handlers that expose content by cell coordinate. /// -/// Handlers expose tabular content as a stream of -/// [`TabularLocation`]s identifying individual cells, with explicit -/// `read` calls to fetch a cell's value as [`TextData`], and a -/// `redact` call that applies a batch of [`TabularRedaction`]s -/// grouped by cell. +/// Handlers implement three narrow operations: +/// - [`locations`]: cheap, identity-only stream of [`TabularLocation`]s +/// identifying individual cells. +/// - [`read`]: fetch a cell's value as [`TextData`]. +/// - [`redact_at`]: apply a single redaction to a single cell. +/// +/// Batched redaction is provided by [`redact`], which loops +/// [`redact_at`] in insertion order.
+/// +/// [`locations`]: TabularHandler::locations +/// [`read`]: TabularHandler::read +/// [`redact_at`]: TabularHandler::redact_at +/// [`redact`]: TabularHandler::redact #[async_trait::async_trait] pub trait TabularHandler: Handler { /// Async stream of [`TabularLocation`]s for this document, each @@ -52,13 +63,30 @@ pub trait TabularHandler: Handler { /// Returns `None` if the location is out of bounds. async fn read(&self, location: &TabularLocation) -> Option; - /// Apply a batch of redactions grouped by [`TabularLocation`]. + /// Apply a single redaction to the cell at `location`, mutating + /// in place. The cell coordinates and optional intra-cell byte + /// offsets come from `location`. + async fn redact_at( + &mut self, + location: &TabularLocation, + redaction: TabularRedaction, + ) -> Result<(), Error>; + + /// Apply every `(location, redaction)` pair in `redactions` to the + /// handler in insertion order. The first error aborts the batch. + /// + /// The default loops [`redact_at`] in [`Redactions`] insertion + /// order; handlers with ordering constraints override this + /// default. /// - /// Cell identity is supplied by the [`Redactions`] collection's - /// keys; each redaction within a cell carries intra-cell byte - /// offsets that the handler maps onto its own cell value. 
+ /// [`redact_at`]: TabularHandler::redact_at async fn redact( &mut self, redactions: Redactions, - ) -> Result<(), Error>; + ) -> Result<(), Error> { + for (location, redaction) in redactions { + self.redact_at(&location, redaction).await?; + } + Ok(()) + } } diff --git a/crates/nvisy-codec/src/handler/tabular/tabular_handler.rs b/crates/nvisy-codec/src/handler/tabular/tabular_handler.rs index dab555f2..6e8821c1 100644 --- a/crates/nvisy-codec/src/handler/tabular/tabular_handler.rs +++ b/crates/nvisy-codec/src/handler/tabular/tabular_handler.rs @@ -9,8 +9,7 @@ use nvisy_ontology::entity::TabularLocation; use super::TabularHandler; use crate::document::LocationStream; -use crate::handler::{CsvHandler, Handler, TextData, XlsxHandler}; -use crate::transform::{Redactions, TabularRedaction}; +use crate::handler::{CsvHandler, Handler, TabularRedaction, TextData, XlsxHandler}; /// A type-erased tabular handler backed by a boxed trait object. pub struct BoxedTabularHandler(Box); @@ -54,11 +53,12 @@ impl TabularHandler for BoxedTabularHandler { self.0.read(location).await } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &TabularLocation, + redaction: TabularRedaction, ) -> Result<(), Error> { - self.0.redact(redactions).await + self.0.redact_at(location, redaction).await } } diff --git a/crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs b/crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs index 3af2d881..48d1fc32 100644 --- a/crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs +++ b/crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs @@ -13,8 +13,7 @@ use nvisy_core::media::{DocumentType, SpreadsheetFormat}; use nvisy_ontology::entity::TabularLocation; use crate::document::LocationStream; -use crate::handler::{Handler, TabularHandler, TextData}; -use crate::transform::{Redactions, TabularRedaction}; +use crate::handler::{Handler, TabularHandler, TabularRedaction, TextData}; #[derive(Debug, Default)] pub struct 
XlsxHandler { @@ -62,9 +61,10 @@ impl TabularHandler for XlsxHandler { None } - async fn redact( + async fn redact_at( &mut self, - _redactions: Redactions, + _location: &TabularLocation, + _redaction: TabularRedaction, ) -> Result<(), Error> { Ok(()) } diff --git a/crates/nvisy-codec/src/handler/text/apply.rs b/crates/nvisy-codec/src/handler/text/apply.rs new file mode 100644 index 00000000..3422344a --- /dev/null +++ b/crates/nvisy-codec/src/handler/text/apply.rs @@ -0,0 +1,91 @@ +//! Byte-level helper for applying a single [`TextRedaction`] to a +//! string in place. Shared across the text-family handlers (TXT, JSON, +//! HTML, and the per-page text in [`RichTextHandler`]). +//! +//! The byte range `start..end` comes from the redaction's containing +//! [`TextLocation`] under the `(location, redaction)` shape — not from +//! the redaction itself. Callers translate the location's +//! document-absolute offsets into span-relative offsets before +//! calling. +//! +//! [`RichTextHandler`]: crate::handler::rich::RichTextHandler +//! [`TextLocation`]: nvisy_ontology::entity::TextLocation + +use nvisy_core::Error; + +use crate::handler::TextRedaction; + +/// Apply a single redaction to `content` in place, restricted to byte +/// range `start..end` (clamped to `content.len()`). Returns an error +/// if either offset falls mid-character. 
+pub(crate) fn apply_text_redaction( + content: &mut String, + redaction: &TextRedaction, + start: usize, + end: usize, + target: &'static str, +) -> Result<(), Error> { + let value = redaction.output.replacement_value().unwrap_or_default(); + let s = start.min(content.len()); + let e = end.min(content.len()); + if s >= e { + return Ok(()); + } + if !content.is_char_boundary(s) || !content.is_char_boundary(e) { + return Err(Error::validation( + format!( + "redaction offset falls mid-character \ + (start={start}, end={end}, len={})", + content.len() + ), + target, + )); + } + content.replace_range(s..e, value); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::handler::TextOutput; + + fn redaction(replacement: &str) -> TextRedaction { + TextRedaction::new(TextOutput::replace(replacement)) + } + + #[test] + fn single_replacement() { + let mut s = String::from("hello world"); + apply_text_redaction(&mut s, &redaction("[X]"), 0, 5, "test").unwrap(); + assert_eq!(s, "[X] world"); + } + + #[test] + fn remove_output() { + let mut s = String::from("hello world"); + apply_text_redaction( + &mut s, + &TextRedaction::new(TextOutput::Remove), + 5, + 11, + "test", + ) + .unwrap(); + assert_eq!(s, "hello"); + } + + #[test] + fn out_of_bounds_clipped() { + let mut s = String::from("short"); + apply_text_redaction(&mut s, &redaction("[X]"), 0, 999, "test").unwrap(); + assert_eq!(s, "[X]"); + } + + #[test] + fn mid_character_rejected() { + let mut s = String::from("héllo"); // 'é' is 2 bytes + let err = apply_text_redaction(&mut s, &redaction("[X]"), 0, 2, "test").unwrap_err(); + assert!(err.to_string().contains("mid-character")); + } +} diff --git a/crates/nvisy-codec/src/handler/text/html_handler.rs b/crates/nvisy-codec/src/handler/text/html_handler.rs index fbd8e087..b71d5d46 100644 --- a/crates/nvisy-codec/src/handler/text/html_handler.rs +++ b/crates/nvisy-codec/src/handler/text/html_handler.rs @@ -12,10 +12,10 @@ use nvisy_core::content::{ContentData, 
ContentSource}; use nvisy_core::media::DocumentType; use nvisy_ontology::entity::TextLocation; +use super::{TextRedaction, apply_text_redaction}; use crate::document::{Located, LocationStream}; use crate::handler::text::TextData; use crate::handler::{Handler, TextHandler}; -use crate::transform::{Redactions, TextRedaction, apply_text_redactions}; const TARGET: &str = "html-handler"; @@ -102,24 +102,28 @@ impl TextHandler for HtmlHandler { self.data.text_nodes.get(idx).cloned().map(TextData::from) } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &TextLocation, + redaction: TextRedaction, ) -> Result<(), Error> { - if redactions.is_empty() { - return Ok(()); - } let offsets = self.node_offsets(); - for (loc, items) in redactions { - let Some(idx) = offsets - .iter() - .position(|&(start, _)| start == loc.start_offset) - else { - continue; - }; - apply_text_redactions(&mut self.data.text_nodes[idx], &items, TARGET)?; - } - Ok(()) + let Some(idx) = offsets + .iter() + .position(|&(start, end)| location.start_offset >= start && location.end_offset <= end) + else { + return Ok(()); + }; + let node_start = offsets[idx].0; + let start = location.start_offset - node_start; + let end = location.end_offset - node_start; + apply_text_redaction( + &mut self.data.text_nodes[idx], + &redaction, + start, + end, + TARGET, + ) } } @@ -189,7 +193,7 @@ mod tests { use nvisy_core::Error; use super::*; - use crate::transform::{ConflictPolicy, TextOutput}; + use crate::handler::{ConflictPolicy, Redactions, TextHandler, TextOutput}; fn handler_from_html(raw: &str) -> HtmlHandler { let dom = scraper::Html::parse_document(raw); @@ -227,7 +231,7 @@ mod tests { let mut rs = Redactions::new(ConflictPolicy::Reject); rs.try_insert( items[0].location.clone(), - TextRedaction::new(0, 5, TextOutput::replace("[REDACTED]")), + TextRedaction::new(TextOutput::replace("[REDACTED]")), ) .unwrap(); h.redact(rs).await?; diff --git 
a/crates/nvisy-codec/src/transform/text/instruction.rs b/crates/nvisy-codec/src/handler/text/instruction.rs similarity index 53% rename from crates/nvisy-codec/src/transform/text/instruction.rs rename to crates/nvisy-codec/src/handler/text/instruction.rs index f97e55dd..37f4c807 100644 --- a/crates/nvisy-codec/src/transform/text/instruction.rs +++ b/crates/nvisy-codec/src/handler/text/instruction.rs @@ -3,29 +3,24 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::transform::Mergeable; +use crate::handler::Mergeable; -/// A text redaction targeting a byte range within its containing span. +/// A text redaction: the *how*. The *where* (byte range within the +/// document) lives on the containing [`TextLocation`] via +/// [`Redactions`]'s `(S, R)` pairs. /// -/// Span identity is supplied externally via [`Redactions`] — this -/// struct only carries the intra-span byte range and the replacement -/// output. -/// -/// [`Redactions`]: crate::transform::Redactions +/// [`TextLocation`]: nvisy_ontology::entity::TextLocation +/// [`Redactions`]: crate::handler::Redactions #[derive(Debug, Clone, PartialEq)] pub struct TextRedaction { - /// Byte offset where the redacted region starts within the span. - pub(crate) start: usize, - /// Byte offset where the redacted region ends (exclusive) within the span. - pub(crate) end: usize, /// The redaction output that carries the replacement value. pub(crate) output: TextOutput, } impl TextRedaction { - /// Create a new text redaction. - pub fn new(start: usize, end: usize, output: TextOutput) -> Self { - Self { start, end, output } + /// Create a new text redaction with the given output. + pub fn new(output: TextOutput) -> Self { + Self { output } } } @@ -65,24 +60,10 @@ impl TextOutput { } impl Mergeable for TextRedaction { - fn overlaps(&self, other: &Self) -> bool { - self.start < other.end && other.start < self.end - } - - /// Merge two overlapping text redactions. 
- /// - /// Returns `Some` only when both share the same [`TextOutput`] — - /// the merged redaction unions the byte ranges. Returns `None` - /// when the outputs differ (e.g. `Replace { "[A]" }` vs `Replace { "[B]" }`), - /// since picking one would silently drop a redaction. + /// Combine two redactions that target overlapping locations. Returns + /// `Some` only when the outputs match; different replacement + /// strings cannot be reconciled. fn try_merge(self, other: Self) -> Option { - if self.output != other.output { - return None; - } - Some(Self { - start: self.start.min(other.start), - end: self.end.max(other.end), - output: self.output, - }) + (self.output == other.output).then_some(self) } } diff --git a/crates/nvisy-codec/src/handler/text/json_handler.rs b/crates/nvisy-codec/src/handler/text/json_handler.rs index cf36edf8..ef691568 100644 --- a/crates/nvisy-codec/src/handler/text/json_handler.rs +++ b/crates/nvisy-codec/src/handler/text/json_handler.rs @@ -23,10 +23,10 @@ use nvisy_core::media::{DocumentType, TextFormat}; use nvisy_ontology::entity::TextLocation; use serde::{Deserialize, Serialize}; +use super::{TextRedaction, apply_text_redaction}; use crate::document::{Located, LocationStream}; use crate::handler::text::TextData; use crate::handler::{Handler, TextHandler}; -use crate::transform::{Redactions, TextRedaction, apply_text_redactions}; const DEFAULT_INDENT: NonZeroU32 = NonZeroU32::new(2).unwrap(); const TARGET: &str = "json-handler"; @@ -143,50 +143,42 @@ impl TextHandler for JsonHandler { .map(|ls| TextData::from(ls.text)) } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &TextLocation, + redaction: TextRedaction, ) -> Result<(), Error> { - if redactions.is_empty() { - return Ok(()); - } let located = self.locate_spans(); - - let mut value_updates = Vec::new(); - let mut key_updates = Vec::new(); - - for (loc, items) in redactions { - let Some(ls) = located - .iter() - .find(|ls| ls.start == 
loc.start_offset && ls.end == loc.end_offset) - else { - continue; - }; - let mut content = ls.text.clone(); - apply_text_redactions(&mut content, &items, TARGET)?; - if ls.path.key_of { - key_updates.push((ls.path.clone(), content)); - } else { - value_updates.push((ls.path.clone(), content)); - } - } - - for (path, new_value) in value_updates { - let target = self.data.value.pointer_mut(&path.pointer).ok_or_else(|| { - Error::validation(format!("JSON pointer not found: {}", path.pointer), TARGET) - })?; + let Some(ls) = located + .into_iter() + .find(|ls| location.start_offset >= ls.start && location.end_offset <= ls.end) + else { + return Ok(()); + }; + let start = location.start_offset - ls.start; + let end = location.end_offset - ls.start; + let mut content = ls.text.clone(); + apply_text_redaction(&mut content, &redaction, start, end, TARGET)?; + if ls.path.key_of { + rename_key(&mut self.data.value, &ls.path.pointer, &content)?; + } else { + let target = self + .data + .value + .pointer_mut(&ls.path.pointer) + .ok_or_else(|| { + Error::validation( + format!("JSON pointer not found: {}", ls.path.pointer), + TARGET, + ) + })?; if target.is_string() { - *target = serde_json::Value::String(new_value); + *target = serde_json::Value::String(content); } else { - *target = serde_json::from_str(&new_value) - .unwrap_or(serde_json::Value::String(new_value)); + *target = + serde_json::from_str(&content).unwrap_or(serde_json::Value::String(content)); } } - - for (path, new_key) in key_updates { - rename_key(&mut self.data.value, &path.pointer, &new_key)?; - } - Ok(()) } } diff --git a/crates/nvisy-codec/src/handler/text/mod.rs b/crates/nvisy-codec/src/handler/text/mod.rs index 64cd2c98..baa20626 100644 --- a/crates/nvisy-codec/src/handler/text/mod.rs +++ b/crates/nvisy-codec/src/handler/text/mod.rs @@ -5,12 +5,14 @@ use nvisy_ontology::entity::TextLocation; use super::Handler; use crate::document::LocationStream; -use crate::transform::{Redactions, TextRedaction}; +use 
crate::handler::Redactions; +mod apply; #[cfg(feature = "html")] mod html_handler; #[cfg(feature = "html")] mod html_loader; +mod instruction; mod json_handler; mod json_loader; mod markdown_loader; @@ -19,10 +21,12 @@ mod text_handler; mod txt_handler; mod txt_loader; +pub(crate) use self::apply::apply_text_redaction; #[cfg(feature = "html")] pub use self::html_handler::{HtmlData, HtmlHandler}; #[cfg(feature = "html")] pub use self::html_loader::{HtmlLoader, HtmlParams}; +pub use self::instruction::{TextOutput, TextRedaction}; pub use self::json_handler::{JsonData, JsonHandler, JsonIndent}; pub use self::json_loader::{JsonLoader, JsonParams}; pub use self::markdown_loader::{MarkdownLoader, MarkdownParams}; @@ -61,12 +65,34 @@ pub trait TextHandler: Handler { /// Returns `None` if the location is out of bounds. async fn read(&self, location: &TextLocation) -> Option; - /// Apply a batch of redactions grouped by [`TextLocation`]. + /// Apply a single redaction at the given location, mutating in + /// place. Implementations need not handle iteration or overlap — + /// the provided [`redact`] feeds one `(location, redaction)` pair + /// at a time. /// - /// The collection enforces overlap policy on insert; this method - /// trusts that ranges within a single location do not overlap. + /// [`redact`]: TextHandler::redact + async fn redact_at( + &mut self, + location: &TextLocation, + redaction: TextRedaction, + ) -> Result<(), Error>; + + /// Apply every `(location, redaction)` pair in `redactions` to the + /// handler in insertion order. The first error aborts the batch. + /// + /// The default loops [`redact_at`] in [`Redactions`] insertion + /// order; handlers with ordering constraints (see + /// [`AudioHandler::redact`]) override this default. 
+ /// + /// [`redact_at`]: TextHandler::redact_at + /// [`AudioHandler::redact`]: crate::handler::AudioHandler::redact async fn redact( &mut self, redactions: Redactions, - ) -> Result<(), Error>; + ) -> Result<(), Error> { + for (location, redaction) in redactions { + self.redact_at(&location, redaction).await?; + } + Ok(()) + } } diff --git a/crates/nvisy-codec/src/handler/text/text_handler.rs b/crates/nvisy-codec/src/handler/text/text_handler.rs index f690cfef..4e42d4df 100644 --- a/crates/nvisy-codec/src/handler/text/text_handler.rs +++ b/crates/nvisy-codec/src/handler/text/text_handler.rs @@ -9,8 +9,7 @@ use nvisy_ontology::entity::TextLocation; use super::TextData; use crate::document::LocationStream; -use crate::handler::{Handler, TextHandler}; -use crate::transform::{Redactions, TextRedaction}; +use crate::handler::{Handler, TextHandler, TextRedaction}; /// A type-erased text handler backed by a boxed trait object. pub struct BoxedTextHandler(Box); @@ -55,11 +54,12 @@ impl TextHandler for BoxedTextHandler { self.0.read(location).await } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &TextLocation, + redaction: TextRedaction, ) -> Result<(), Error> { - self.0.redact(redactions).await + self.0.redact_at(location, redaction).await } } diff --git a/crates/nvisy-codec/src/handler/text/txt_handler.rs b/crates/nvisy-codec/src/handler/text/txt_handler.rs index 71fc6c93..52efd5dc 100644 --- a/crates/nvisy-codec/src/handler/text/txt_handler.rs +++ b/crates/nvisy-codec/src/handler/text/txt_handler.rs @@ -15,10 +15,10 @@ use nvisy_core::content::{ContentData, ContentSource}; use nvisy_core::media::{DocumentType, TextFormat}; use nvisy_ontology::entity::TextLocation; +use super::{TextRedaction, apply_text_redaction}; use crate::document::{Located, LocationStream}; use crate::handler::text::TextData; use crate::handler::{Handler, TextHandler}; -use crate::transform::{Redactions, TextRedaction, apply_text_redactions}; const 
TARGET: &str = "txt-handler"; @@ -89,24 +89,22 @@ impl TextHandler for TxtHandler { line.get(local_start..local_end).map(TextData::from) } - async fn redact( + async fn redact_at( &mut self, - redactions: Redactions, + location: &TextLocation, + redaction: TextRedaction, ) -> Result<(), Error> { - if redactions.is_empty() { - return Ok(()); - } let offsets = self.line_offsets(); - for (loc, items) in redactions { - let Some(line_idx) = offsets - .iter() - .position(|&(start, _)| start == loc.start_offset) - else { - continue; - }; - apply_text_redactions(&mut self.lines[line_idx], &items, TARGET)?; - } - Ok(()) + let Some(line_idx) = offsets + .iter() + .position(|&(start, end)| location.start_offset >= start && location.end_offset <= end) + else { + return Ok(()); + }; + let line_start = offsets[line_idx].0; + let start = location.start_offset - line_start; + let end = location.end_offset - line_start; + apply_text_redaction(&mut self.lines[line_idx], &redaction, start, end, TARGET) } } @@ -172,7 +170,7 @@ mod tests { use nvisy_core::Error; use super::*; - use crate::transform::{ConflictPolicy, TextOutput}; + use crate::handler::{ConflictPolicy, Redactions, TextHandler, TextOutput}; fn handler(text: &str) -> TxtHandler { let trailing_newline = text.ends_with('\n'); @@ -228,13 +226,13 @@ mod tests { } #[tokio::test] - async fn redact_replaces_substring() -> Result<(), Error> { + async fn redact_replaces_whole_line() -> Result<(), Error> { let mut h = handler("hello\nworld\n"); let items: Vec<_> = h.locations().collect().await; let mut rs = Redactions::new(ConflictPolicy::Reject); rs.try_insert( items[1].location.clone(), - TextRedaction::new(0, 5, TextOutput::replace("[REDACTED]")), + TextRedaction::new(TextOutput::replace("[REDACTED]")), ) .unwrap(); h.redact(rs).await?; @@ -242,6 +240,26 @@ mod tests { Ok(()) } + #[tokio::test] + async fn redact_substring_within_line() -> Result<(), Error> { + // Entity-shaped location: bytes 6..11 in "hello world" picks the + // 
substring "world", which lives inside the single-line span. + let mut h = handler("hello world"); + let mut rs = Redactions::new(ConflictPolicy::Reject); + rs.try_insert( + TextLocation { + start_offset: 6, + end_offset: 11, + ..Default::default() + }, + TextRedaction::new(TextOutput::replace("[X]")), + ) + .unwrap(); + h.redact(rs).await?; + assert_eq!(h.lines(), &["hello [X]"]); + Ok(()) + } + #[tokio::test] async fn redact_multiple_lines() -> Result<(), Error> { let mut h = handler("aaa\nbbb\nccc\n"); @@ -249,12 +267,12 @@ mod tests { let mut rs = Redactions::new(ConflictPolicy::Reject); rs.try_insert( items[0].location.clone(), - TextRedaction::new(0, 3, TextOutput::replace("[X]")), + TextRedaction::new(TextOutput::replace("[X]")), ) .unwrap(); rs.try_insert( items[2].location.clone(), - TextRedaction::new(0, 3, TextOutput::replace("[Y]")), + TextRedaction::new(TextOutput::replace("[Y]")), ) .unwrap(); h.redact(rs).await?; @@ -272,7 +290,7 @@ mod tests { end_offset: 1000, ..Default::default() }, - TextRedaction::new(0, 1, TextOutput::replace("nope")), + TextRedaction::new(TextOutput::replace("nope")), ) .unwrap(); h.redact(rs).await?; diff --git a/crates/nvisy-codec/src/lib.rs b/crates/nvisy-codec/src/lib.rs index 5865a0fd..33136946 100644 --- a/crates/nvisy-codec/src/lib.rs +++ b/crates/nvisy-codec/src/lib.rs @@ -4,6 +4,5 @@ mod document; pub mod handler; -pub mod transform; pub use self::document::{ContentHandle, Located, LocationStream, Span}; diff --git a/crates/nvisy-codec/src/transform/audio/instruction.rs b/crates/nvisy-codec/src/transform/audio/instruction.rs deleted file mode 100644 index 9a267fee..00000000 --- a/crates/nvisy-codec/src/transform/audio/instruction.rs +++ /dev/null @@ -1,61 +0,0 @@ -//! Audio redaction instruction types. 
- -use nvisy_ontology::primitive::TimeSpan; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; - -use crate::transform::Mergeable; - -/// An audio redaction targeting a time range within its containing span. -/// -/// Span identity is supplied externally via [`Redactions`] — this -/// struct only carries the time span and the rendering method. -/// -/// [`Redactions`]: crate::transform::Redactions -#[derive(Debug, Clone, PartialEq)] -pub struct AudioRedaction { - /// Time interval of the segment to redact. - pub(crate) time_span: TimeSpan, - /// The redaction output that determines the rendering method. - pub(crate) output: AudioOutput, -} - -impl AudioRedaction { - /// Create a new audio redaction. - pub fn new(time_span: TimeSpan, output: AudioOutput) -> Self { - Self { time_span, output } - } -} - -/// Audio redaction output — records the method used. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] -#[serde(tag = "method", rename_all = "snake_case")] -pub enum AudioOutput { - /// Segment replaced with silence. - Silence, - /// Segment removed entirely. - Remove, - /// Segment replaced with provided audio data. - Replace { data: Vec }, -} - -impl Mergeable for AudioRedaction { - fn overlaps(&self, other: &Self) -> bool { - self.time_span.overlaps(&other.time_span) - } - - /// Merge two overlapping audio redactions. - /// - /// Returns `Some` only when both share the same [`AudioOutput`] - /// (method *and* parameters) — the merged redaction unions the - /// time spans. Returns `None` when the methods differ. 
- fn try_merge(self, other: Self) -> Option { - if self.output != other.output { - return None; - } - Some(Self { - time_span: self.time_span.union(&other.time_span), - output: self.output, - }) - } -} diff --git a/crates/nvisy-codec/src/transform/audio/mod.rs b/crates/nvisy-codec/src/transform/audio/mod.rs deleted file mode 100644 index ebb72430..00000000 --- a/crates/nvisy-codec/src/transform/audio/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -//! Audio redaction primitives. - -mod instruction; - -pub use self::instruction::{AudioOutput, AudioRedaction}; diff --git a/crates/nvisy-codec/src/transform/image/apply.rs b/crates/nvisy-codec/src/transform/image/apply.rs deleted file mode 100644 index 7061c7ff..00000000 --- a/crates/nvisy-codec/src/transform/image/apply.rs +++ /dev/null @@ -1,51 +0,0 @@ -//! Helper for applying a batch of [`ImageRedaction`]s to a -//! [`DynamicImage`] in place. - -use image::DynamicImage; - -use super::instruction::{ImageOutput, ImageRedaction}; -use super::ops::ImageOps; - -const TARGET: &str = "nvisy_codec::transform::image"; - -/// Apply a slice of image redactions to `img` in place. -/// -/// Each redaction's bounding box is converted to pixel coordinates and -/// the corresponding output method is applied. Replace outputs whose -/// embedded image data fails to decode are skipped with a warning. 
-pub(crate) fn apply_image_redactions(img: &mut DynamicImage, redactions: &[ImageRedaction]) { - for redaction in redactions { - let region = redaction.bounding_box.to_pixel(); - match &redaction.output { - ImageOutput::Blur { sigma } => { - img.apply_gaussian_blur(&region, *sigma); - } - ImageOutput::Block { color } => { - img.apply_block_overlay(&region, *color); - } - ImageOutput::Pixelate { block_size } => { - img.apply_pixelate(&region, *block_size); - } - ImageOutput::Replace { data } => { - let replacement = match image::load_from_memory(data) { - Ok(r) => r, - Err(e) => { - tracing::warn!( - target: TARGET, - region = ?region, - error = %e, - "failed to decode replacement image data, skipping region" - ); - continue; - } - }; - let resized = replacement.resize_exact( - region.width, - region.height, - image::imageops::FilterType::Lanczos3, - ); - image::imageops::overlay(img, &resized, region.x as i64, region.y as i64); - } - } - } -} diff --git a/crates/nvisy-codec/src/transform/image/instruction.rs b/crates/nvisy-codec/src/transform/image/instruction.rs deleted file mode 100644 index 2b3a38ca..00000000 --- a/crates/nvisy-codec/src/transform/image/instruction.rs +++ /dev/null @@ -1,67 +0,0 @@ -//! Image redaction instruction types. - -use nvisy_ontology::primitive::{BoundingBox, Color}; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; - -use crate::transform::Mergeable; - -/// An image redaction targeting a bounding box within its containing span. -/// -/// Span identity is supplied externally via [`Redactions`] — this -/// struct only carries the bounding box and the rendering method. -/// -/// [`Redactions`]: crate::transform::Redactions -#[derive(Debug, Clone, PartialEq)] -pub struct ImageRedaction { - /// Bounding box of the region to redact within the span. - pub(crate) bounding_box: BoundingBox, - /// The redaction output that determines the rendering method.
- pub(crate) output: ImageOutput, -} - -impl ImageRedaction { - /// Create a new image redaction. - pub fn new(bounding_box: BoundingBox, output: ImageOutput) -> Self { - Self { - bounding_box, - output, - } - } -} - -/// Image redaction output: records the method used and its parameters. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] -#[serde(tag = "method", rename_all = "snake_case")] -pub enum ImageOutput { - /// Gaussian blur applied to the region. - Blur { sigma: f32 }, - /// Opaque block overlay on the region. - Block { color: Color }, - /// Pixelation (mosaic) applied to the region. - Pixelate { block_size: u32 }, - /// Region replaced with provided image data. - Replace { data: Vec }, -} - -impl Mergeable for ImageRedaction { - fn overlaps(&self, other: &Self) -> bool { - self.bounding_box.overlaps(&other.bounding_box) - } - - /// Merge two overlapping image redactions. - /// - /// Returns `Some` only when both share the same [`ImageOutput`] - /// (method *and* parameters) — the merged redaction unions the - /// bounding boxes. Returns `None` when the methods differ (e.g. - /// `Blur { sigma: 5.0 }` vs `Pixelate { block_size: 10 }`). - fn try_merge(self, other: Self) -> Option { - if self.output != other.output { - return None; - } - Some(Self { - bounding_box: self.bounding_box.union(&other.bounding_box), - output: self.output, - }) - } -} diff --git a/crates/nvisy-codec/src/transform/image/mod.rs b/crates/nvisy-codec/src/transform/image/mod.rs deleted file mode 100644 index 2016351f..00000000 --- a/crates/nvisy-codec/src/transform/image/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -//! Image redaction primitives. 
- -mod apply; -mod instruction; -mod ops; - -pub(crate) use self::apply::apply_image_redactions; -pub use self::instruction::{ImageOutput, ImageRedaction}; diff --git a/crates/nvisy-codec/src/transform/mergeable.rs b/crates/nvisy-codec/src/transform/mergeable.rs deleted file mode 100644 index d01797c7..00000000 --- a/crates/nvisy-codec/src/transform/mergeable.rs +++ /dev/null @@ -1,25 +0,0 @@ -//! [`Mergeable`] trait — overlap detection and merge semantics for -//! redaction payloads. - -/// Trait for redactions that have a comparable extent within a span. -/// -/// Required by [`Redactions`] to detect overlap on insert and to -/// produce a merged redaction under [`ConflictPolicy::Merge`]. -/// -/// [`Redactions`]: crate::transform::Redactions -/// [`ConflictPolicy::Merge`]: crate::transform::ConflictPolicy::Merge -pub trait Mergeable: Sized { - /// Returns `true` when this redaction overlaps with `other`. - /// - /// Both redactions are assumed to live in the same span — callers - /// must already group by span identity before calling this. - fn overlaps(&self, other: &Self) -> bool; - - /// Try to combine two overlapping redactions into one. - /// - /// Returns `Some(merged)` when the redactions can be meaningfully - /// combined (e.g. same replacement output, unioned extents). - /// Returns `None` when they overlap but cannot be reconciled - /// (e.g. different replacement strings, different image methods). - fn try_merge(self, other: Self) -> Option; -} diff --git a/crates/nvisy-codec/src/transform/mod.rs b/crates/nvisy-codec/src/transform/mod.rs deleted file mode 100644 index 9a19111e..00000000 --- a/crates/nvisy-codec/src/transform/mod.rs +++ /dev/null @@ -1,32 +0,0 @@ -//! Redaction transform primitives. -//! -//! [`Redactions`] groups per-modality instructions by their target -//! span identity and enforces an overlap [`ConflictPolicy`] on insert. -//! Handler capability traits ([`TextHandler`], [`ImageHandler`], -//! 
[`AudioHandler`]) consume these collections directly via their -//! `redact` methods. -//! -//! [`Redactions`]: crate::transform::Redactions -//! [`ConflictPolicy`]: crate::transform::ConflictPolicy -//! [`TextHandler`]: crate::handler::TextHandler -//! [`ImageHandler`]: crate::handler::ImageHandler -//! [`AudioHandler`]: crate::handler::AudioHandler - -mod audio; -mod image; -mod mergeable; -mod policy; -mod redactions; -mod tabular; -mod text; - -pub use self::audio::{AudioOutput, AudioRedaction}; -pub(crate) use self::image::apply_image_redactions; -pub use self::image::{ImageOutput, ImageRedaction}; -pub use self::mergeable::Mergeable; -pub use self::policy::{ConflictPolicy, InsertError}; -pub use self::redactions::Redactions; -pub use self::tabular::TabularRedaction; -pub(crate) use self::tabular::apply_tabular_redactions; -pub(crate) use self::text::apply_text_redactions; -pub use self::text::{TextOutput, TextRedaction}; diff --git a/crates/nvisy-codec/src/transform/redactions.rs b/crates/nvisy-codec/src/transform/redactions.rs deleted file mode 100644 index 583756da..00000000 --- a/crates/nvisy-codec/src/transform/redactions.rs +++ /dev/null @@ -1,276 +0,0 @@ -//! Generic [`Redactions`] collection keyed by span identity. -//! -//! Stores redactions grouped by their target span, with overlap -//! detection on insert. The [`ConflictPolicy`] decides what happens -//! when two redactions overlap within the same span. See the -//! [`mergeable`] and [`policy`] modules for the traits and types -//! that govern collection behavior. -//! -//! Transforms consume a `Redactions` instead of a flat slice, so the -//! grouping + overlap-checking work is done once at the engine -//! boundary rather than re-done in each handler. The collection does -//! not expose raw map access: callers consume the collection via -//! [`IntoIterator`]. -//! -//! [`Redactions`]: crate::transform::Redactions -//! [`ConflictPolicy`]: crate::transform::ConflictPolicy -//! 
[`mergeable`]: crate::transform::mergeable -//! [`policy`]: crate::transform::policy - -use std::fmt; - -use super::mergeable::Mergeable; -use super::policy::{ConflictPolicy, InsertError}; - -/// A set of redactions grouped by their target span, with overlap -/// detection on insert. -/// -/// `S` is the span identity (e.g. [`TextLocation`], [`ImageLocation`]) -/// and must implement [`PartialEq`] so the collection can find the -/// span an inserted redaction belongs to. -/// -/// `R` is the per-span redaction payload (e.g. [`TextRedaction`]) -/// and must implement [`Mergeable`] to support overlap detection. -/// -/// Internally backed by a `Vec<(S, Vec)>` — span counts are -/// typically small and insertion order matters for deterministic -/// downstream behavior, so a `HashMap` would be more cost than -/// benefit and would not work for keys with `f64` fields anyway. -/// -/// [`TextLocation`]: nvisy_ontology::entity::TextLocation -/// [`ImageLocation`]: nvisy_ontology::entity::ImageLocation -/// [`TextRedaction`]: crate::transform::TextRedaction -pub struct Redactions { - policy: ConflictPolicy, - spans: Vec<(S, Vec)>, -} - -impl Redactions { - /// Create an empty collection with the given conflict policy. - pub fn new(policy: ConflictPolicy) -> Self { - Self { - policy, - spans: Vec::new(), - } - } - - /// The conflict policy in effect. - pub fn policy(&self) -> ConflictPolicy { - self.policy - } - - /// Total number of redactions across all spans. - pub fn len(&self) -> usize { - self.spans.iter().map(|(_, rs)| rs.len()).sum() - } - - /// Number of distinct spans that hold redactions. - pub fn span_count(&self) -> usize { - self.spans.len() - } - - /// Returns `true` if the collection holds no redactions. - pub fn is_empty(&self) -> bool { - self.spans.iter().all(|(_, rs)| rs.is_empty()) - } -} - -impl Redactions -where - S: PartialEq, - R: Mergeable, -{ - /// Insert a redaction targeting the given span. 
- /// - /// If the span already holds an overlapping redaction, behavior - /// is determined by the configured [`ConflictPolicy`]: - /// - /// - [`Reject`]: returns [`InsertError::OverlapRejected`]. - /// - [`Merge`]: attempts to merge; returns - /// [`InsertError::NotMergeable`] when the merge fails. - /// - [`Replace`]: drops the existing overlapping redaction and - /// inserts the new one. - /// - /// [`Reject`]: ConflictPolicy::Reject - /// [`Merge`]: ConflictPolicy::Merge - /// [`Replace`]: ConflictPolicy::Replace - pub fn try_insert(&mut self, span: S, redaction: R) -> Result<(), InsertError> { - let bucket = match self.spans.iter().position(|(s, _)| s == &span) { - Some(idx) => &mut self.spans[idx].1, - None => { - self.spans.push((span, vec![redaction])); - return Ok(()); - } - }; - - let overlap_idx = bucket.iter().position(|r| r.overlaps(&redaction)); - let Some(idx) = overlap_idx else { - bucket.push(redaction); - return Ok(()); - }; - - match self.policy { - ConflictPolicy::Reject => Err(InsertError::OverlapRejected), - ConflictPolicy::Replace => { - bucket[idx] = redaction; - Ok(()) - } - ConflictPolicy::Merge => { - let existing = bucket.remove(idx); - match existing.try_merge(redaction) { - Some(merged) => { - bucket.push(merged); - Ok(()) - } - None => Err(InsertError::NotMergeable), - } - } - } - } -} - -impl Default for Redactions { - fn default() -> Self { - Self::new(ConflictPolicy::default()) - } -} - -impl IntoIterator for Redactions { - type IntoIter = std::vec::IntoIter<(S, Vec)>; - type Item = (S, Vec); - - /// Consume the collection, yielding each span paired with its - /// owned redactions in insertion order. 
- fn into_iter(self) -> Self::IntoIter { - self.spans.into_iter() - } -} - -impl fmt::Debug for Redactions { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Redactions") - .field("policy", &self.policy) - .field("spans", &self.spans.len()) - .field("redactions", &self.len()) - .finish() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[derive(Debug, Clone, PartialEq)] - struct R { - start: usize, - end: usize, - out: String, - } - - impl R { - fn new(start: usize, end: usize, out: &str) -> Self { - Self { - start, - end, - out: out.to_owned(), - } - } - } - - impl Mergeable for R { - fn overlaps(&self, other: &Self) -> bool { - self.start < other.end && other.start < self.end - } - - fn try_merge(self, other: Self) -> Option { - if self.out != other.out { - return None; - } - Some(R { - start: self.start.min(other.start), - end: self.end.max(other.end), - out: self.out, - }) - } - } - - #[test] - fn insert_into_new_span() { - let mut rs = Redactions::::new(ConflictPolicy::Reject); - rs.try_insert(0, R::new(0, 5, "x")).unwrap(); - assert_eq!(rs.len(), 1); - assert_eq!(rs.span_count(), 1); - } - - #[test] - fn insert_non_overlapping_into_same_span() { - let mut rs = Redactions::::new(ConflictPolicy::Reject); - rs.try_insert(0, R::new(0, 5, "x")).unwrap(); - rs.try_insert(0, R::new(10, 15, "y")).unwrap(); - assert_eq!(rs.len(), 2); - assert_eq!(rs.span_count(), 1); - } - - #[test] - fn insert_into_different_spans() { - let mut rs = Redactions::::new(ConflictPolicy::Reject); - rs.try_insert(0, R::new(0, 5, "x")).unwrap(); - rs.try_insert(1, R::new(0, 5, "x")).unwrap(); - assert_eq!(rs.span_count(), 2); - } - - #[test] - fn reject_policy_errors_on_overlap() { - let mut rs = Redactions::::new(ConflictPolicy::Reject); - rs.try_insert(0, R::new(0, 5, "x")).unwrap(); - let err = rs.try_insert(0, R::new(3, 8, "y")).unwrap_err(); - assert!(matches!(err, InsertError::OverlapRejected)); - assert_eq!(rs.len(), 1); - } - - #[test] - fn 
replace_policy_overwrites_overlap() { - let mut rs = Redactions::::new(ConflictPolicy::Replace); - rs.try_insert(0, R::new(0, 5, "x")).unwrap(); - rs.try_insert(0, R::new(3, 8, "y")).unwrap(); - assert_eq!(rs.len(), 1); - let (_, items) = rs.into_iter().next().unwrap(); - assert_eq!(items[0].out, "y"); - } - - #[test] - fn merge_policy_combines_same_output() { - let mut rs = Redactions::::new(ConflictPolicy::Merge); - rs.try_insert(0, R::new(0, 5, "x")).unwrap(); - rs.try_insert(0, R::new(3, 8, "x")).unwrap(); - assert_eq!(rs.len(), 1); - let (_, items) = rs.into_iter().next().unwrap(); - assert_eq!(items[0].start, 0); - assert_eq!(items[0].end, 8); - } - - #[test] - fn merge_policy_errors_when_unmergeable() { - let mut rs = Redactions::::new(ConflictPolicy::Merge); - rs.try_insert(0, R::new(0, 5, "x")).unwrap(); - let err = rs.try_insert(0, R::new(3, 8, "y")).unwrap_err(); - assert!(matches!(err, InsertError::NotMergeable)); - } - - #[test] - fn into_iter_preserves_insertion_order() { - let mut rs = Redactions::::new(ConflictPolicy::Reject); - rs.try_insert(2, R::new(0, 5, "x")).unwrap(); - rs.try_insert(0, R::new(0, 5, "x")).unwrap(); - rs.try_insert(1, R::new(0, 5, "x")).unwrap(); - let spans: Vec = rs.into_iter().map(|(s, _)| s).collect(); - assert_eq!(spans, vec![2, 0, 1]); - } - - #[test] - fn empty_and_len() { - let rs = Redactions::::new(ConflictPolicy::Reject); - assert!(rs.is_empty()); - assert_eq!(rs.len(), 0); - } -} diff --git a/crates/nvisy-codec/src/transform/tabular/apply.rs b/crates/nvisy-codec/src/transform/tabular/apply.rs deleted file mode 100644 index de817bb9..00000000 --- a/crates/nvisy-codec/src/transform/tabular/apply.rs +++ /dev/null @@ -1,51 +0,0 @@ -//! Byte-level helper for applying a batch of [`TabularRedaction`]s to a -//! single cell's value in place. - -use std::cmp::Reverse; - -use nvisy_core::Error; - -use super::instruction::TabularRedaction; - -/// Apply a slice of cell-scoped redactions to `cell` in place. 
-/// -/// Redactions are sorted right-to-left so earlier byte offsets stay -/// valid as later ones are replaced. Returns an error if any offset -/// falls mid-character. -/// -/// The slice must not contain overlapping ranges — that invariant is -/// owned by [`Redactions`] on insert. -/// -/// [`Redactions`]: crate::transform::Redactions -pub(crate) fn apply_tabular_redactions( - cell: &mut String, - redactions: &[TabularRedaction], - target: &'static str, -) -> Result<(), Error> { - let mut items: Vec<&TabularRedaction> = redactions.iter().collect(); - items.sort_by_key(|r| Reverse(r.start)); - - for r in items { - let value = r.output.replacement_value().unwrap_or_default(); - let s = r.start.min(cell.len()); - let e = r.end.min(cell.len()); - if s >= e { - continue; - } - if !cell.is_char_boundary(s) || !cell.is_char_boundary(e) { - return Err(Error::validation( - format!( - "redaction offset falls mid-character \ - (start={}, end={}, len={})", - r.start, - r.end, - cell.len() - ), - target, - )); - } - cell.replace_range(s..e, value); - } - - Ok(()) -} diff --git a/crates/nvisy-codec/src/transform/tabular/instruction.rs b/crates/nvisy-codec/src/transform/tabular/instruction.rs deleted file mode 100644 index e41a1da4..00000000 --- a/crates/nvisy-codec/src/transform/tabular/instruction.rs +++ /dev/null @@ -1,55 +0,0 @@ -//! Tabular redaction instruction types. - -use crate::transform::{Mergeable, TextOutput}; - -/// A tabular redaction targeting a byte range within its containing cell. -/// -/// Cell identity is supplied externally via [`Redactions`] — this -/// struct only carries the intra-cell byte range and the replacement -/// output. -/// -/// This is the tabular counterpart of [`TextRedaction`]: instead of -/// being grouped by a text span, it is grouped by a [`TabularLocation`] -/// cell. 
-/// -/// [`Redactions`]: crate::transform::Redactions -/// [`TextRedaction`]: crate::transform::TextRedaction -/// [`TabularLocation`]: nvisy_ontology::entity::TabularLocation -#[derive(Debug, Clone, PartialEq)] -pub struct TabularRedaction { - /// Byte offset where the redacted region starts within the cell value. - pub(crate) start: usize, - /// Byte offset where the redacted region ends (exclusive) within the cell value. - pub(crate) end: usize, - /// The redaction output that carries the replacement value. - pub(crate) output: TextOutput, -} - -impl TabularRedaction { - /// Create a new tabular redaction. - pub fn new(start: usize, end: usize, output: TextOutput) -> Self { - Self { start, end, output } - } -} - -impl Mergeable for TabularRedaction { - fn overlaps(&self, other: &Self) -> bool { - self.start < other.end && other.start < self.end - } - - /// Merge two overlapping tabular redactions. - /// - /// Returns `Some` only when both share the same [`TextOutput`] — - /// the merged redaction unions the byte ranges. Returns `None` - /// when the outputs differ. - fn try_merge(self, other: Self) -> Option { - if self.output != other.output { - return None; - } - Some(Self { - start: self.start.min(other.start), - end: self.end.max(other.end), - output: self.output, - }) - } -} diff --git a/crates/nvisy-codec/src/transform/tabular/mod.rs b/crates/nvisy-codec/src/transform/tabular/mod.rs deleted file mode 100644 index c23ae21c..00000000 --- a/crates/nvisy-codec/src/transform/tabular/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -//! Tabular redaction primitives. - -mod apply; -mod instruction; - -pub(crate) use self::apply::apply_tabular_redactions; -pub use self::instruction::TabularRedaction; diff --git a/crates/nvisy-codec/src/transform/text/apply.rs b/crates/nvisy-codec/src/transform/text/apply.rs deleted file mode 100644 index 62c9369c..00000000 --- a/crates/nvisy-codec/src/transform/text/apply.rs +++ /dev/null @@ -1,102 +0,0 @@ -//! 
Byte-level helper for applying a batch of [`TextRedaction`]s -//! to a string in place. - -use std::cmp::Reverse; - -use nvisy_core::Error; - -use super::instruction::TextRedaction; - -/// Apply a slice of redactions to `content` in place. -/// -/// Redactions are sorted right-to-left so that earlier byte offsets -/// remain valid as later ones are replaced. Returns an error if any -/// offset falls mid-character. -/// -/// The slice must not contain overlapping ranges — that invariant is -/// owned by [`Redactions`] on insert. -/// -/// [`Redactions`]: crate::transform::Redactions -pub(crate) fn apply_text_redactions( - content: &mut String, - redactions: &[TextRedaction], - target: &'static str, -) -> Result<(), Error> { - let mut items: Vec<&TextRedaction> = redactions.iter().collect(); - items.sort_by_key(|r| Reverse(r.start)); - - for r in items { - let value = r.output.replacement_value().unwrap_or_default(); - let s = r.start.min(content.len()); - let e = r.end.min(content.len()); - if s >= e { - continue; - } - if !content.is_char_boundary(s) || !content.is_char_boundary(e) { - return Err(Error::validation( - format!( - "redaction offset falls mid-character \ - (start={}, end={}, len={})", - r.start, - r.end, - content.len() - ), - target, - )); - } - content.replace_range(s..e, value); - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::transform::TextOutput; - - fn redaction(start: usize, end: usize, replacement: &str) -> TextRedaction { - TextRedaction::new(start, end, TextOutput::replace(replacement)) - } - - #[test] - fn single_replacement() { - let mut s = String::from("hello world"); - apply_text_redactions(&mut s, &[redaction(0, 5, "[X]")], "test").unwrap(); - assert_eq!(s, "[X] world"); - } - - #[test] - fn right_to_left_application() { - let mut s = String::from("aaa bbb ccc"); - let rs = vec![redaction(0, 3, "[A]"), redaction(8, 11, "[C]")]; - apply_text_redactions(&mut s, &rs, "test").unwrap(); - assert_eq!(s, "[A] bbb 
[C]"); - } - - #[test] - fn remove_output() { - let mut s = String::from("hello world"); - apply_text_redactions( - &mut s, - &[TextRedaction::new(5, 11, TextOutput::Remove)], - "test", - ) - .unwrap(); - assert_eq!(s, "hello"); - } - - #[test] - fn out_of_bounds_clipped() { - let mut s = String::from("short"); - apply_text_redactions(&mut s, &[redaction(0, 999, "[X]")], "test").unwrap(); - assert_eq!(s, "[X]"); - } - - #[test] - fn mid_character_rejected() { - let mut s = String::from("héllo"); // 'é' is 2 bytes - let err = apply_text_redactions(&mut s, &[redaction(0, 2, "[X]")], "test").unwrap_err(); - assert!(err.to_string().contains("mid-character")); - } -} diff --git a/crates/nvisy-codec/src/transform/text/mod.rs b/crates/nvisy-codec/src/transform/text/mod.rs deleted file mode 100644 index 0fb821d0..00000000 --- a/crates/nvisy-codec/src/transform/text/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -//! Text redaction primitives. - -mod apply; -mod instruction; - -pub(crate) use self::apply::apply_text_redactions; -pub use self::instruction::{TextOutput, TextRedaction}; diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 2d834de4..038d8dad 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -43,7 +43,7 @@ infer = { workspace = true, features = [] } # Derive macros and error handling thiserror = { workspace = true, features = [] } -derive_more = { workspace = true, features = ["as_ref", "deref", "display"] } +derive_more = { workspace = true, features = ["as_ref", "deref", "display", "is_variant"] } strum = { workspace = true, features = [] } # Encoding and hashing diff --git a/crates/nvisy-core/src/media/content_kind.rs b/crates/nvisy-core/src/media/content_kind.rs index 70adba76..fac53881 100644 --- a/crates/nvisy-core/src/media/content_kind.rs +++ b/crates/nvisy-core/src/media/content_kind.rs @@ -4,6 +4,7 @@ //! into broad categories. Extension-to-kind mapping is handled by the //! engine's format registry. 
+use derive_more::IsVariant; use serde::{Deserialize, Serialize}; use strum::{AsRefStr, Display, EnumIter, EnumString}; @@ -13,7 +14,8 @@ use strum::{AsRefStr, Display, EnumIter, EnumString}; /// of specific file extensions or MIME types. The engine's format registry /// handles the mapping from extensions/MIME types to content kinds. #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] -#[derive(AsRefStr, Display, EnumString, EnumIter, Serialize, Deserialize)] +#[derive(AsRefStr, Display, EnumString, EnumIter, IsVariant)] +#[derive(Serialize, Deserialize)] #[strum(serialize_all = "lowercase")] #[serde(rename_all = "lowercase")] pub enum ContentKind { @@ -29,29 +31,3 @@ pub enum ContentKind { #[default] Unknown, } - -impl ContentKind { - /// Check if this content kind represents text-based content - #[must_use] - pub fn is_text_based(&self) -> bool { - matches!(self, Self::Text) - } - - /// Check if this content kind represents a document - #[must_use] - pub fn is_document(&self) -> bool { - matches!(self, Self::Document) - } - - /// Check if this content kind represents a spreadsheet - #[must_use] - pub fn is_spreadsheet(&self) -> bool { - matches!(self, Self::Spreadsheet) - } - - /// Check if this content kind represents an image - #[must_use] - pub fn is_image(&self) -> bool { - matches!(self, Self::Image) - } -} diff --git a/crates/nvisy-engine/src/operation/envelope/document.rs b/crates/nvisy-engine/src/operation/envelope/document.rs index 6d3d9ba4..7f5faf31 100644 --- a/crates/nvisy-engine/src/operation/envelope/document.rs +++ b/crates/nvisy-engine/src/operation/envelope/document.rs @@ -8,9 +8,9 @@ use std::fmt; use futures::StreamExt; -use nvisy_codec::handler::{AudioData, ImageData, TextData}; -use nvisy_codec::transform::{ - AudioRedaction, ImageRedaction, Redactions, TabularRedaction, TextRedaction, +use nvisy_codec::handler::{ + AudioData, AudioRedaction, ImageData, ImageRedaction, Redactions, TabularRedaction, TextData, + TextRedaction, }; use 
nvisy_codec::{ContentHandle, Located}; use nvisy_core::Error; diff --git a/crates/nvisy-engine/src/operation/redaction/apply.rs b/crates/nvisy-engine/src/operation/redaction/apply.rs index 06a1c898..4e392cde 100644 --- a/crates/nvisy-engine/src/operation/redaction/apply.rs +++ b/crates/nvisy-engine/src/operation/redaction/apply.rs @@ -8,7 +8,7 @@ use std::collections::HashMap; -use nvisy_codec::transform::{ +use nvisy_codec::handler::{ AudioOutput, AudioRedaction, ConflictPolicy, ImageOutput, ImageRedaction, Redactions, TabularRedaction, TextOutput, TextRedaction, }; @@ -114,10 +114,7 @@ impl<'a> RedactionApplicator<'a> { ); redactions - .try_insert( - loc.clone(), - TextRedaction::new(loc.start_offset, loc.end_offset, output), - ) + .try_insert(loc.clone(), TextRedaction::new(output)) .map_err(|e| Error::validation(e.to_string(), "redaction-apply-text"))?; } @@ -158,23 +155,18 @@ impl<'a> RedactionApplicator<'a> { self.envelope.redaction_map.entries[idx].replacement = replacement; } - // Intra-cell byte range: prefer explicit offsets if the - // entity provided them, otherwise redact the entire cell. 
- let start = loc.start_offset.unwrap_or(0); - let end = loc.end_offset.unwrap_or(value.len()); - tracing::trace!( target: TARGET, %entity_id, row = loc.row_index, col = loc.column_index, - start, - end, + start = ?loc.start_offset, + end = ?loc.end_offset, "built tabular redaction instruction", ); redactions - .try_insert(loc.clone(), TabularRedaction::new(start, end, output)) + .try_insert(loc.clone(), TabularRedaction::new(output)) .map_err(|e| Error::validation(e.to_string(), "redaction-apply-tabular"))?; } @@ -221,7 +213,7 @@ impl<'a> RedactionApplicator<'a> { ); redactions - .try_insert(loc.clone(), ImageRedaction::new(loc.bounding_box, output)) + .try_insert(loc.clone(), ImageRedaction::new(output)) .map_err(|e| Error::validation(e.to_string(), "redaction-apply-image"))?; } @@ -270,7 +262,7 @@ impl<'a> RedactionApplicator<'a> { ); redactions - .try_insert(loc.clone(), AudioRedaction::new(loc.time_span, output)) + .try_insert(loc.clone(), AudioRedaction::new(output)) .map_err(|e| Error::validation(e.to_string(), "redaction-apply-audio"))?; } diff --git a/crates/nvisy-ontology/src/entity/location/audio.rs b/crates/nvisy-ontology/src/entity/location/audio.rs index 49005bf9..69efb7aa 100644 --- a/crates/nvisy-ontology/src/entity/location/audio.rs +++ b/crates/nvisy-ontology/src/entity/location/audio.rs @@ -5,7 +5,7 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use super::Overlap; +use super::{Mergeable, Overlap}; use crate::primitive::TimeSpan; /// Location of an entity within an audio stream. @@ -42,3 +42,19 @@ impl Overlap for AudioLocation { self.time_span.overlaps(&other.time_span) } } + +impl Mergeable for AudioLocation { + /// Merge two audio locations by unioning time spans when their + /// `audio_id` and `speaker_id` match. Different speakers or + /// different documents cannot merge. 
+ fn try_merge(self, other: Self) -> Option { + if self.audio_id != other.audio_id || self.speaker_id != other.speaker_id { + return None; + } + Some(Self { + time_span: self.time_span.union(&other.time_span), + speaker_id: self.speaker_id, + audio_id: self.audio_id, + }) + } +} diff --git a/crates/nvisy-ontology/src/entity/location/image.rs b/crates/nvisy-ontology/src/entity/location/image.rs index 72312506..0a641f60 100644 --- a/crates/nvisy-ontology/src/entity/location/image.rs +++ b/crates/nvisy-ontology/src/entity/location/image.rs @@ -5,7 +5,7 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use super::Overlap; +use super::{Mergeable, Overlap}; use crate::primitive::BoundingBox; /// Location of an entity within an image. @@ -47,3 +47,19 @@ impl Overlap for ImageLocation { self.bounding_box.overlaps(&other.bounding_box) } } + +impl Mergeable for ImageLocation { + /// Merge two image locations by unioning bounding boxes when their + /// `image_id` and `page_number` match. Different documents or + /// different pages cannot merge. + fn try_merge(self, other: Self) -> Option { + if self.image_id != other.image_id || self.page_number != other.page_number { + return None; + } + Some(Self { + bounding_box: self.bounding_box.union(&other.bounding_box), + image_id: self.image_id, + page_number: self.page_number, + }) + } +} diff --git a/crates/nvisy-ontology/src/entity/location/mod.rs b/crates/nvisy-ontology/src/entity/location/mod.rs index 20076e1d..0f67d07a 100644 --- a/crates/nvisy-ontology/src/entity/location/mod.rs +++ b/crates/nvisy-ontology/src/entity/location/mod.rs @@ -30,6 +30,21 @@ pub trait Overlap { fn overlaps(&self, other: &Self) -> bool; } +/// Trait for combining two values into one when they can be reconciled. 
+/// +/// Used by [`Redactions`] (and any other collection that groups +/// targets) under a merge policy: when two entries collide (per +/// [`Overlap`]), the collection asks both the location and the +/// payload whether they can fuse. Returns `Some(merged)` when the +/// two can be combined (e.g. unioned bounding boxes, identical +/// outputs), `None` when they cannot (e.g. different tabular cells, +/// conflicting replacement strings). +/// +/// [`Redactions`]: https://docs.rs/nvisy-codec/latest/nvisy_codec/transform/struct.Redactions.html +pub trait Mergeable: Sized { + fn try_merge(self, other: Self) -> Option; +} + /// A modality-specific location for a detected entity. /// /// Exactly one variant is set per entity, enforcing the invariant that diff --git a/crates/nvisy-ontology/src/entity/location/tabular.rs b/crates/nvisy-ontology/src/entity/location/tabular.rs index cb3bef9e..64f638db 100644 --- a/crates/nvisy-ontology/src/entity/location/tabular.rs +++ b/crates/nvisy-ontology/src/entity/location/tabular.rs @@ -4,7 +4,7 @@ use derive_builder::Builder; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use super::Overlap; +use super::{Mergeable, Overlap}; /// Location of an entity within tabular data. #[derive(Debug, Clone, PartialEq, Eq, Builder)] @@ -63,6 +63,38 @@ impl Overlap for TabularLocation { } } +impl Mergeable for TabularLocation { + /// Merge two tabular locations when their cell coordinates match + /// (same `row_index` + `column_index` + `sheet_name`). Intra-cell + /// byte offsets union when present on both sides; otherwise the + /// result has no offsets (meaning "whole cell"). 
+ fn try_merge(self, other: Self) -> Option { + if self.row_index != other.row_index + || self.column_index != other.column_index + || self.sheet_name != other.sheet_name + { + return None; + } + let (start, end) = match ( + self.start_offset, + self.end_offset, + other.start_offset, + other.end_offset, + ) { + (Some(s1), Some(e1), Some(s2), Some(e2)) => (Some(s1.min(s2)), Some(e1.max(e2))), + _ => (None, None), + }; + Some(Self { + row_index: self.row_index, + column_index: self.column_index, + start_offset: start, + end_offset: end, + column_name: self.column_name.or(other.column_name), + sheet_name: self.sheet_name, + }) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/nvisy-ontology/src/entity/location/text.rs b/crates/nvisy-ontology/src/entity/location/text.rs index 609b1f2d..1c39b785 100644 --- a/crates/nvisy-ontology/src/entity/location/text.rs +++ b/crates/nvisy-ontology/src/entity/location/text.rs @@ -4,7 +4,7 @@ use derive_builder::Builder; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use super::Overlap; +use super::{Mergeable, Overlap}; /// Location of an entity within text content. #[derive(Debug, Clone, Default, PartialEq, Eq, Builder)] @@ -61,6 +61,40 @@ impl Overlap for TextLocation { } } +impl Mergeable for TextLocation { + /// Merge two text locations by unioning byte offsets when their + /// non-range identity (page/line) matches. Context offsets union + /// when present on both sides; otherwise the result has no + /// context window. 
+ fn try_merge(self, other: Self) -> Option { + if self.page_number != other.page_number || self.line_number != other.line_number { + return None; + } + Some(Self { + start_offset: self.start_offset.min(other.start_offset), + end_offset: self.end_offset.max(other.end_offset), + context_start_offset: option_min(self.context_start_offset, other.context_start_offset), + context_end_offset: option_max(self.context_end_offset, other.context_end_offset), + page_number: self.page_number, + line_number: self.line_number, + }) + } +} + +fn option_min(a: Option, b: Option) -> Option { + match (a, b) { + (Some(x), Some(y)) => Some(x.min(y)), + _ => None, + } +} + +fn option_max(a: Option, b: Option) -> Option { + match (a, b) { + (Some(x), Some(y)) => Some(x.max(y)), + _ => None, + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/nvisy-ontology/src/entity/mod.rs b/crates/nvisy-ontology/src/entity/mod.rs index ee079d1e..773c411a 100644 --- a/crates/nvisy-ontology/src/entity/mod.rs +++ b/crates/nvisy-ontology/src/entity/mod.rs @@ -12,8 +12,6 @@ mod method; mod sensitivity; mod source; -use std::slice; - use derive_builder::Builder; use derive_more::{Deref, DerefMut, From, IntoIterator}; use schemars::JsonSchema; @@ -24,8 +22,8 @@ pub use self::annotation::{Annotation, AnnotationKind, AnnotationTarget, Annotat pub use self::category::EntityCategory; pub use self::kind::EntityKind; pub use self::location::{ - AudioLocation, AudioLocationBuilder, ImageLocation, ImageLocationBuilder, Location, Overlap, - TabularLocation, TabularLocationBuilder, TextLocation, TextLocationBuilder, + AudioLocation, AudioLocationBuilder, ImageLocation, ImageLocationBuilder, Location, Mergeable, + Overlap, TabularLocation, TabularLocationBuilder, TextLocation, TextLocationBuilder, }; pub use self::method::{ AnnotationProvenance, ExtractionMethod, ModelKind, ModelProvenance, PatternProvenance, @@ -125,7 +123,7 @@ impl EntityBuilder { #[derive(Deref, DerefMut, From, IntoIterator)] 
#[derive(Serialize, Deserialize, JsonSchema)] #[serde(transparent)] -pub struct Entities(pub Vec); +pub struct Entities(#[into_iterator(owned, ref, ref_mut)] pub Vec); impl Entities { /// Create an empty collection. @@ -168,15 +166,6 @@ impl Entities { } } -impl<'a> IntoIterator for &'a Entities { - type IntoIter = slice::Iter<'a, Entity>; - type Item = &'a Entity; - - fn into_iter(self) -> Self::IntoIter { - self.0.iter() - } -} - impl FromIterator for Entities { fn from_iter>(iter: I) -> Self { Self(iter.into_iter().collect())