Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/nvisy-codec/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ hipstr = { workspace = true, features = [] }

# Derive macros and error handling
derive_more = { workspace = true, features = ["as_ref", "deref", "deref_mut", "display", "from"] }
thiserror = { workspace = true, features = [] }

# Image processing
image = { workspace = true, features = [] }
Expand Down
61 changes: 61 additions & 0 deletions crates/nvisy-codec/src/document/located.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
//! [`Located`]: a location paired with its production-time provenance.

use nvisy_core::content::ContentSource;

/// Pairs a location with the [`ContentSource`] of the handler that
/// emitted it.
///
/// Handler `locations()` streams yield this wrapper so that every
/// location can be attributed to a specific content artifact. Only the
/// inner location acts as the structural key in [`Redactions`]; the
/// source is provenance metadata describing how the location was
/// produced and does not take part in that identity.
///
/// [`Redactions`]: crate::transform::Redactions
#[derive(Debug, Clone, PartialEq)]
pub struct Located<L> {
    /// Handler-level source this location originated from.
    pub source: ContentSource,
    /// Structural position inside the handler's data model.
    pub location: L,
}

impl<L> Located<L> {
    /// Tag `location` with the `source` that produced it.
    pub fn new(source: ContentSource, location: L) -> Self {
        Located { source, location }
    }

    /// Consume the wrapper and hand back only the location; the source
    /// is dropped.
    pub fn into_location(self) -> L {
        let Self { location, .. } = self;
        location
    }

    /// Apply `f` to the inner location while carrying the source over
    /// untouched.
    pub fn map<T>(self, f: impl FnOnce(L) -> T) -> Located<T> {
        let Self { source, location } = self;
        Located::new(source, f(location))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `map` must rewrite the location and leave the source as it was.
    #[test]
    fn map_transforms_inner() {
        // NOTE(review): `source` is used again after being passed by value,
        // which presumably relies on `ContentSource` being `Copy` — confirm.
        let source = ContentSource::new();
        let stringified = Located::new(source, 7u32).map(|value| value.to_string());
        assert_eq!(stringified.location, "7");
        assert_eq!(stringified.source, source);
    }

    /// `into_location` yields only the wrapped location.
    #[test]
    fn into_location_discards_source() {
        let located = Located::new(ContentSource::new(), 42u32);
        let location = located.into_location();
        assert_eq!(location, 42);
    }
}
116 changes: 59 additions & 57 deletions crates/nvisy-codec/src/document/mod.rs
Original file line number Diff line number Diff line change
@@ -1,35 +1,42 @@
//! Type-erased content handle for all supported formats.

mod located;
mod span;
mod stream;

use std::fmt;

use derive_more::{From, IsVariant, TryInto};
use futures::StreamExt;
use nvisy_core::Error;
use nvisy_core::content::{Content, ContentData, ContentSource};
use nvisy_core::media::{
AudioFormat, DocumentType, ImageFormat, SpreadsheetFormat, TextFormat, WordFormat,
};
use nvisy_ontology::entity::{AudioLocation, ImageLocation, Location, TextLocation};
use nvisy_ontology::entity::{AudioLocation, ImageLocation, TextLocation};

pub use self::located::Located;
pub use self::span::Span;
pub use self::stream::SpanStream;
pub use self::stream::LocationStream;
#[cfg(feature = "docx")]
use crate::handler::{DocxLoader, DocxParams};
#[cfg(feature = "html")]
use crate::handler::{HtmlLoader, HtmlParams};
#[cfg(feature = "pdf")]
use crate::handler::{PdfLoader, PdfParams};
use crate::handler::{
AudioData, AudioHandler, BoxedAudioHandler, BoxedImageHandler, BoxedRichHandler,
BoxedTextHandler, CsvLoader, CsvParams, Handler, ImageData, ImageHandler, JpegLoader,
JpegParams, JsonLoader, JsonParams, Loader, MarkdownLoader, MarkdownParams, Mp3Loader,
Mp3Params, PngLoader, PngParams, TextData, TextHandler, TiffLoader, TiffParams, TxtLoader,
TxtParams, WavLoader, WavParams, XlsxLoader, XlsxParams,
};
use crate::transform::{AudioRedaction, ImageRedaction, TextRedaction};
use crate::transform::{AudioRedaction, ImageRedaction, Redactions, TextRedaction};

/// A fully type-erased document that can hold any supported format.
///
/// Groups documents into four modality families:
/// - **Text**: plain text, CSV, JSON, HTML, XLSX
/// - **Image**: PNG, JPEG
/// - **Image**: PNG, JPEG, TIFF
/// - **Audio**: WAV, MP3
/// - **Rich**: PDF, DOCX (multi-modal documents with text + images)
#[derive(From, IsVariant, TryInto)]
Expand Down Expand Up @@ -79,92 +86,92 @@ impl ContentHandle {
}
}

/// Stream text spans from text or rich documents.
pub async fn text_spans(&self) -> SpanStream<'_, TextLocation, TextData> {
/// Stream text locations from text or rich documents.
pub fn text_locations(&self) -> LocationStream<'_, TextLocation> {
match self {
Self::Text(h) => h.text_spans().await,
Self::Rich(h) => h.text_spans().await,
_ => SpanStream::new(futures::stream::empty()),
Self::Text(h) => h.locations(),
Self::Rich(h) => TextHandler::locations(h),
Self::Image(_) | Self::Audio(_) => LocationStream::empty(),
}
}

/// Stream image spans from image or rich documents.
pub async fn image_spans(&self) -> SpanStream<'_, ImageLocation, ImageData> {
/// Stream image locations from image or rich documents.
pub fn image_locations(&self) -> LocationStream<'_, ImageLocation> {
match self {
Self::Image(h) => h.image_spans().await,
Self::Rich(h) => h.image_spans().await,
Self::Text(_) | Self::Audio(_) => SpanStream::new(futures::stream::empty()),
Self::Image(h) => h.locations(),
Self::Rich(h) => ImageHandler::locations(h),
Self::Text(_) | Self::Audio(_) => LocationStream::empty(),
}
}

/// Stream audio spans from audio documents.
pub async fn audio_spans(&self) -> SpanStream<'_, AudioLocation, AudioData> {
/// Stream audio locations from audio documents.
pub fn audio_locations(&self) -> LocationStream<'_, AudioLocation> {
match self {
Self::Audio(h) => h.audio_spans().await,
_ => SpanStream::new(futures::stream::empty()),
Self::Audio(h) => h.locations(),
_ => LocationStream::empty(),
}
}

/// Collect all text spans into a `Vec`.
pub async fn collect_text_spans(&self) -> Vec<Span<TextLocation, TextData>> {
self.text_spans().await.collect().await
}

/// Collect all image spans into a `Vec`.
pub async fn collect_image_spans(&self) -> Vec<Span<ImageLocation, ImageData>> {
self.image_spans().await.collect().await
/// Read text data at the given location.
///
/// Returns `None` if the location is out of bounds or the handle
/// does not expose text content.
pub async fn read_text(&self, location: &TextLocation) -> Option<TextData> {
match self {
Self::Text(h) => h.read(location).await,
Self::Rich(h) => TextHandler::read(h, location).await,
Self::Image(_) | Self::Audio(_) => None,
}
}

/// Collect all audio spans into a `Vec`.
pub async fn collect_audio_spans(&self) -> Vec<Span<AudioLocation, AudioData>> {
self.audio_spans().await.collect().await
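/// Read image data at the given location.
///
/// Returns `None` if the handle does not expose image content (text
/// and audio handles); otherwise the result comes from the underlying
/// handler's `read`.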
pub async fn read_image(&self, location: &ImageLocation) -> Option<ImageData> {
match self {
Self::Image(h) => h.read(location).await,
Self::Rich(h) => ImageHandler::read(h, location).await,
Self::Text(_) | Self::Audio(_) => None,
}
}

/// Extract the value at the given location, dispatching by modality.
///
/// Returns the text/data at the location if available.
pub async fn value_at(&self, location: &Location) -> Option<String> {
match (self, location) {
(Self::Text(h), Location::Text(loc)) => h.value_at(loc).await,
(Self::Rich(h), Location::Text(loc)) => TextHandler::value_at(h, loc).await,
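/// Read audio data at the given location.
///
/// Returns `None` for every non-audio handle; audio handles defer to
/// the underlying handler's `read`.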
pub async fn read_audio(&self, location: &AudioLocation) -> Option<AudioData> {
match self {
Self::Audio(h) => h.read(location).await,
_ => None,
}
}

/// Apply a batch of text redactions to the document.
///
/// Image and audio handles are left untouched and the call returns
/// `Ok(())`.
pub async fn apply_text_redactions(
&mut self,
redactions: &[TextRedaction<TextLocation>],
redactions: Redactions<TextLocation, TextRedaction>,
) -> Result<(), Error> {
use crate::transform::TextTransform;
match self {
Self::Text(h) => h.redact_text(redactions).await,
Self::Rich(h) => h.redact_text(redactions).await,
Self::Text(h) => h.redact(redactions).await,
Self::Rich(h) => TextHandler::redact(h, redactions).await,
Self::Image(_) | Self::Audio(_) => Ok(()),
}
}

/// Apply a batch of image redactions to the document.
///
/// Text and audio handles are left untouched and the call returns
/// `Ok(())`.
pub async fn apply_image_redactions(
&mut self,
redactions: &[ImageRedaction<ImageLocation>],
redactions: Redactions<ImageLocation, ImageRedaction>,
) -> Result<(), Error> {
use crate::transform::ImageTransform;
match self {
Self::Image(h) => h.redact_images(redactions).await,
Self::Rich(h) => h.redact_images(redactions).await,
Self::Image(h) => h.redact(redactions).await,
Self::Rich(h) => ImageHandler::redact(h, redactions).await,
Self::Text(_) | Self::Audio(_) => Ok(()),
}
}

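/// Apply a batch of audio redactions to the document.
///
/// Only audio handles are redacted; text, image, and rich handles are
/// left untouched and the call returns `Ok(())`.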
pub async fn apply_audio_redactions(
&mut self,
redactions: &[AudioRedaction<AudioLocation>],
redactions: Redactions<AudioLocation, AudioRedaction>,
) -> Result<(), Error> {
use crate::transform::AudioTransform;
match self {
Self::Audio(h) => h.redact_audio(redactions).await,
Self::Audio(h) => h.redact(redactions).await,
Self::Text(_) | Self::Image(_) | Self::Rich(_) => Ok(()),
}
}
Expand Down Expand Up @@ -207,13 +214,10 @@ impl ContentHandle {
.await?
.into(),
#[cfg(feature = "html")]
DocumentType::Html => {
use crate::handler::{HtmlLoader, HtmlParams};
HtmlLoader
.decode(content, &HtmlParams::default())
.await?
.into()
}
DocumentType::Html => HtmlLoader
.decode(content, &HtmlParams::default())
.await?
.into(),
DocumentType::Spreadsheet(SpreadsheetFormat::Csv) => CsvLoader
.decode(content, &CsvParams::default())
.await?
Expand Down Expand Up @@ -276,7 +280,6 @@ impl ContentHandle {
DocumentType::Pdf => {
#[cfg(feature = "pdf")]
{
use crate::handler::{PdfLoader, PdfParams};
let handler = PdfLoader.decode(content, &PdfParams::default()).await?;
Ok(Self::from(BoxedRichHandler::from(handler)))
}
Expand All @@ -291,7 +294,6 @@ impl ContentHandle {
DocumentType::Word(WordFormat::Docx) => {
#[cfg(feature = "docx")]
{
use crate::handler::{DocxLoader, DocxParams};
let handler = DocxLoader.decode(content, &DocxParams).await?;
Ok(Self::from(BoxedRichHandler::from(handler)))
}
Expand Down
Loading