diff --git a/ollama/_types.py b/ollama/_types.py
index 96529d63..f0e22a2f 100644
--- a/ollama/_types.py
+++ b/ollama/_types.py
@@ -186,6 +186,48 @@ def serialize_model(self):
         raise ValueError('Invalid image data, expected base64 string or path to image file') from Exception
 
 
+class Audio(BaseModel):
+  """
+  Audio input that serializes to a base64-encoded string.
+
+  `value` may be a `Path` or `str` path to an audio file, raw audio `bytes`,
+  or a `str` that already holds base64-encoded audio data.
+  """
+
+  value: Union[str, bytes, Path]
+
+  @model_serializer
+  def serialize_model(self):
+    """
+    Return `value` as a base64 string.
+
+    Raises `ValueError` when a string ends in a known audio file extension but
+    could not be read as an existing file, or when it cannot be decoded as base64.
+    """
+    if isinstance(self.value, (Path, bytes)):
+      return b64encode(self.value.read_bytes() if isinstance(self.value, Path) else self.value).decode()
+
+    if isinstance(self.value, str):
+      try:
+        if Path(self.value).exists():
+          return b64encode(Path(self.value).read_bytes()).decode()
+      except (OSError, ValueError):
+        # Long base64 string can't be wrapped in Path, so try to treat as base64 string
+        pass
+
+      # String might be a file path, but might not exist; the extension is compared case-insensitively so 'clip.MP3' is caught too
+      if self.value.rsplit('.', 1)[-1].lower() in ('mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'ogg', 'wav', 'webm'):
+        raise ValueError(f'File {self.value} does not exist')
+
+      try:
+        # Try to decode to check if it is already base64
+        b64decode(self.value)
+      except Exception as e:
+        # Chain the actual decode error so the traceback shows why decoding failed
+        raise ValueError('Invalid audio data, expected base64 string or path to audio file') from e
+      return self.value
+
+
 class GenerateRequest(BaseGenerateRequest):
   prompt: Optional[str] = None
   'Prompt to generate response from.'
@@ -327,6 +369,19 @@ class Message(SubscriptableBaseModel):
   Valid image formats depend on the model. See the model card for more information.
   """
 
+  audio: Optional[Sequence[Audio]] = None
+  """
+  Optional list of audio data for multimodal models.
+
+  Valid input types are:
+
+  - `str` or path-like object: path to audio file
+  - `str`: base64-encoded audio data
+  - `bytes` or bytes-like object: raw audio data
+
+  Valid audio formats depend on the model. See the model card for more information.
+  """
+
   tool_name: Optional[str] = None
   'Name of the executed tool.'
 