From 2b2a93abac946b05eeaefafde686550b0de40aa3 Mon Sep 17 00:00:00 2001 From: tianzeshi Date: Sun, 31 May 2026 13:31:23 +0800 Subject: [PATCH 1/2] added openai compatible captioner interface --- .../CaptionLocal/captioner/__init__.py | 3 + .../CaptionLocal/captioner/customEndpoint.py | 136 ++++++++++++++++++ .../CaptionLocal/customEndpointConfig.py | 99 +++++++++++++ .../CaptionLocal/imageDescriber.py | 16 ++- addon/globalPlugins/CaptionLocal/models.json | 8 ++ 5 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py create mode 100644 addon/globalPlugins/CaptionLocal/customEndpointConfig.py diff --git a/addon/globalPlugins/CaptionLocal/captioner/__init__.py b/addon/globalPlugins/CaptionLocal/captioner/__init__.py index 7928e31..4e9cf07 100644 --- a/addon/globalPlugins/CaptionLocal/captioner/__init__.py +++ b/addon/globalPlugins/CaptionLocal/captioner/__init__.py @@ -40,5 +40,8 @@ def imageCaptionerFactory( from .qwen import QwenImageCaptioner modelDir = os.path.dirname(configPath) return QwenImageCaptioner(modelDir) + elif modelArchitecture == "CustomEndpoint": + from .customEndpoint import CustomEndpointCaptioner + return CustomEndpointCaptioner.from_config(configPath) else: raise NotImplementedError(f"Unsupported model architecture: {modelArchitecture}") diff --git a/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py new file mode 100644 index 0000000..184381d --- /dev/null +++ b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py @@ -0,0 +1,136 @@ +# -*- coding: UTF-8 -*- +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

import base64
import json
import requests
import io
from typing import Callable
from logHandler import log
from .base import ImageCaptioner

try:
    _
except NameError:
    # No translation function is defined in this scope: fall back to identity.
    _ = lambda x: x

# Timeout, in seconds, handed to ``requests.post`` for every API call.
_REQUEST_TIMEOUT = 60

# Leading byte signatures of the image formats we can label precisely.
_IMAGE_SIGNATURES = (
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"BM", "image/bmp"),
)


def _sniffImageMimeType(data: bytes) -> str:
    """
    Guess the MIME type of an encoded image from its leading bytes.

    :param data: Raw encoded image bytes.
    :return: A MIME type string; ``image/jpeg`` when the format is not recognised
        (the value that was previously used unconditionally).
    """
    head = bytes(data[:12])
    for signature, mimeType in _IMAGE_SIGNATURES:
        if head.startswith(signature):
            return mimeType
    # WebP is a RIFF container: "RIFF" + 4 size bytes + "WEBP".
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


class CustomEndpointCaptioner(ImageCaptioner):
    """Captioner using custom OpenAI-compatible API endpoints."""

    def __init__(self, endpoint: str, api_key: str | None, model: str, prompt: str | None = None):
        """
        Initialize the custom endpoint captioner.

        :param endpoint: Base URL of the API; a trailing slash is removed.
        :param api_key: API key. When empty or None, no Authorization header is sent.
        :param model: Model name.
        :param prompt: Custom prompt. A translated default prompt is used when empty or None.
        """
        self.endpoint = endpoint.rstrip("/")
        self.api_key = api_key
        self.model = model
        # Translators: default prompt for image captioning
        self.prompt = prompt or _("Please describe the picture in one sentence")

    @classmethod
    def from_config(cls, config_path: str):
        """
        Create a CustomEndpointCaptioner from a JSON config file.

        :param config_path: Path of the JSON file holding ``endpoint``, ``model`` and
            optionally ``api_key`` and ``prompt``.
        :return: A configured captioner.
        :raises ValueError: If the file does not contain a JSON object with a non-empty
            ``endpoint`` and ``model``.
        """
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        # A non-object document (list, string, ...) has no ``get``; report it as an
        # invalid configuration instead of failing with AttributeError.
        if isinstance(config, dict):
            endpoint = config.get("endpoint")
            model = config.get("model")
            if endpoint and model:
                return cls(endpoint, config.get("api_key"), model, config.get("prompt"))

        raise ValueError(f"Invalid custom endpoint configuration in {config_path}")

    def _buildPayload(self, imageData: bytes, maxLength: int | None, streaming: bool) -> dict:
        """
        Build the chat-completions request body for one image.

        :param imageData: Raw encoded image bytes.
        :param maxLength: Value for ``max_tokens``; omitted when falsy.
        :param streaming: Value for the ``stream`` flag.
        :return: The JSON-serialisable request payload.
        """
        encodedImage = base64.b64encode(imageData).decode("ascii")
        # Label the data URL with the detected format rather than always claiming JPEG.
        mimeType = _sniffImageMimeType(imageData)
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:{mimeType};base64,{encodedImage}"},
                        },
                    ],
                }
            ],
            "stream": streaming,
        }
        if maxLength:
            payload["max_tokens"] = maxLength
        return payload

    @staticmethod
    def _parseStreamChunk(dataStr: str) -> str:
        """
        Extract the text delta from one ``data:`` payload of a streamed response.

        :param dataStr: The text following the ``data:`` prefix.
        :return: ``choices[0].delta.content`` when present and a string, otherwise an
            empty string (malformed or content-free chunks are skipped, as before).
        """
        try:
            chunk = json.loads(dataStr)
        except ValueError:
            return ""
        if not isinstance(chunk, dict):
            return ""
        choices = chunk.get("choices")
        if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
            return ""
        delta = choices[0].get("delta")
        if not isinstance(delta, dict):
            return ""
        content = delta.get("content")
        return content if isinstance(content, str) else ""

    @classmethod
    def _readStream(cls, response, onToken: Callable[[str], None]) -> str:
        """
        Consume a streamed response line by line, forwarding each text delta to ``onToken``.

        :param response: The streaming ``requests`` response.
        :param onToken: Callback invoked with every non-empty text delta.
        :return: The concatenation of all deltas, stripped of surrounding whitespace.
        """
        pieces = []
        callbackFailureLogged = False
        for rawLine in response.iter_lines():
            if not rawLine:
                continue
            line = rawLine.decode("utf-8").strip()
            if not line.startswith("data:"):
                continue
            dataStr = line[len("data:"):].strip()
            if dataStr == "[DONE]":
                break
            content = cls._parseStreamChunk(dataStr)
            if not content:
                continue
            pieces.append(content)
            try:
                onToken(content)
            except Exception:
                # A failing callback must not abort the caption, but it should not be
                # hidden either. Log only the first failure to avoid flooding the log.
                if not callbackFailureLogged:
                    log.exception("onToken callback raised while streaming caption")
                    callbackFailureLogged = True
        return "".join(pieces).strip()

    def generateCaption(
        self,
        image: str | bytes,
        maxLength: int | None = None,
        onToken: Callable[[str], None] | None = None,
    ) -> str:
        """
        Generate caption via custom endpoint.

        :param image: Path of an image file, or the encoded image bytes themselves.
        :param maxLength: Optional ``max_tokens`` limit sent to the API.
        :param onToken: Optional callback; when given, the response is streamed and the
            callback receives each text delta as it arrives.
        :return: The caption text, stripped of surrounding whitespace.
        :raises Exception: Any failure (file access, HTTP error, unexpected response
            shape) is logged and re-raised unchanged.
        """
        streaming = bool(onToken)
        try:
            if isinstance(image, str):
                with open(image, "rb") as f:
                    img_data = f.read()
            else:
                img_data = image

            headers = {"Content-Type": "application/json"}
            if self.api_key:
                headers["Authorization"] = f"Bearer {self.api_key}"

            # The context manager closes the response, releasing the connection even
            # when a streamed body is abandoned early (e.g. on "[DONE]").
            with requests.post(
                f"{self.endpoint}/chat/completions",
                headers=headers,
                json=self._buildPayload(img_data, maxLength, streaming),
                stream=streaming,
                timeout=_REQUEST_TIMEOUT,
            ) as response:
                response.raise_for_status()
                if streaming:
                    return self._readStream(response, onToken)
                data = response.json()
            return data["choices"][0]["message"]["content"].strip()
        except Exception as e:
            log.exception(f"Custom endpoint API request failed: {e}")
            raise
# license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

import wx
import json
import os
from logHandler import log

try:
    _
except NameError:
    # No translation function is defined in this scope: fall back to identity.
    _ = lambda x: x

# Settings that must be non-empty for a configuration to be usable.
# The API key is deliberately absent: it is optional.
_REQUIRED_KEYS = ("endpoint", "model")


class CustomEndpointConfigDialog(wx.Dialog):
    """Dialog for configuring custom API endpoints."""

    def __init__(self, parent, config_path):
        """
        Build the dialog, pre-filling the fields from an existing config file if any.

        :param parent: Parent window passed to ``wx.Dialog`` (may be None).
        :param config_path: Path of the JSON config file to read initial values from.
        """
        super().__init__(parent, title=_("Configure Custom Endpoint"), size=(500, 350))
        self.config_path = config_path
        self.config_data = {}
        self._load_existing_config()
        self._initUI()

    def _load_existing_config(self):
        """Load ``self.config_path`` into ``self.config_data``; keep ``{}`` on any failure."""
        try:
            with open(self.config_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            # Nothing saved yet: start from defaults.
            return
        except Exception:
            log.exception(f"Failed to load config from {self.config_path}")
            return
        # Only a JSON object can be queried with ``get`` when filling the fields.
        if isinstance(loaded, dict):
            self.config_data = loaded

    def _storedText(self, key, default=""):
        """
        Return the stored string for ``key``, or ``default`` when it is missing or not a string.

        ``get_config`` saves an empty prompt as None (JSON null), so a plain
        ``dict.get(key, default)`` would hand None to ``wx.TextCtrl``.
        """
        value = self.config_data.get(key)
        return value if isinstance(value, str) else default

    def _initUI(self):
        """Create the four labelled text fields and the OK/Cancel buttons."""
        mainSizer = wx.BoxSizer(wx.VERTICAL)

        flexSizer = wx.FlexGridSizer(rows=4, cols=2, vgap=10, hgap=10)
        flexSizer.AddGrowableCol(1)

        # Endpoint URL
        flexSizer.Add(wx.StaticText(self, label=_("Endpoint URL:")), 0, wx.ALIGN_CENTER_VERTICAL)
        self.endpointCtrl = wx.TextCtrl(self, value=self._storedText("endpoint", "https://api.openai.com/v1"))
        flexSizer.Add(self.endpointCtrl, 1, wx.EXPAND)

        # API Key
        flexSizer.Add(wx.StaticText(self, label=_("API Key:")), 0, wx.ALIGN_CENTER_VERTICAL)
        self.apiKeyCtrl = wx.TextCtrl(self, value=self._storedText("api_key"), style=wx.TE_PASSWORD)
        flexSizer.Add(self.apiKeyCtrl, 1, wx.EXPAND)

        # Model Name
        flexSizer.Add(wx.StaticText(self, label=_("Model Name:")), 0, wx.ALIGN_CENTER_VERTICAL)
        self.modelCtrl = wx.TextCtrl(self, value=self._storedText("model", "gpt-4o-mini"))
        flexSizer.Add(self.modelCtrl, 1, wx.EXPAND)

        # Custom Prompt
        flexSizer.Add(wx.StaticText(self, label=_("Custom Prompt (Optional):")), 0, wx.ALIGN_CENTER_VERTICAL)
        self.promptCtrl = wx.TextCtrl(self, value=self._storedText("prompt"))
        flexSizer.Add(self.promptCtrl, 1, wx.EXPAND)

        mainSizer.Add(flexSizer, 1, wx.ALL | wx.EXPAND, 15)

        # Buttons
        btnSizer = self.CreateButtonSizer(wx.OK | wx.CANCEL)
        mainSizer.Add(btnSizer, 0, wx.ALL | wx.CENTER, 10)
        # Intercept OK so the dialog cannot be accepted with required fields empty.
        self.Bind(wx.EVT_BUTTON, self._onOk, id=wx.ID_OK)

        self.SetSizer(mainSizer)

    def _onOk(self, event):
        """Accept the dialog only when the endpoint URL and model name are filled in."""
        for ctrl in (self.endpointCtrl, self.modelCtrl):
            if not ctrl.GetValue().strip():
                wx.MessageBox(
                    # Translators: error shown when a required custom endpoint field is empty
                    _("Endpoint URL and model name are required."),
                    # Translators: title of the custom endpoint configuration error message
                    _("Configure Custom Endpoint"),
                    wx.OK | wx.ICON_ERROR,
                    self,
                )
                ctrl.SetFocus()
                # Not calling Skip() keeps the dialog open.
                return
        # Let the default OK handling close the dialog.
        event.Skip()

    def get_config(self):
        """
        Return the dialog's current values as the dict that is written to the config file.

        :return: Dict with ``architectures``, ``endpoint``, ``api_key``, ``model`` and
            ``prompt`` (None when the prompt field is empty).
        """
        return {
            "architectures": ["CustomEndpoint"],
            "endpoint": self.endpointCtrl.GetValue().strip(),
            "api_key": self.apiKeyCtrl.GetValue().strip(),
            "model": self.modelCtrl.GetValue().strip(),
            "prompt": self.promptCtrl.GetValue().strip() or None
        }


def show_config_dialog(parent, config_path):
    """
    Show the config dialog and save if OK is pressed.

    :param parent: Parent window for the dialog (may be None).
    :param config_path: Path of the JSON file to pre-fill from and to write to.
    :return: True when the configuration was saved, False when the dialog was cancelled.
    """
    dlg = CustomEndpointConfigDialog(parent, config_path)
    try:
        if dlg.ShowModal() != wx.ID_OK:
            return False
        config = dlg.get_config()
    finally:
        # Modal dialogs are not destroyed automatically when they close.
        dlg.Destroy()

    # Ensure directory exists; a bare file name has no directory part to create.
    config_dir = os.path.dirname(config_path)
    if config_dir:
        os.makedirs(config_dir, exist_ok=True)
    # NOTE: the API key is written to this file as plain text.
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4, ensure_ascii=False)
    return True


def is_config_valid(config_path):
    """
    Check if the config file exists and has usable values for the required fields.

    :param config_path: Path of the JSON config file.
    :return: True only when the file holds a JSON object whose ``endpoint`` and
        ``model`` are non-blank strings; False when the file is missing, unreadable
        or not valid JSON.
    """
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        return False
    if not isinstance(config, dict):
        return False
    return all(
        isinstance(config.get(key), str) and config[key].strip()
        for key in _REQUIRED_KEYS
    )
(os.path.exists(configPath) and "CustomEndpoint" in open(configPath, "r", encoding="utf-8").read()): + from . import customEndpointConfig + if not customEndpointConfig.is_config_valid(configPath): + def show_ui(): + if wx.CallAfter(customEndpointConfig.show_config_dialog(None, configPath)): + # Reload after config + self.loadModelInBackground(localModelDirPath) + else: + ui.message(_("Custom endpoint not configured")) + + wx.CallAfter(show_ui) + return encoderPath = os.path.join(localModelDirPath, "onnx", "encoder_model_quantized.onnx") decoderPath = os.path.join(localModelDirPath, "onnx", "decoder_model_merged_quantized.onnx") - configPath = os.path.join(localModelDirPath, "config.json") try: from . import modelConfig diff --git a/addon/globalPlugins/CaptionLocal/models.json b/addon/globalPlugins/CaptionLocal/models.json index 6a77799..5de6a36 100644 --- a/addon/globalPlugins/CaptionLocal/models.json +++ b/addon/globalPlugins/CaptionLocal/models.json @@ -58,6 +58,14 @@ "config.json", "preprocessor_config.json" ] + }, + { + "id": "custom/endpoint", + "name": "Custom OpenAI-compatible API Endpoint", + "resolvePath": "", + "files": [ + "config.json" + ] } ] } From c431767da9cd47df8a6ce4da16ddbd3e01a84ee0 Mon Sep 17 00:00:00 2001 From: tianzeshi Date: Sun, 31 May 2026 14:28:26 +0800 Subject: [PATCH 2/2] fix empty api key --- .../globalPlugins/CaptionLocal/captioner/customEndpoint.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py index 184381d..14dc601 100644 --- a/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py +++ b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py @@ -48,7 +48,7 @@ def from_config(cls, config_path: str): model = config.get("model") prompt = config.get("prompt", None) - if endpoint and api_key and model: + if endpoint and model: return cls(endpoint, api_key, model, prompt) raise 
ValueError(f"Invalid custom endpoint configuration in {config_path}") @@ -72,9 +72,10 @@ def generateCaption( base64_image = base64.b64encode(img_data).decode('utf-8') headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {self.api_key}" + "Content-Type": "application/json" } + if self.api_key: + headers["Authorization"] = f"Bearer {self.api_key}" payload = { "model": self.model,