From 2b2a93abac946b05eeaefafde686550b0de40aa3 Mon Sep 17 00:00:00 2001
From: tianzeshi <tianzeshi_study@outlook.com>
Date: Sun, 31 May 2026 13:31:23 +0800
Subject: [PATCH 1/2] Add OpenAI-compatible captioner interface

---
 .../CaptionLocal/captioner/__init__.py        |   3 +
 .../CaptionLocal/captioner/customEndpoint.py  | 136 ++++++++++++++++++
 .../CaptionLocal/customEndpointConfig.py      |  99 +++++++++++++
 .../CaptionLocal/imageDescriber.py            |  16 ++-
 addon/globalPlugins/CaptionLocal/models.json  |   8 ++
 5 files changed, 261 insertions(+), 1 deletion(-)
 create mode 100644 addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py
 create mode 100644 addon/globalPlugins/CaptionLocal/customEndpointConfig.py

diff --git a/addon/globalPlugins/CaptionLocal/captioner/__init__.py b/addon/globalPlugins/CaptionLocal/captioner/__init__.py
index 7928e31..4e9cf07 100644
--- a/addon/globalPlugins/CaptionLocal/captioner/__init__.py
+++ b/addon/globalPlugins/CaptionLocal/captioner/__init__.py
@@ -40,5 +40,8 @@ def imageCaptionerFactory(
 		from .qwen import QwenImageCaptioner
 		modelDir = os.path.dirname(configPath)
 		return QwenImageCaptioner(modelDir)
+	elif modelArchitecture == "CustomEndpoint":
+		from .customEndpoint import CustomEndpointCaptioner
+		return CustomEndpointCaptioner.from_config(configPath)
 	else:
 		raise NotImplementedError(f"Unsupported model architecture: {modelArchitecture}")
diff --git a/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py
new file mode 100644
index 0000000..184381d
--- /dev/null
+++ b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py
@@ -0,0 +1,136 @@
+# -*- coding: UTF-8 -*-
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+import base64
+import json
+import requests
+import io
+from typing import Callable
+from logHandler import log
+from .base import ImageCaptioner
+
+try:
+	_  # probe: is a translation function named _ already defined?
+except NameError:
+	_ = lambda x: x  # no: fall back to an identity function so _("...") returns the text unchanged
+
+class CustomEndpointCaptioner(ImageCaptioner):
+	"""Captioner using custom OpenAI-compatible API endpoints."""
+
+	def __init__(self, endpoint: str, api_key: str | None, model: str, prompt: str | None = None):
+		"""
+		Initialize the custom endpoint captioner.
+		
+		:param endpoint: Base URL of the API; trailing slashes are stripped.
+		:param api_key: API key used for the Bearer Authorization header.
+		:param model: Model name sent in the request payload.
+		:param prompt: Custom prompt; a translatable default is used when this is empty or None.
+		"""
+		self.endpoint = endpoint.rstrip('/')
+		self.api_key = api_key
+		self.model = model
+		# Translators: default prompt for image captioning
+		self.prompt = prompt or _("Please describe the picture in one sentence")
+
+	@classmethod
+	def from_config(cls, config_path: str):
+		"""
+		Build a captioner from the JSON file at config_path; raise ValueError if a required value is missing or empty.
+		"""
+		with open(config_path, "r", encoding="utf-8") as f:
+			config = json.load(f)
+		
+		endpoint = config.get("endpoint")
+		api_key = config.get("api_key")
+		model = config.get("model")
+		prompt = config.get("prompt", None)
+		
+		if endpoint and api_key and model:
+			return cls(endpoint, api_key, model, prompt)
+			
+		raise ValueError(f"Invalid custom endpoint configuration in {config_path}")
+
+	def generateCaption(
+		self,
+		image: str | bytes,
+		maxLength: int | None = None,
+		onToken: Callable[[str], None] | None = None,
+	) -> str:
+		"""
+		Generate a caption for image (a file path or raw bytes) via POST {endpoint}/chat/completions.
+		Streams text deltas to onToken when given; returns the stripped caption. Failures are logged and re-raised.
+		"""
+		try:
+			if isinstance(image, str):
+				with open(image, "rb") as f:
+					img_data = f.read()
+			else:
+				img_data = image
+			base64_image = base64.b64encode(img_data).decode('utf-8')
+			
+			headers = {
+				"Content-Type": "application/json",
+				"Authorization": f"Bearer {self.api_key}"
+			}
+			
+			payload = {
+				"model": self.model,
+				"messages": [
+					{
+						"role": "user",
+						"content": [
+							{"type": "text", "text": self.prompt},
+							{
+								"type": "image_url",
+								"image_url": {
+									"url": f"data:image/jpeg;base64,{base64_image}"
+								}
+							}
+						]
+					}
+				],
+				"stream": bool(onToken)
+			}
+
+			if maxLength:
+				payload["max_tokens"] = maxLength
+
+			with requests.post(
+				f"{self.endpoint}/chat/completions",
+				headers=headers,
+				json=payload,
+				stream=bool(onToken),
+				timeout=60
+			) as response:  # close the (possibly streamed) connection on every exit path
+				response.raise_for_status()
+				# Streaming replies are read as "data: {json}" lines until a "data: [DONE]" marker.
+				if onToken:
+					full_text = ""
+					for line in response.iter_lines():
+						if line:
+							line_str = line.decode('utf-8').strip()
+							if line_str.startswith("data:"):
+								data_str = line_str[len("data:"):].strip()
+								if data_str == "[DONE]":
+									break
+								try:
+									data = json.loads(data_str)
+									choices = data.get('choices', [])
+									if not choices:
+										continue
+									content = choices[0].get('delta', {}).get('content', '')
+									if content:
+										full_text += content
+										onToken(content)
+								except Exception:  # best-effort: ignore chunks that fail to parse or deliver
+									continue
+					return full_text.strip()
+				else:
+					data = response.json()
+					return data['choices'][0]['message']['content'].strip()
+		except Exception as e:
+			log.exception(f"Custom endpoint API request failed: {e}")
+			raise
diff --git a/addon/globalPlugins/CaptionLocal/customEndpointConfig.py b/addon/globalPlugins/CaptionLocal/customEndpointConfig.py
new file mode 100644
index 0000000..877868a
--- /dev/null
+++ b/addon/globalPlugins/CaptionLocal/customEndpointConfig.py
@@ -0,0 +1,99 @@
+# -*- coding: UTF-8 -*-
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+import wx
+import json
+import os
+from logHandler import log
+
+try:
+	_  # probe: is a translation function named _ already defined?
+except NameError:
+	_ = lambda x: x  # no: fall back to an identity function so _("...") returns the text unchanged
+
+class CustomEndpointConfigDialog(wx.Dialog):
+	"""Modal dialog for editing the custom endpoint settings: URL, API key, model name and prompt."""
+	def __init__(self, parent, config_path):
+		"""Load any existing settings from config_path, then build the controls pre-filled with them."""
+		super().__init__(parent, title=_("Configure Custom Endpoint"), size=(500, 350))
+		self.config_path = config_path
+		self.config_data = {}
+		self._load_existing_config()
+		self._initUI()
+
+	def _load_existing_config(self):
+		"""Read the JSON file into self.config_data; stay with {} if it is missing, unreadable or not an object."""
+		if os.path.exists(self.config_path):
+			try:
+				with open(self.config_path, "r", encoding="utf-8") as f:
+					loaded = json.load(f)
+				self.config_data = loaded if isinstance(loaded, dict) else {}
+			except Exception:
+				log.exception(f"Failed to load config from {self.config_path}")
+
+	def _initUI(self):
+		"""Create the four labelled text fields and the OK/Cancel buttons."""
+		mainSizer = wx.BoxSizer(wx.VERTICAL)
+		flexSizer = wx.FlexGridSizer(rows=4, cols=2, vgap=10, hgap=10)
+		flexSizer.AddGrowableCol(1)
+		# "get(key) or default" (not get(key, default)): a saved JSON null must not reach wx.TextCtrl as None.
+		# Endpoint URL
+		flexSizer.Add(wx.StaticText(self, label=_("Endpoint URL:")), 0, wx.ALIGN_CENTER_VERTICAL)
+		self.endpointCtrl = wx.TextCtrl(self, value=self.config_data.get("endpoint") or "https://api.openai.com/v1")
+		flexSizer.Add(self.endpointCtrl, 1, wx.EXPAND)
+
+		# API Key
+		flexSizer.Add(wx.StaticText(self, label=_("API Key:")), 0, wx.ALIGN_CENTER_VERTICAL)
+		self.apiKeyCtrl = wx.TextCtrl(self, value=self.config_data.get("api_key") or "", style=wx.TE_PASSWORD)
+		flexSizer.Add(self.apiKeyCtrl, 1, wx.EXPAND)
+
+		# Model Name
+		flexSizer.Add(wx.StaticText(self, label=_("Model Name:")), 0, wx.ALIGN_CENTER_VERTICAL)
+		self.modelCtrl = wx.TextCtrl(self, value=self.config_data.get("model") or "gpt-4o-mini")
+		flexSizer.Add(self.modelCtrl, 1, wx.EXPAND)
+
+		# Custom Prompt
+		flexSizer.Add(wx.StaticText(self, label=_("Custom Prompt (Optional):")), 0, wx.ALIGN_CENTER_VERTICAL)
+		self.promptCtrl = wx.TextCtrl(self, value=self.config_data.get("prompt") or "")
+		flexSizer.Add(self.promptCtrl, 1, wx.EXPAND)
+		mainSizer.Add(flexSizer, 1, wx.ALL | wx.EXPAND, 15)
+		# Buttons
+		btnSizer = self.CreateButtonSizer(wx.OK | wx.CANCEL)
+		mainSizer.Add(btnSizer, 0, wx.ALL | wx.CENTER, 10)
+		self.SetSizer(mainSizer)
+
+	def get_config(self):
+		"""Return the settings dict to be saved; field values are stripped and an empty prompt becomes None."""
+		return {
+			"architectures": ["CustomEndpoint"],
+			"endpoint": self.endpointCtrl.GetValue().strip(),
+			"api_key": self.apiKeyCtrl.GetValue().strip(),
+			"model": self.modelCtrl.GetValue().strip(),
+			"prompt": self.promptCtrl.GetValue().strip() or None
+		}
+
+def show_config_dialog(parent, config_path):
+	"""Show the config dialog modally; on OK write the settings to config_path and return True, else False."""
+	with CustomEndpointConfigDialog(parent, config_path) as dlg:  # wx.Dialog's context manager destroys it on exit
+		if dlg.ShowModal() != wx.ID_OK:
+			return False
+		config = dlg.get_config()
+	# Ensure directory exists ("." when config_path is a bare filename, since makedirs("") raises)
+	os.makedirs(os.path.dirname(config_path) or ".", exist_ok=True)
+	with open(config_path, "w", encoding="utf-8") as f:
+		json.dump(config, f, indent=4, ensure_ascii=False)
+	return True
+
+def is_config_valid(config_path):
+	"""Return True if the config file exists and has a non-empty endpoint and model (the API key is optional)."""
+	if not os.path.exists(config_path):
+		return False
+	try:
+		with open(config_path, "r", encoding="utf-8") as f:
+			config = json.load(f)
+		return all(config.get(k) for k in ["endpoint", "model"])
+	except Exception:
+		return False
diff --git a/addon/globalPlugins/CaptionLocal/imageDescriber.py b/addon/globalPlugins/CaptionLocal/imageDescriber.py
index 401a919..642c8d5 100644
--- a/addon/globalPlugins/CaptionLocal/imageDescriber.py
+++ b/addon/globalPlugins/CaptionLocal/imageDescriber.py
@@ -174,10 +174,24 @@ def _loadModel(self, localModelDirPath: str | None = None) -> None:
 			modelsDir = config.conf["captionLocal"]["modelsDir"]
 			currentModel = config.conf["captionLocal"]["currentModel"]
 			localModelDirPath = os.path.join(modelsDir, currentModel)
+
+		# Special handling for custom/endpoint
+		configPath = os.path.join(localModelDirPath, "config.json")
+		if currentModel == "custom/endpoint" or (os.path.exists(configPath) and "CustomEndpoint" in open(configPath, "r", encoding="utf-8").read()):
+			from . import customEndpointConfig
+			if not customEndpointConfig.is_config_valid(configPath):
+				def show_ui():
+					# show_ui itself is queued with wx.CallAfter, so call the modal dialog directly here.
+					if customEndpointConfig.show_config_dialog(None, configPath):
+						# Reload after config
+						self.loadModelInBackground(localModelDirPath)
+					else:
+						ui.message(_("Custom endpoint not configured"))
+				wx.CallAfter(show_ui)
+				return
 		
 		encoderPath = os.path.join(localModelDirPath, "onnx", "encoder_model_quantized.onnx")
 		decoderPath = os.path.join(localModelDirPath, "onnx", "decoder_model_merged_quantized.onnx")
-		configPath = os.path.join(localModelDirPath, "config.json")
 
 		try:
 			from . import modelConfig
diff --git a/addon/globalPlugins/CaptionLocal/models.json b/addon/globalPlugins/CaptionLocal/models.json
index 6a77799..5de6a36 100644
--- a/addon/globalPlugins/CaptionLocal/models.json
+++ b/addon/globalPlugins/CaptionLocal/models.json
@@ -58,6 +58,14 @@
 				"config.json",
 				"preprocessor_config.json"
 			]
+		},
+		{
+			"id": "custom/endpoint",
+			"name": "Custom OpenAI-compatible API Endpoint",
+			"resolvePath": "",
+			"files": [
+				"config.json"
+			]
 		}
 			]
 }

From c431767da9cd47df8a6ce4da16ddbd3e01a84ee0 Mon Sep 17 00:00:00 2001
From: tianzeshi <tianzeshi_study@outlook.com>
Date: Sun, 31 May 2026 14:28:26 +0800
Subject: [PATCH 2/2] fix empty api key

---
 .../globalPlugins/CaptionLocal/captioner/customEndpoint.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py
index 184381d..14dc601 100644
--- a/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py
+++ b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py
@@ -48,7 +48,7 @@ def from_config(cls, config_path: str):
 		model = config.get("model")
 		prompt = config.get("prompt", None)
 		
-		if endpoint and api_key and model:
+		if endpoint and model:
 			return cls(endpoint, api_key, model, prompt)
 			
 		raise ValueError(f"Invalid custom endpoint configuration in {config_path}")
@@ -72,9 +72,10 @@ def generateCaption(
 			base64_image = base64.b64encode(img_data).decode('utf-8')
 			
 			headers = {
-				"Content-Type": "application/json",
-				"Authorization": f"Bearer {self.api_key}"
+				"Content-Type": "application/json"
 			}
+			if self.api_key:
+				headers["Authorization"] = f"Bearer {self.api_key}"
 			
 			payload = {
 				"model": self.model,