diff --git a/.github/workflows/build_addon.yml b/.github/workflows/build_addon.yml index f24003b..6978aae 100644 --- a/.github/workflows/build_addon.yml +++ b/.github/workflows/build_addon.yml @@ -29,7 +29,6 @@ jobs: pip install --upgrade pip wheel pip install -r requirements.txt pip install -r requirements-libs.txt --target ./addon/globalPlugins/CaptionLocal/libs --platform win_amd64 --only-binary=:all: --no-binary=:none: - pip install miniqinference[cli]==0.1.2 --target ./addon/globalPlugins/CaptionLocal/libs --upgrade --platform win_amd64 --only-binary=:all: --no-binary=:none: - name: Code checks run: | diff --git a/addon/globalPlugins/CaptionLocal/captioner/__init__.py b/addon/globalPlugins/CaptionLocal/captioner/__init__.py index 7928e31..4e9cf07 100644 --- a/addon/globalPlugins/CaptionLocal/captioner/__init__.py +++ b/addon/globalPlugins/CaptionLocal/captioner/__init__.py @@ -40,5 +40,8 @@ def imageCaptionerFactory( from .qwen import QwenImageCaptioner modelDir = os.path.dirname(configPath) return QwenImageCaptioner(modelDir) + elif modelArchitecture == "CustomEndpoint": + from .customEndpoint import CustomEndpointCaptioner + return CustomEndpointCaptioner.from_config(configPath) else: raise NotImplementedError(f"Unsupported model architecture: {modelArchitecture}") diff --git a/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py new file mode 100644 index 0000000..14dc601 --- /dev/null +++ b/addon/globalPlugins/CaptionLocal/captioner/customEndpoint.py @@ -0,0 +1,137 @@ +# -*- coding: UTF-8 -*- +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
try:
    _
except NameError:
    _ = lambda x: x


class CustomEndpointCaptioner(ImageCaptioner):
    """Captioner using custom OpenAI-compatible API endpoints.

    Images are sent base64-encoded to ``{endpoint}/chat/completions`` and the
    textual reply is returned as the caption.
    """

    def __init__(self, endpoint: str, api_key: str, model: str, prompt: str | None = None):
        """
        Initialize the custom endpoint captioner.

        :param endpoint: Base URL of the API; a trailing slash is removed.
        :param api_key: API key. When empty, no ``Authorization`` header is sent.
        :param model: Model name.
        :param prompt: Custom prompt. Falls back to a translatable default when empty.
        """
        self.endpoint = endpoint.rstrip('/')
        self.api_key = api_key
        self.model = model
        # Translators: default prompt for image captioning
        self.prompt = prompt or _("Please describe the picture in one sentence")

    @classmethod
    def from_config(cls, config_path: str):
        """
        Create a CustomEndpointCaptioner from a config file.

        :param config_path: Path to a UTF-8 JSON file with the keys ``endpoint``,
            ``model`` and optionally ``api_key`` and ``prompt``.
        :return: A configured captioner.
        :raises ValueError: If ``endpoint`` or ``model`` is missing or empty.
        """
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        endpoint = config.get("endpoint")
        api_key = config.get("api_key")
        model = config.get("model")
        prompt = config.get("prompt", None)

        if endpoint and model:
            return cls(endpoint, api_key, model, prompt)

        raise ValueError(f"Invalid custom endpoint configuration in {config_path}")

    @staticmethod
    def _iter_stream_content(response):
        """Yield the text fragments carried by a streamed chat-completions response.

        Only lines starting with ``data:`` are considered; iteration stops at the
        ``[DONE]`` sentinel. Events that cannot be parsed are skipped.
        """
        for raw_line in response.iter_lines():
            if not raw_line:
                continue
            line_str = raw_line.decode('utf-8').strip()
            if not line_str.startswith("data:"):
                continue
            data_str = line_str[len("data:"):].strip()
            if data_str == "[DONE]":
                return
            try:
                event = json.loads(data_str)
                choices = event.get('choices') or []
                content = choices[0].get('delta', {}).get('content') if choices else None
            except (ValueError, LookupError, AttributeError, TypeError):
                # Best effort: a malformed event must not abort the whole caption.
                # Only parsing is guarded here, so errors raised by the caller's
                # callback are never hidden.
                continue
            if content:
                yield content

    def generateCaption(
        self,
        image: str | bytes,
        maxLength: int | None = None,
        onToken: Callable[[str], None] | None = None,
    ) -> str:
        """
        Generate caption via custom endpoint.

        :param image: Path of the image file, or the raw image bytes.
        :param maxLength: Optional ``max_tokens`` limit; omitted from the request when falsy.
        :param onToken: Optional callback. When given, the response is streamed and
            the callback receives every text fragment as it arrives.
        :return: The caption with surrounding whitespace stripped.
        :raises Exception: Any failure (I/O, HTTP, malformed reply, callback error)
            is logged and re-raised.
        """
        streaming = onToken is not None
        try:
            if isinstance(image, str):
                with open(image, "rb") as f:
                    img_data = f.read()
            else:
                img_data = image

            base64_image = base64.b64encode(img_data).decode('utf-8')

            headers = {"Content-Type": "application/json"}
            if self.api_key:
                headers["Authorization"] = f"Bearer {self.api_key}"

            payload = {
                "model": self.model,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": self.prompt},
                            {
                                "type": "image_url",
                                # NOTE(review): the MIME type is always declared as JPEG,
                                # whatever the actual format of the image bytes is.
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                "stream": streaming,
            }
            if maxLength:
                payload["max_tokens"] = maxLength

            # The context manager releases the connection even when the stream
            # is abandoned early or an error occurs while reading it.
            with requests.post(
                f"{self.endpoint}/chat/completions",
                headers=headers,
                json=payload,
                stream=streaming,
                timeout=60,
            ) as response:
                response.raise_for_status()

                if not streaming:
                    content = response.json()['choices'][0]['message']['content']
                    # Some backends return a null content; treat it as an empty caption.
                    return (content or "").strip()

                pieces = []
                for content in self._iter_stream_content(response):
                    pieces.append(content)
                    onToken(content)
                return "".join(pieces).strip()
        except Exception as e:
            log.exception(f"Custom endpoint API request failed: {e}")
            raise
try:
    _
except NameError:
    _ = lambda x: x


class CustomEndpointConfigDialog(wx.Dialog):
    """Dialog for configuring custom API endpoints."""

    def __init__(self, parent, config_path):
        """
        Build the dialog and pre-fill it from an existing config file, if any.

        :param parent: Parent window, or None.
        :param config_path: Path of the JSON config file to read defaults from.
        """
        super().__init__(parent, title=_("Configure Custom Endpoint"), size=(500, 350))
        self.config_path = config_path
        self.config_data = {}
        self._load_existing_config()
        self._initUI()

    def _load_existing_config(self):
        """Load previously saved settings into ``self.config_data`` (best effort)."""
        if not os.path.exists(self.config_path):
            return
        try:
            with open(self.config_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception:
            log.exception(f"Failed to load config from {self.config_path}")
            return
        # Anything but a JSON object cannot be queried with .get() later on.
        if isinstance(loaded, dict):
            self.config_data = loaded

    def _stored_text(self, key, default=""):
        """Return the saved string for *key*, or *default* when absent or not a string.

        ``get_config`` stores an empty prompt as ``null``; passing that ``None``
        on as a text control value would fail, so it is replaced by *default*.
        """
        value = self.config_data.get(key, default)
        return value if isinstance(value, str) else default

    def _initUI(self):
        """Create the four labelled text fields and the OK/Cancel buttons."""
        mainSizer = wx.BoxSizer(wx.VERTICAL)

        flexSizer = wx.FlexGridSizer(rows=4, cols=2, vgap=10, hgap=10)
        flexSizer.AddGrowableCol(1)

        # Endpoint URL
        flexSizer.Add(wx.StaticText(self, label=_("Endpoint URL:")), 0, wx.ALIGN_CENTER_VERTICAL)
        self.endpointCtrl = wx.TextCtrl(self, value=self._stored_text("endpoint", "https://api.openai.com/v1"))
        flexSizer.Add(self.endpointCtrl, 1, wx.EXPAND)

        # API Key
        flexSizer.Add(wx.StaticText(self, label=_("API Key:")), 0, wx.ALIGN_CENTER_VERTICAL)
        self.apiKeyCtrl = wx.TextCtrl(self, value=self._stored_text("api_key", ""), style=wx.TE_PASSWORD)
        flexSizer.Add(self.apiKeyCtrl, 1, wx.EXPAND)

        # Model Name
        flexSizer.Add(wx.StaticText(self, label=_("Model Name:")), 0, wx.ALIGN_CENTER_VERTICAL)
        self.modelCtrl = wx.TextCtrl(self, value=self._stored_text("model", "gpt-4o-mini"))
        flexSizer.Add(self.modelCtrl, 1, wx.EXPAND)

        # Custom Prompt
        flexSizer.Add(wx.StaticText(self, label=_("Custom Prompt (Optional):")), 0, wx.ALIGN_CENTER_VERTICAL)
        self.promptCtrl = wx.TextCtrl(self, value=self._stored_text("prompt", ""))
        flexSizer.Add(self.promptCtrl, 1, wx.EXPAND)

        mainSizer.Add(flexSizer, 1, wx.ALL | wx.EXPAND, 15)

        # Buttons
        btnSizer = self.CreateButtonSizer(wx.OK | wx.CANCEL)
        mainSizer.Add(btnSizer, 0, wx.ALL | wx.CENTER, 10)

        self.SetSizer(mainSizer)

    def get_config(self):
        """Return the dialog contents as the dict that is written to ``config.json``.

        All values are stripped; an empty prompt is stored as ``None``.
        """
        return {
            "architectures": ["CustomEndpoint"],
            "endpoint": self.endpointCtrl.GetValue().strip(),
            "api_key": self.apiKeyCtrl.GetValue().strip(),
            "model": self.modelCtrl.GetValue().strip(),
            "prompt": self.promptCtrl.GetValue().strip() or None,
        }


def show_config_dialog(parent, config_path):
    """Show the config dialog and save if OK is pressed.

    :param parent: Parent window for the dialog, or None.
    :param config_path: Path of the JSON file to write.
    :return: True when the user confirmed and the file was written, else False.
    """
    dlg = CustomEndpointConfigDialog(parent, config_path)
    try:
        if dlg.ShowModal() != wx.ID_OK:
            return False
        config = dlg.get_config()
    finally:
        # Dialogs are not released automatically once ShowModal() returns.
        dlg.Destroy()

    # Ensure directory exists; a bare file name has no directory part to create.
    config_dir = os.path.dirname(config_path)
    if config_dir:
        os.makedirs(config_dir, exist_ok=True)
    # NOTE(review): the API key is written to disk in plain text.
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4, ensure_ascii=False)
    return True


def is_config_valid(config_path):
    """Check if the config file exists and has usable required fields.

    ``endpoint`` and ``model`` must both be non-empty strings, so a file
    accepted here is not rejected later for missing or empty values.

    :param config_path: Path of the JSON config file.
    :return: True when the file can be read and both fields are set, else False.
    """
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        # Missing or unreadable file, bad encoding, or invalid JSON.
        return False
    if not isinstance(config, dict):
        return False
    return all(
        isinstance(config.get(key), str) and config[key].strip()
        for key in ("endpoint", "model")
    )
try:
    _
except NameError:
    _ = lambda x: x

# Type for progress callback: (fileName, downloadedBytes, totalBytes, percentage)
ProgressCallback = Callable[[str, int, int, float], None]

MODELS_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "models.json")
LIBS_DIR = os.path.join(os.path.dirname(__file__), "libs")


class DependencyManager:
    """Manages runtime dependencies (Python packages and binaries).

    Runtime and model definitions are read from ``models.json`` next to this
    module; packages are unpacked into the ``libs`` directory next to it.
    """

    def __init__(self):
        self.runtimes = {}
        # Must exist even when the config file is missing or unreadable,
        # otherwise get_required_runtimes() fails with AttributeError.
        self.models = []
        self._load_config()

    def _load_config(self):
        """Populate ``runtimes`` and ``models`` from the config file (best effort)."""
        try:
            if os.path.exists(MODELS_CONFIG_FILE):
                with open(MODELS_CONFIG_FILE, "r", encoding="utf-8") as f:
                    data = json.load(f)
                self.runtimes = data.get("runtimes", {})
                self.models = data.get("models", [])
        except Exception:
            log.exception("Failed to load runtimes config")

    def get_required_runtimes(self, model_id: str) -> List[str]:
        """Get a list of runtime IDs required by a model.

        :param model_id: The ``id`` of a model entry.
        :return: Its ``runtime_dependencies``; empty when the model is unknown
            or declares none.
        """
        for m in self.models:
            if m.get("id") == model_id:
                return m.get("runtime_dependencies", [])
        return []

    def is_runtime_installed(self, runtime_id: str) -> bool:
        """Check if a runtime is already installed in the libs directory.

        Unknown runtime IDs are reported as not installed.
        """
        if runtime_id == "onnxruntime":
            # Check for onnxruntime package directory
            return os.path.exists(os.path.join(LIBS_DIR, "onnxruntime"))
        if runtime_id == "miniqinference":
            # Check for the binary specifically
            return os.path.exists(os.path.join(LIBS_DIR, "bin", "miniqwen-cli.exe"))
        return False

    def download_and_install(self, runtime_id: str, progress_callback: Optional[ProgressCallback] = None) -> bool:
        """Download and install a runtime dependency.

        :param runtime_id: Key into the ``runtimes`` section of the config.
        :param progress_callback: Optional callback receiving
            (fileName, downloadedBytes, totalBytes, percentage).
        :return: True on success; False for unknown runtimes, unsupported
            runtime types, or any installation failure.
        """
        info = self.runtimes.get(runtime_id)
        if not info:
            log.error(f"Unknown runtime: {runtime_id}")
            return False

        if info.get("type") == "pypi":
            return self._install_from_pypi(runtime_id, info, progress_callback)

        return False

    @staticmethod
    def _select_wheel(files: List[dict], py_tag: str, platform_tag: str = "win_amd64") -> Optional[dict]:
        """Pick the best wheel for the interpreter from PyPI file entries.

        Wheel names follow
        ``{name}-{version}[-{build}]-{python tag}-{abi tag}-{platform tag}.whl``,
        so the tags are read from the end of the name; this stays correct when
        the optional build tag is present. Compressed tag sets such as
        ``py2.py3`` are split on ``.``.

        :param files: File entries, each a dict with at least ``filename``.
        :param py_tag: Interpreter tag, e.g. ``cp311``.
        :param platform_tag: Required platform tag.
        :return: The first entry built for *platform_tag*; otherwise the first
            platform-independent (``any``) wheel; otherwise None.
        """
        fallback = None
        for entry in files:
            filename = entry.get("filename", "")
            if not filename.endswith(".whl"):
                continue
            parts = filename[:-len(".whl")].split("-")
            if len(parts) < 5:
                continue
            py_tags = parts[-3].split(".")
            if py_tag not in py_tags and "py3" not in py_tags:
                continue
            platform_tags = parts[-1].split(".")
            if platform_tag in platform_tags:
                return entry
            if fallback is None and "any" in platform_tags:
                fallback = entry
        return fallback

    def _install_from_pypi(self, runtime_id: str, info: dict, progress_callback: Optional[ProgressCallback]) -> bool:
        """Download a wheel from PyPI and unpack it into ``LIBS_DIR``.

        :param runtime_id: Runtime being installed (used for logging and post-install).
        :param info: Runtime definition; reads ``package``, ``version`` and ``versions``.
        :param progress_callback: Optional progress callback.
        :return: True on success, False on any failure (failures are logged).
        """
        import hashlib

        package_name = info.get("package", runtime_id).split("[")[0]  # Remove extras

        # Get correct version based on Python version if specified
        version = info.get("version")
        py_ver = f"{sys.version_info.major}.{sys.version_info.minor}"
        if "versions" in info:
            version = info["versions"].get(py_ver, version)
        py_tag = f"cp{sys.version_info.major}{sys.version_info.minor}"

        try:
            # 1. Fetch metadata from PyPI
            url = f"https://pypi.org/pypi/{package_name}/json"
            if version:
                url = f"https://pypi.org/pypi/{package_name}/{version}/json"

            resp = requests.get(url, timeout=10)
            resp.raise_for_status()

            # 2. Find best matching wheel
            wheel = self._select_wheel(resp.json().get("urls", []), py_tag)
            if wheel is None or not wheel.get("url"):
                log.error(f"Could not find compatible wheel for {package_name} on {py_tag} win_amd64")
                return False

            filename = wheel["filename"]
            file_size = wheel.get("size", 0)
            expected_sha256 = (wheel.get("digests") or {}).get("sha256")

            # 3. Download the wheel
            if progress_callback:
                progress_callback(filename, 0, file_size, 0.0)

            content = io.BytesIO()
            digest = hashlib.sha256()
            downloaded = 0
            with requests.get(wheel["url"], stream=True, timeout=30) as download_resp:
                download_resp.raise_for_status()
                for chunk in download_resp.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    content.write(chunk)
                    digest.update(chunk)
                    downloaded += len(chunk)
                    if progress_callback:
                        # Clamp: a wrong advertised size must not push the
                        # percentage past the 0-100 range callers expect.
                        pct = min(100.0, downloaded / file_size * 100) if file_size else 0.0
                        progress_callback(filename, downloaded, file_size, pct)

            # The archive's contents are imported and executed later, so refuse
            # anything that does not match the digest published in the metadata.
            if expected_sha256 and digest.hexdigest() != expected_sha256.lower():
                log.error(f"SHA-256 mismatch for {filename}; refusing to install {runtime_id}")
                return False

            # 4. Unzip into libs
            content.seek(0)
            with zipfile.ZipFile(content) as zf:
                zf.extractall(LIBS_DIR)

            # Special post-install for miniqinference to move the exe if needed
            if runtime_id == "miniqinference":
                self._fix_miniqinference_paths()

            return True

        except Exception:
            log.exception(f"Failed to install {runtime_id} from PyPI")
            return False

    def _fix_miniqinference_paths(self):
        """Copy ``miniqwen-cli.exe`` to ``libs/bin`` wherever it was extracted.

        The first occurrence found below ``LIBS_DIR`` is used; copy failures
        are logged, not raised.
        """
        import shutil

        dest_dir = os.path.join(LIBS_DIR, "bin")
        dest = os.path.join(dest_dir, "miniqwen-cli.exe")
        for root, _dirs, files in os.walk(LIBS_DIR):
            if "miniqwen-cli.exe" not in files:
                continue
            src = os.path.join(root, "miniqwen-cli.exe")
            os.makedirs(dest_dir, exist_ok=True)
            if src != dest:
                try:
                    shutil.copy2(src, dest)
                except Exception:
                    log.exception("Failed to move miniqwen-cli.exe")
            break
Would you like to download them now?"), + _("Download Dependencies"), + wx.YES_NO | wx.ICON_QUESTION + ) == wx.YES: + progress = wx.ProgressDialog( + _("Downloading Dependencies"), + _("Preparing..."), + maximum=100, + parent=gui.mainFrame, + style=wx.PD_AUTO_HIDE | wx.PD_CAN_ABORT | wx.PD_ELAPSED_TIME | wx.PD_REMAINING_TIME + ) + + def download_worker(): + try: + for runtime_id in missing: + def cb(file, down, total, pct): + wx.CallAfter(progress.Update, int(pct), _("Downloading {file}...").format(file=file)) + + if not dm.download_and_install(runtime_id, progress_callback=cb): + raise Exception(f"Failed to install {runtime_id}") + + wx.CallAfter(progress.Destroy) + # Retry loading model + self.loadModelInBackground(localModelDirPath) + except Exception as e: + log.exception("Dependency download failed") + wx.CallAfter(progress.Destroy) + wx.CallAfter(ui.message, _("Dependency download failed: {error}").format(error=e)) + + threading.Thread(target=download_worker, daemon=True).start() + else: + ui.message(_("Model cannot be loaded without dependencies.")) + + wx.CallAfter(start_download) + return encoderPath = os.path.join(localModelDirPath, "onnx", "encoder_model_quantized.onnx") decoderPath = os.path.join(localModelDirPath, "onnx", "decoder_model_merged_quantized.onnx") - configPath = os.path.join(localModelDirPath, "config.json") try: from . 
import modelConfig diff --git a/addon/globalPlugins/CaptionLocal/models.json b/addon/globalPlugins/CaptionLocal/models.json index 6a77799..f72a143 100644 --- a/addon/globalPlugins/CaptionLocal/models.json +++ b/addon/globalPlugins/CaptionLocal/models.json @@ -1,9 +1,25 @@ { - "version": "1.0", + "version": "1.1", + "runtimes": { + "onnxruntime": { + "type": "pypi", + "package": "onnxruntime", + "versions": { + "3.11": "1.19.2", + "3.13": "1.20.1" + } + }, + "miniqinference": { + "type": "pypi", + "package": "miniqinference[cli]", + "version": "0.1.2" + } + }, "models": [ { "id": "onnx-community/Qwen3.5-0.8B-ONNX", "name": "Qwen 3.5-0.8B (local cli)", + "runtime_dependencies": ["miniqinference"], "resolvePath": "/resolve/main", "files": [ "config.json", @@ -22,6 +38,7 @@ { "id": "onnx-community/Qwen3.5-2B-ONNX", "name": "Qwen 3.5-2B (local cli)", + "runtime_dependencies": ["miniqinference"], "resolvePath": "/resolve/main", "files": [ "config.json", @@ -40,6 +57,7 @@ { "id": "Xenova/vit-gpt2-image-captioning", "name": "Vit-GPT2 Image Captioning (Xenova)", + "runtime_dependencies": ["onnxruntime"], "resolvePath": "/resolve/main", "files": [ "onnx/encoder_model_quantized.onnx", @@ -52,12 +70,21 @@ { "id": "Mozilla/distilvit", "name": "DistilViT (Mozilla)", + "runtime_dependencies": ["onnxruntime"], "resolvePath": "/resolve/main", "files": [ "onnx/model_quantized.onnx", "config.json", "preprocessor_config.json" ] + }, + { + "id": "custom/endpoint", + "name": "Custom OpenAI-compatible API Endpoint", + "resolvePath": "", + "files": [ + "config.json" + ] } ] } diff --git a/build.bat b/build.bat index 080a7d3..7c461f6 100644 --- a/build.bat +++ b/build.bat @@ -1,7 +1,6 @@ pip install --upgrade pip wheel pip install -r requirements.txt pip install -r requirements-libs.txt --target ./addon/globalPlugins/CaptionLocal/libs --platform win_amd64 --only-binary=:all: --no-binary=:none: --upgrade -pip install miniqinference[cli] --target ./addon/globalPlugins/CaptionLocal/libs 
--upgrade set SKIP="no-commit-to-branch" pre-commit run --all diff --git a/requirements-libs.txt b/requirements-libs.txt index 52f195d..837d1ba 100644 --- a/requirements-libs.txt +++ b/requirements-libs.txt @@ -1,4 +1,2 @@ -onnxruntime == 1.19.2 ; python_version < "3.13" -onnxruntime == 1.20.1 ; python_version >= "3.13" -pillow == 11.1.0 -requests == 2.33.1 +# pillow == 11.1.0 +# requests == 2.33.1