tianzeshi-study · tianzeshi-study · May 23, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/.github/workflows/build_addon.yml b/.github/workflows/build_addon.yml
@@ -14,8 +14,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: "3.11"
-            architecture: "x86"
           - python-version: "3.13"
             architecture: "x64"
     steps:
@@ -30,20 +28,17 @@ jobs:
         run: |
           pip install --upgrade pip wheel
           pip install -r requirements.txt
-          if ("${{ matrix.python-version }}" -eq "3.11") {
-            pip install -r requirements-libs.txt --target ./addon/globalPlugins/CaptionLocal/libs --platform win32 --only-binary=:all: --no-binary=:none:
-            pip install miniqinference[cli]==0.1.2 --target ./addon/globalPlugins/CaptionLocal/libs --upgrade --platform win_amd64 --only-binary=:all: --no-binary=:none: 
-          } else {
-            pip install -r requirements-libs.txt --target ./addon/globalPlugins/CaptionLocal/libs 
-            pip install miniqinference[cli]==0.1.2 --target ./addon/globalPlugins/CaptionLocal/libs --upgrade --platform win_amd64 --only-binary=:all: --no-binary=:none: 
-          }
+          pip install -r requirements-libs.txt --target ./addon/globalPlugins/CaptionLocal/libs --platform win_amd64 --only-binary=:all: --no-binary=:none:
+          pip install miniqinference[cli]==0.1.2 --target ./addon/globalPlugins/CaptionLocal/libs --upgrade --platform win_amd64 --only-binary=:all: --no-binary=:none: 
 
       - name: Code checks
         run: |
           $env:SKIP="no-commit-to-branch"
           pre-commit run --all
       - name: building addon
-        run: scons version="${{ github.ref_name }}-${{ matrix.python-version }}-${{ matrix.architecture }}"
+        run: |
+          $REF_NAME = "${{ github.ref_name }}".Replace('/', '-')
+          scons version="${REF_NAME}-${{ matrix.python-version }}-${{ matrix.architecture }}"
         shell: pwsh
       - uses: actions/upload-artifact@v4
         with:

diff --git a/addon/globalPlugins/CaptionLocal/__init__.py b/addon/globalPlugins/CaptionLocal/__init__.py
@@ -21,6 +21,7 @@
 import config
 import scriptHandler
 import globalPluginHandler
+from contentRecog import recogUi
 
 # Add libs directory to path
 _here = os.path.dirname(__file__)
@@ -45,7 +46,8 @@
 CONFSPEC = {
 	"modelsDir": f"string(default={_modelsDir})",
 	"currentModel": "string(default=Xenova/vit-gpt2-image-captioning)",
-	"loadModelWhenInit": "boolean(default=true)"
+	"loadModelWhenInit": "boolean(default=true)",
+	"copyToClipboard": "boolean(default=false)"
 }
 
 config.conf.spec['captionLocal'] = CONFSPEC
@@ -95,7 +97,7 @@ def terminate(self) -> None:
 	)
 	def script_runCaption(self, gesture) -> None:
 		"""Script to run image captioning on the current navigator object."""
-		self.imageDescriber.runCaption(gesture)
+		recogUi.recognizeNavigatorObject(self.imageDescriber)
 
 	@scriptHandler.script(
 		# Translators: Description for the release model script

diff --git a/addon/globalPlugins/CaptionLocal/captioner/base.py b/addon/globalPlugins/CaptionLocal/captioner/base.py
@@ -7,19 +7,28 @@
 from abc import ABC, abstractmethod
 
 
+from typing import Callable
+
+
 class ImageCaptioner(ABC):
 	"""Abstract interface for image caption generation.
 
 	Supports generate caption for image
 	"""
 
 	@abstractmethod
-	def generateCaption(self, image: str | bytes, maxLength: int | None = None) -> str:
+	def generateCaption(
+		self,
+		image: str | bytes,
+		maxLength: int | None = None,
+		onToken: Callable[[str], None] | None = None,
+	) -> str:
 		"""
 		Generate a caption for the given image.
 
 		:param image: Image file path or binary data.
 		:param maxLength: Optional maximum length for the generated caption.
+		:param onToken: Optional callback for each generated token (for streaming).
 		:return: The generated image caption as a string.
 		"""
 		pass
diff --git a/addon/globalPlugins/CaptionLocal/captioner/qwen.py b/addon/globalPlugins/CaptionLocal/captioner/qwen.py
@@ -8,6 +8,7 @@
 import subprocess
 import tempfile
 import io
+from typing import Callable
 from PIL import Image
 from logHandler import log
 from .base import ImageCaptioner
@@ -57,11 +58,13 @@ def generateCaption(
 		self,
 		image: str | bytes,
 		maxLength: int | None = None,
+		onToken: Callable[[str], None] | None = None,
 	) -> str:
 		"""Generate image caption using CLI.
 
 		:param image: Image file path or binary data.
 		:param maxLength: Optional maximum tokens.
+		:param onToken: Optional callback invoked with each character of CLI output as it is read.
 		"""
 		temp_file_path = None
 		image_path = None
@@ -123,17 +126,36 @@ def generateCaption(
 			startupinfo = subprocess.STARTUPINFO()
 			startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
 
-			result = subprocess.check_output(
+			# Use Popen to allow streaming output
+			process = subprocess.Popen(
 				cmd,
+				stdout=subprocess.PIPE,
 				stderr=subprocess.STDOUT,
 				universal_newlines=True,
 				encoding="utf-8",
 				startupinfo=startupinfo,
+				bufsize=1, # Line buffered
 			)
-			return result.strip()
-		except subprocess.CalledProcessError as e:
-			log.error(f"miniqwen-cli failed with exit code {e.returncode}: {e.output}")
-			raise Exception(f"CLI error: {e.output}")
+
+			full_text = []
+			# Read one character at a time so output can be forwarded to onToken as it arrives
+			while True:
+				char = process.stdout.read(1)
+				if not char and process.poll() is not None:
+					break
+				if char:
+					full_text.append(char)
+					if onToken:
+						onToken(char)
+
+			res_text = "".join(full_text).strip()
+			if process.returncode != 0:
+				log.error(f"miniqwen-cli failed with exit code {process.returncode}")
+				# stderr is merged into stdout, so any captured text is likely the error message
+				if res_text:
+					raise Exception(f"CLI error: {res_text}")
+				raise Exception(f"CLI error with exit code {process.returncode}")
+			return res_text
 		except Exception as e:
 			log.exception("Error running miniqwen-cli")
 			raise

diff --git a/addon/globalPlugins/CaptionLocal/captioner/vitGpt2.py b/addon/globalPlugins/CaptionLocal/captioner/vitGpt2.py
@@ -9,6 +9,7 @@
 import re
 import io
 from functools import lru_cache
+from typing import Callable
 
 import numpy as np
 from PIL import Image
@@ -299,12 +300,14 @@ def _generateWithGreedy(
 		self,
 		encoderHiddenStates: np.ndarray,
 		maxLength: int | None = None,
+		onToken: Callable[[str], None] | None = None,
 	) -> str:
 		"""Generate text using greedy search.
 
 
 		:param encoderHiddenStates: Encoder hidden states.
 		:param maxLength: Maximum generation length.
+		:param onToken: Optional callback for each generated token.
 		:return: Generated text string.
 		"""
 		if maxLength is None:
@@ -341,6 +344,12 @@ def _generateWithGreedy(
 				break
 
 			generatedTokens.append(nextTokenId)
+
+			if onToken:
+				# Decode only the last token
+				token_text = self._decodeTokens([nextTokenId])
+				if token_text:
+					onToken(token_text)
 
 			# Update past_key_values from outputs
 			if len(decoderOutputs) > 1:
@@ -364,11 +373,13 @@ def generateCaption(
 		self,
 		image: str | bytes,
 		maxLength: int | None = None,
+		onToken: Callable[[str], None] | None = None,
 	) -> str:
 		"""Generate image caption.
 
 		:param image: Image file path or binary data.
 		:param maxLength: Maximum generation length.
+		:param onToken: Optional callback for each generated token.
 		:return: Generated image caption.
 		"""
 		# Preprocess image
@@ -378,6 +389,6 @@ def generateCaption(
 		encoderHiddenStates = self._encodeImage(imageArray)
 
 		# Generate text
-		caption = self._generateWithGreedy(encoderHiddenStates, maxLength)
+		caption = self._generateWithGreedy(encoderHiddenStates, maxLength, onToken=onToken)
 
 		return caption