Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions .github/workflows/build_addon.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ jobs:
strategy:
matrix:
include:
- python-version: "3.11"
architecture: "x86"
- python-version: "3.13"
architecture: "x64"
steps:
Expand All @@ -30,20 +28,17 @@ jobs:
run: |
pip install --upgrade pip wheel
pip install -r requirements.txt
if ("${{ matrix.python-version }}" -eq "3.11") {
pip install -r requirements-libs.txt --target ./addon/globalPlugins/CaptionLocal/libs --platform win32 --only-binary=:all: --no-binary=:none:
pip install miniqinference[cli]==0.1.2 --target ./addon/globalPlugins/CaptionLocal/libs --upgrade --platform win_amd64 --only-binary=:all: --no-binary=:none:
} else {
pip install -r requirements-libs.txt --target ./addon/globalPlugins/CaptionLocal/libs
pip install miniqinference[cli]==0.1.2 --target ./addon/globalPlugins/CaptionLocal/libs --upgrade --platform win_amd64 --only-binary=:all: --no-binary=:none:
}
pip install -r requirements-libs.txt --target ./addon/globalPlugins/CaptionLocal/libs --platform win_amd64 --only-binary=:all: --no-binary=:none:
pip install miniqinference[cli]==0.1.2 --target ./addon/globalPlugins/CaptionLocal/libs --upgrade --platform win_amd64 --only-binary=:all: --no-binary=:none:

- name: Code checks
run: |
$env:SKIP="no-commit-to-branch"
pre-commit run --all
- name: building addon
run: scons version="${{ github.ref_name }}-${{ matrix.python-version }}-${{ matrix.architecture }}"
run: |
$REF_NAME = "${{ github.ref_name }}".Replace('/', '-')
scons version="${REF_NAME}-${{ matrix.python-version }}-${{ matrix.architecture }}"
shell: pwsh
- uses: actions/upload-artifact@v4
with:
Expand Down
6 changes: 4 additions & 2 deletions addon/globalPlugins/CaptionLocal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import config
import scriptHandler
import globalPluginHandler
from contentRecog import recogUi

# Add libs directory to path
_here = os.path.dirname(__file__)
Expand All @@ -45,7 +46,8 @@
# Configuration spec for the add-on's "captionLocal" section. Each value is a
# spec string naming the option's type and its default.
CONFSPEC = {
    "modelsDir": f"string(default={_modelsDir})",
    "currentModel": "string(default=Xenova/vit-gpt2-image-captioning)",
    "loadModelWhenInit": "boolean(default=true)",
    "copyToClipboard": "boolean(default=false)",
}

# Register the spec under the "captionLocal" key of config.conf.spec.
config.conf.spec['captionLocal'] = CONFSPEC
Expand Down Expand Up @@ -95,7 +97,7 @@ def terminate(self) -> None:
)
def script_runCaption(self, gesture) -> None:
    """Script to run image captioning on the current navigator object.

    Delegates to ``recogUi.recognizeNavigatorObject``, passing this plugin's
    ``imageDescriber``.

    :param gesture: The input gesture that triggered this script; it is not
        used by this method.
    """
    recogUi.recognizeNavigatorObject(self.imageDescriber)

@scriptHandler.script(
# Translators: Description for the release model script
Expand Down
11 changes: 10 additions & 1 deletion addon/globalPlugins/CaptionLocal/captioner/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,28 @@
from abc import ABC, abstractmethod


from typing import Callable


class ImageCaptioner(ABC):
"""Abstract interface for image caption generation.

Supports generate caption for image
"""

@abstractmethod
def generateCaption(self, image: str | bytes, maxLength: int | None = None) -> str:
def generateCaption(
self,
image: str | bytes,
maxLength: int | None = None,
onToken: Callable[[str], None] | None = None,
) -> str:
"""
Generate a caption for the given image.

:param image: Image file path or binary data.
:param maxLength: Optional maximum length for the generated caption.
:param onToken: Optional callback for each generated token (for streaming).
:return: The generated image caption as a string.
"""
pass
32 changes: 27 additions & 5 deletions addon/globalPlugins/CaptionLocal/captioner/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import subprocess
import tempfile
import io
from typing import Callable
from PIL import Image
from logHandler import log
from .base import ImageCaptioner
Expand Down Expand Up @@ -57,11 +58,13 @@ def generateCaption(
self,
image: str | bytes,
maxLength: int | None = None,
onToken: Callable[[str], None] | None = None,
) -> str:
"""Generate image caption using CLI.

:param image: Image file path or binary data.
:param maxLength: Optional maximum tokens.
:param onToken: Optional callback for each generated token.
"""
temp_file_path = None
image_path = None
Expand Down Expand Up @@ -123,17 +126,36 @@ def generateCaption(
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

result = subprocess.check_output(
# Use Popen to allow streaming output
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
encoding="utf-8",
startupinfo=startupinfo,
bufsize=1, # Line buffered
)
return result.strip()
except subprocess.CalledProcessError as e:
log.error(f"miniqwen-cli failed with exit code {e.returncode}: {e.output}")
raise Exception(f"CLI error: {e.output}")

full_text = []
# Read output character by character (or chunk by chunk)
while True:
char = process.stdout.read(1)
if not char and process.poll() is not None:
break
if char:
full_text.append(char)
if onToken:
onToken(char)

res_text = "".join(full_text).strip()
if process.returncode != 0:
log.error(f"miniqwen-cli failed with exit code {process.returncode}")
# If we have text, it might be the error message
if res_text:
raise Exception(f"CLI error: {res_text}")
raise Exception(f"CLI error with exit code {process.returncode}")
return res_text
except Exception as e:
log.exception("Error running miniqwen-cli")
raise
Expand Down
13 changes: 12 additions & 1 deletion addon/globalPlugins/CaptionLocal/captioner/vitGpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import re
import io
from functools import lru_cache
from typing import Callable

import numpy as np
from PIL import Image
Expand Down Expand Up @@ -299,12 +300,14 @@ def _generateWithGreedy(
self,
encoderHiddenStates: np.ndarray,
maxLength: int | None = None,
onToken: Callable[[str], None] | None = None,
) -> str:
"""Generate text using greedy search.


:param encoderHiddenStates: Encoder hidden states.
:param maxLength: Maximum generation length.
:param onToken: Optional callback for each generated token.
:return: Generated text string.
"""
if maxLength is None:
Expand Down Expand Up @@ -341,6 +344,12 @@ def _generateWithGreedy(
break

generatedTokens.append(nextTokenId)

if onToken:
# Decode only the last token
token_text = self._decodeTokens([nextTokenId])
if token_text:
onToken(token_text)

# Update past_key_values from outputs
if len(decoderOutputs) > 1:
Expand All @@ -364,11 +373,13 @@ def generateCaption(
self,
image: str | bytes,
maxLength: int | None = None,
onToken: Callable[[str], None] | None = None,
) -> str:
"""Generate image caption.

:param image: Image file path or binary data.
:param maxLength: Maximum generation length.
:param onToken: Optional callback for each generated token.
:return: Generated image caption.
"""
# Preprocess image
Expand All @@ -378,6 +389,6 @@ def generateCaption(
encoderHiddenStates = self._encodeImage(imageArray)

# Generate text
caption = self._generateWithGreedy(encoderHiddenStates, maxLength)
caption = self._generateWithGreedy(encoderHiddenStates, maxLength, onToken=onToken)

return caption
Loading
Loading