diff --git a/assets/ltx2/flf2v.webm b/assets/ltx2/flf2v.webm
new file mode 100644
index 000000000..1b22a437c
Binary files /dev/null and b/assets/ltx2/flf2v.webm differ
diff --git a/docs/ltx2.md b/docs/ltx2.md
index 4861cf0e9..b8ec4cf98 100644
--- a/docs/ltx2.md
+++ b/docs/ltx2.md
@@ -38,4 +38,16 @@
src="../assets/ltx2/i2v.webm"
controls
muted
+ style="max-width: 100%; height: auto;">
+
+### LTX-2.3 dev FLF2V
+
+```
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png -o flf2v.webm
+```
+
+
\ No newline at end of file
diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp
index e5aa5073d..9e4e444ec 100644
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@@ -40,6 +40,7 @@ struct DiffusionParams {
float vace_strength = 1.f;
int audio_length = 0;
float frame_rate = 24.f;
+ const sd::Tensor* video_positions = nullptr;
const std::vector* skip_layers = nullptr;
};
@@ -766,7 +767,8 @@ struct LTXAVModel : public DiffusionModel {
tensor_or_empty(diffusion_params.audio_x),
tensor_or_empty(diffusion_params.audio_timesteps),
diffusion_params.audio_length,
- diffusion_params.frame_rate);
+ diffusion_params.frame_rate,
+ tensor_or_empty(diffusion_params.video_positions));
}
};
diff --git a/src/ltxv.hpp b/src/ltxv.hpp
index a4aeea6f5..fa6c0601d 100644
--- a/src/ltxv.hpp
+++ b/src/ltxv.hpp
@@ -243,6 +243,56 @@ namespace LTXV {
return build_rope_matrix_from_frequencies(freqs, dim);
}
+ __STATIC_INLINE__ std::vector build_video_rope_matrix_from_positions(const sd::Tensor& positions,  // Builds the video RoPE matrix from explicit per-token [start, end] coordinates on 3 axes.
+ int dim,  // total width; half of it (dim / 2) is filled with frequencies per token
+ int num_heads,  // when > 1 the frequencies are split per head before the matrix is built
+ float theta,  // forwarded to generate_freq_grid
+ const std::vector& max_pos,  // per-axis divisor used to normalize coordinates (must have 3 entries)
+ bool use_middle_indices_grid) {  // true: use the midpoint of [start, end]; false: use start
+ GGML_ASSERT(max_pos.size() == 3);
+ GGML_ASSERT(dim % num_heads == 0);
+ GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4);
+ GGML_ASSERT(positions.shape()[0] == 2);  // index 0 = interval start, index 1 = interval end
+ GGML_ASSERT(positions.shape()[1] == 3);  // exactly three coordinate axes
+ if (positions.dim() == 4) {
+ GGML_ASSERT(positions.shape()[3] == 1);  // an optional 4th dimension must be a singleton
+ }
+ // One output row per entry along positions.shape()[2].
+ const int64_t tokens = positions.shape()[2];
+ const std::vector indices = generate_freq_grid(theta, 3, dim);
+ const int half_dim = dim / 2;
+ const int pad_size = half_dim - static_cast(indices.size()) * 3;  // slots not covered by 3 * indices.size()
+ std::vector> freqs(static_cast(tokens), std::vector(half_dim, 0.f));
+ // NOTE(review): nothing asserts pad_size >= 0; a negative value would let out_idx run past half_dim below.
+ for (int64_t token = 0; token < tokens; token++) {
+ int out_idx = 0;
+ for (int i = 0; i < pad_size; i++) {  // leading pad slots stay 0 (freqs is already zero-initialized)
+ freqs[token][out_idx++] = 0.f;
+ }
+ // Reduce each axis interval to one coordinate and normalize it by max_pos[axis].
+ float coords[3];
+ for (int axis = 0; axis < 3; axis++) {
+ float start = positions.dim() == 4 ? positions.index(0, axis, token, 0)
+ : positions.index(0, axis, token);
+ float end = positions.dim() == 4 ? positions.index(1, axis, token, 0)
+ : positions.index(1, axis, token);
+ float coord = use_middle_indices_grid ? 0.5f * (start + end) : start;
+ coords[axis] = coord / static_cast(max_pos[axis]);
+ }
+ // Axes are interleaved per frequency: (idx0, axis0), (idx0, axis1), (idx0, axis2), (idx1, axis0), ...
+ for (float index : indices) {
+ for (int axis = 0; axis < 3; axis++) {
+ freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f);  // a coordinate in [0, 1] is rescaled to [-1, 1]
+ }
+ }
+ }
+ // Multi-head: split the per-token frequencies across heads and build with the per-head width.
+ if (num_heads > 1) {
+ return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
+ }
+ return build_rope_matrix_from_frequencies(freqs, dim);
+ }
+
__STATIC_INLINE__ std::vector build_1d_rope_matrix(int64_t seq_len,
int dim,
int num_heads = 1,
@@ -848,6 +898,31 @@ namespace LTXV {
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast(max_pos_t));
}
+ __STATIC_INLINE__ std::vector build_video_temporal_rope_matrix_from_positions(const sd::Tensor& positions,  // Builds a 1D RoPE matrix from axis 0 of explicit per-token [start, end] positions.
+ int dim,  // forwarded unchanged to build_1d_rope_matrix_from_coords
+ int num_heads,  // forwarded unchanged to build_1d_rope_matrix_from_coords
+ float theta,  // forwarded unchanged to build_1d_rope_matrix_from_coords
+ int max_pos_t,  // forwarded (after a cast) as the last argument of build_1d_rope_matrix_from_coords
+ bool use_middle_indices_grid) {  // true: use the midpoint of [start, end]; false: use start
+ GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4);
+ GGML_ASSERT(positions.shape()[0] == 2);  // index 0 = interval start, index 1 = interval end
+ GGML_ASSERT(positions.shape()[1] >= 1);  // only axis 0 is read, so one axis is enough
+ if (positions.dim() == 4) {
+ GGML_ASSERT(positions.shape()[3] == 1);  // an optional 4th dimension must be a singleton
+ }
+ // Collect one scalar coordinate per token (positions.shape()[2] entries).
+ std::vector coords;
+ coords.reserve(static_cast(positions.shape()[2]));
+ for (int64_t token = 0; token < positions.shape()[2]; token++) {
+ float start = positions.dim() == 4 ? positions.index(0, 0, token, 0)
+ : positions.index(0, 0, token);
+ float end = positions.dim() == 4 ? positions.index(1, 0, token, 0)
+ : positions.index(1, 0, token);
+ coords.push_back(use_middle_indices_grid ? 0.5f * (start + end) : start);  // coordinates are passed on un-normalized
+ }
+ return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast(max_pos_t));
+ }
+
__STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
int audio_latent_downsample_factor = 4,
int hop_length = 160,
@@ -1664,7 +1739,8 @@ namespace LTXV {
const sd::Tensor& audio_x_tensor = {},
const sd::Tensor& audio_timesteps_tensor = {},
int audio_length = 0,
- float frame_rate = 24.f) {
+ float frame_rate = 24.f,
+ const sd::Tensor& video_positions_tensor = {}) {
auto split_inputs = split_av_latents(x_tensor, audio_length);
vx_input_cache = split_inputs.first;
if (!audio_x_tensor.empty()) {
@@ -1681,19 +1757,31 @@ namespace LTXV {
ggml_cgraph* gf = new_graph_custom(LTXAV_GRAPH_SIZE);
- float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f;
- video_pe_vec = build_video_rope_matrix(vx->ne[0],
- vx->ne[1],
- vx->ne[2],
- static_cast(params.hidden_size),
- static_cast(params.num_attention_heads),
- video_frame_rate,
- params.positional_embedding_theta,
- params.positional_embedding_max_pos,
- params.vae_scale_factors,
- params.causal_temporal_positioning,
- params.use_middle_indices_grid);
- auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
+ float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f;
+ int64_t video_token_count = vx->ne[0] * vx->ne[1] * vx->ne[2];
+ bool has_video_positions = !video_positions_tensor.empty();
+ if (has_video_positions) {
+ GGML_ASSERT(video_positions_tensor.shape()[2] == video_token_count);
+ video_pe_vec = build_video_rope_matrix_from_positions(video_positions_tensor,
+ static_cast(params.hidden_size),
+ static_cast(params.num_attention_heads),
+ params.positional_embedding_theta,
+ params.positional_embedding_max_pos,
+ params.use_middle_indices_grid);
+ } else {
+ video_pe_vec = build_video_rope_matrix(vx->ne[0],
+ vx->ne[1],
+ vx->ne[2],
+ static_cast(params.hidden_size),
+ static_cast(params.num_attention_heads),
+ video_frame_rate,
+ params.positional_embedding_theta,
+ params.positional_embedding_max_pos,
+ params.vae_scale_factors,
+ params.causal_temporal_positioning,
+ params.use_middle_indices_grid);
+ }
+ auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, video_token_count * params.num_attention_heads);
ggml_set_name(video_pe, "ltxav_video_pe");
set_backend_tensor_data(video_pe, video_pe_vec.data());
@@ -1712,18 +1800,27 @@ namespace LTXV {
set_backend_tensor_data(audio_pe, audio_pe_vec.data());
int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
- video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
- vx->ne[1],
- vx->ne[2],
- static_cast(params.audio_cross_attention_dim),
- static_cast(params.audio_num_attention_heads),
- video_frame_rate,
- params.positional_embedding_theta,
- temporal_max_pos,
- std::get<0>(params.vae_scale_factors),
- params.causal_temporal_positioning,
- true);
- video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
+ if (has_video_positions) {
+ video_cross_pe_vec = build_video_temporal_rope_matrix_from_positions(video_positions_tensor,
+ static_cast(params.audio_cross_attention_dim),
+ static_cast(params.audio_num_attention_heads),
+ params.positional_embedding_theta,
+ temporal_max_pos,
+ true);
+ } else {
+ video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
+ vx->ne[1],
+ vx->ne[2],
+ static_cast(params.audio_cross_attention_dim),
+ static_cast(params.audio_num_attention_heads),
+ video_frame_rate,
+ params.positional_embedding_theta,
+ temporal_max_pos,
+ std::get<0>(params.vae_scale_factors),
+ params.causal_temporal_positioning,
+ true);
+ }
+ video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, video_token_count * params.audio_num_attention_heads);
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());
@@ -1806,9 +1903,10 @@ namespace LTXV {
const sd::Tensor& audio_x = {},
const sd::Tensor& audio_timesteps = {},
int audio_length = 0,
- float frame_rate = 24.f) {
+ float frame_rate = 24.f,
+ const sd::Tensor& video_positions = {}) {
auto get_graph = [&]() -> ggml_cgraph* {
- return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate);
+ return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate, video_positions);
};
auto out = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim());
return out;
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index b2558ab66..b51774bec 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -1827,7 +1827,8 @@ class StableDiffusionGGML {
float vace_strength,
int audio_length,
float frame_rate,
- const sd_cache_params_t* cache_params) {
+ const sd_cache_params_t* cache_params,
+ const sd::Tensor& video_positions = {}) {
std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
float cfg_scale = guidance.txt_cfg;
float img_cfg_scale = guidance.img_cfg;
@@ -1933,6 +1934,7 @@ class StableDiffusionGGML {
diffusion_params.vace_strength = vace_strength;
diffusion_params.audio_length = audio_length;
diffusion_params.frame_rate = frame_rate;
+ diffusion_params.video_positions = video_positions.empty() ? nullptr : &video_positions;
diffusion_params.skip_layers = nullptr;
compute_sample_controls(control_image,
@@ -3216,16 +3218,99 @@ struct ImageGenerationLatents {
sd::Tensor concat_latent;
sd::Tensor uncond_concat_latent;
sd::Tensor audio_latent;
+ sd::Tensor video_positions;
sd::Tensor control_image;
std::vector> ref_images;
std::vector> ref_latents;
sd::Tensor denoise_mask;
sd::Tensor clip_vision_output;
sd::Tensor vace_context;
- int64_t ref_image_num = 0;
- int audio_length = 0;
+ int64_t ref_image_num = 0;
+ int64_t video_conditioning_frame_count = 0;
+ int64_t video_target_frame_count = 0;
+ int audio_length = 0;
};
+static float ltxv_latent_corner_to_pixel_frame(int64_t corner_index,  // Maps a latent-frame boundary index to a pixel-frame index.
+ int temporal_scale,  // multiplier applied to corner_index
+ bool causal_temporal_positioning) {  // true: apply the shift-and-clamp below
+ float pixel_t = static_cast(corner_index * temporal_scale);
+ if (causal_temporal_positioning) {
+ pixel_t = std::max(0.f, pixel_t + 1.f - static_cast(temporal_scale));  // shift back by (temporal_scale - 1), clamp at 0: corner 0 -> 0, corner 1 -> 1
+ }
+ return pixel_t;
+}
+
+static void set_ltxv_video_position(sd::Tensor* positions,  // Writes one token's [start, end] intervals for the t, h and w axes into *positions.
+ int64_t token,  // index along the third dimension of *positions
+ float t_start,
+ float t_end,
+ float h_start,
+ float h_end,
+ float w_start,
+ float w_end) {
+ positions->index(0, 0, token, 0) = t_start;  // first index: 0 = start, 1 = end; second index: 0 = t, 1 = h, 2 = w
+ positions->index(1, 0, token, 0) = t_end;
+ positions->index(0, 1, token, 0) = h_start;
+ positions->index(1, 1, token, 0) = h_end;
+ positions->index(0, 2, token, 0) = w_start;
+ positions->index(1, 2, token, 0) = w_end;  // positions is dereferenced unconditionally; callers must pass a non-null 4D tensor
+}
+
+static sd::Tensor build_ltxv_video_positions(int64_t width,  // Builds a {2, 3, tokens, 1} position tensor: target-frame tokens first, then keyframe tokens.
+ int64_t height,  // width x height is the number of tokens emitted per frame
+ int64_t target_latent_frames,  // number of frames in the first (target) section
+ int64_t keyframe_latent_frames,  // number of frames in the appended keyframe section
+ int keyframe_frame_idx,  // frame index at which the keyframe section starts (before dividing by fps)
+ int keyframe_pixel_frames,  // when exactly 1, each keyframe interval is one frame long
+ int fps,  // divisor turning frame indices into time values
+ int spatial_scale,  // each h/w cell spans [i * spatial_scale, (i + 1) * spatial_scale]
+ int temporal_scale,  // frames per latent step
+ bool causal_temporal_positioning) {  // forwarded to ltxv_latent_corner_to_pixel_frame for target frames only
+ GGML_ASSERT(width > 0 && height > 0 && target_latent_frames > 0);
+ GGML_ASSERT(keyframe_latent_frames > 0);
+ GGML_ASSERT(fps > 0);
+ // Tokens are laid out frame-major: t outermost, then h, then w.
+ int64_t total_tokens = width * height * (target_latent_frames + keyframe_latent_frames);
+ sd::Tensor positions({2, 3, total_tokens, 1});
+ int64_t token = 0;
+ // Section 1: target frames; time bounds come from the latent-corner mapping divided by fps.
+ for (int64_t t = 0; t < target_latent_frames; t++) {
+ float t_start = ltxv_latent_corner_to_pixel_frame(t, temporal_scale, causal_temporal_positioning) / static_cast(fps);
+ float t_end = ltxv_latent_corner_to_pixel_frame(t + 1, temporal_scale, causal_temporal_positioning) / static_cast(fps);
+ for (int64_t h = 0; h < height; h++) {
+ float h_start = static_cast(h * spatial_scale);
+ float h_end = static_cast((h + 1) * spatial_scale);
+ for (int64_t w = 0; w < width; w++) {
+ float w_start = static_cast(w * spatial_scale);
+ float w_end = static_cast((w + 1) * spatial_scale);
+ set_ltxv_video_position(&positions, token++, t_start, t_end, h_start, h_end, w_start, w_end);
+ }
+ }
+ }
+ // Section 2: keyframes; time bounds are offset by keyframe_frame_idx and do not use the causal mapping.
+ for (int64_t t = 0; t < keyframe_latent_frames; t++) {
+ float t_start = static_cast(keyframe_frame_idx + t * temporal_scale);
+ float t_end = static_cast(keyframe_frame_idx + (t + 1) * temporal_scale);
+ if (keyframe_pixel_frames == 1) {
+ t_end = t_start + 1.f;  // a single-frame keyframe covers one frame rather than temporal_scale frames
+ }
+ t_start /= static_cast(fps);  // same frame-index / fps conversion as the target section
+ t_end /= static_cast(fps);
+ for (int64_t h = 0; h < height; h++) {
+ float h_start = static_cast(h * spatial_scale);
+ float h_end = static_cast((h + 1) * spatial_scale);
+ for (int64_t w = 0; w < width; w++) {
+ float w_start = static_cast(w * spatial_scale);
+ float w_end = static_cast((w + 1) * spatial_scale);
+ set_ltxv_video_position(&positions, token++, t_start, t_end, h_start, h_end, w_start, w_end);
+ }
+ }
+ }
+ // Both loops together advance token exactly total_tokens times, so every entry is written.
+ return positions;
+}
+
static sd::Tensor pack_ltxav_audio_and_video_latents(const sd::Tensor& video_latent,
const sd::Tensor& audio_latent) {
if (audio_latent.empty()) {
@@ -4136,33 +4221,27 @@ static std::optional prepare_video_generation_latents(sd
}
if (sd_version_is_ltxav(sd_ctx->sd->version)) {
- if (!end_image.empty() || sd_vid_gen_params->control_frames_size > 0) {
- LOG_ERROR("LTXAV currently supports txt2vid and init_image i2v only; end_image and control_frames are not implemented");
+ if (sd_vid_gen_params->control_frames_size > 0) {
+ LOG_ERROR("LTXAV control_frames are not implemented");
return std::nullopt;
}
- if (!start_image.empty()) {
+ if (!start_image.empty() || !end_image.empty()) {
if (sd_ctx->sd->vae_decode_only) {
- LOG_ERROR("LTXAV init_image i2v requires VAE encoder weights; create the context with vae_decode_only=false");
+ LOG_ERROR("LTXAV image conditioning requires VAE encoder weights; create the context with vae_decode_only=false");
return std::nullopt;
}
- LOG_INFO("IMG2VID");
-
- int64_t t1 = ggml_time_ms();
- auto init_img = start_image.reshape({start_image.shape()[0],
- start_image.shape()[1],
- 1,
- start_image.shape()[2],
- start_image.shape()[3]});
- auto init_image_latent = sd_ctx->sd->encode_first_stage(init_img);
- if (init_image_latent.empty()) {
- LOG_ERROR("failed to encode LTXAV init image");
- return std::nullopt;
+ if (!start_image.empty() && !end_image.empty()) {
+ LOG_INFO("FLF2V");
+ } else if (!start_image.empty()) {
+ LOG_INFO("IMG2VID");
+ } else {
+ LOG_INFO("END2VID");
}
+ int64_t t1 = ggml_time_ms();
latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true);
- sd::ops::slice_assign(&latents.init_latent, 2, 0, init_image_latent.shape()[2], init_image_latent);
float conditioning_strength = std::clamp(request->strength, 0.f, 1.f);
float conditioned_mask = 1.0f - conditioning_strength;
@@ -4172,7 +4251,94 @@ static std::optional prepare_video_generation_latents(sd
1,
1},
1.f);
- sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], conditioned_mask);
+
+ auto encode_ltxav_condition_image = [&](const sd::Tensor& image, const char* name) -> sd::Tensor {
+ auto condition_image = image.reshape({image.shape()[0],
+ image.shape()[1],
+ 1,
+ image.shape()[2],
+ image.shape()[3]});
+ auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image);
+ if (condition_latent.empty()) {
+ LOG_ERROR("failed to encode LTXAV %s image", name);
+ }
+ return condition_latent;
+ };
+
+ auto apply_video_condition_by_latent_index = [&](const sd::Tensor& condition_latent,
+ int64_t latent_idx,
+ const char* name) -> bool {
+ int64_t latent_frames = latents.init_latent.shape()[2];
+ int64_t condition_frames = condition_latent.shape()[2];
+ if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) {
+ LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64,
+ name,
+ latent_idx,
+ condition_frames,
+ latent_frames);
+ return false;
+ }
+
+ sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent);
+ sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask);
+ return true;
+ };
+
+ auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor& keyframes,
+ int frame_idx,
+ const char* name) -> bool {
+ int64_t keyframe_frames = keyframes.shape()[2];
+ if (keyframe_frames <= 0 || keyframes.shape()[0] != latents.init_latent.shape()[0] ||
+ keyframes.shape()[1] != latents.init_latent.shape()[1] ||
+ keyframes.shape()[3] != latents.init_latent.shape()[3]) {
+ LOG_ERROR("invalid LTXAV %s keyframe latent shape", name);
+ return false;
+ }
+
+ latents.video_target_frame_count = latents.init_latent.shape()[2];
+ latents.video_conditioning_frame_count = keyframe_frames;
+ latents.init_latent = sd::ops::concat(latents.init_latent, keyframes, 2);
+
+ auto keyframe_mask = sd::full({keyframes.shape()[0],
+ keyframes.shape()[1],
+ keyframes.shape()[2],
+ 1,
+ 1},
+ conditioned_mask);
+ latents.denoise_mask = sd::ops::concat(latents.denoise_mask, keyframe_mask, 2);
+ latents.video_positions = build_ltxv_video_positions(latents.init_latent.shape()[0],
+ latents.init_latent.shape()[1],
+ latents.video_target_frame_count,
+ keyframe_frames,
+ frame_idx,
+ 1,
+ request->fps,
+ request->vae_scale_factor,
+ 8,
+ true);
+ return true;
+ };
+
+ if (!start_image.empty()) {
+ auto start_image_latent = encode_ltxav_condition_image(start_image, "init");
+ if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) {
+ return std::nullopt;
+ }
+ }
+
+ if (!end_image.empty()) {
+ auto end_image_latent = encode_ltxav_condition_image(end_image, "end");
+ if (end_image_latent.empty()) {
+ return std::nullopt;
+ }
+
+ int frame_idx = request->frames - 1;
+ bool ok = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end")
+ : apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end");
+ if (!ok) {
+ return std::nullopt;
+ }
+ }
int64_t t2 = ggml_time_ms();
LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
@@ -4528,7 +4694,8 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
request.vace_strength,
latents.audio_length,
static_cast(request.fps),
- request.cache_params);
+ request.cache_params,
+ latents.video_positions);
int64_t sampling_end = ggml_time_ms();
if (x_t_sampled.empty()) {
LOG_ERROR("sampling(high noise) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
@@ -4573,7 +4740,8 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
request.vace_strength,
latents.audio_length,
static_cast(request.fps),
- request.cache_params);
+ request.cache_params,
+ latents.video_positions);
int64_t sampling_end = ggml_time_ms();
if (sd_ctx->sd->free_params_immediately) {
@@ -4602,6 +4770,12 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
}
}
+ if (latents.video_conditioning_frame_count > 0) {
+ int64_t target_frames = latents.video_target_frame_count > 0 ? latents.video_target_frame_count
+ : final_latent.shape()[2] - latents.video_conditioning_frame_count;
+ final_latent = sd::ops::slice(final_latent, 2, 0, target_frames);
+ }
+
if (latents.ref_image_num > 0) {
final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]);
}