diff --git a/assets/ltx2/flf2v.webm b/assets/ltx2/flf2v.webm new file mode 100644 index 000000000..1b22a437c Binary files /dev/null and b/assets/ltx2/flf2v.webm differ diff --git a/docs/ltx2.md b/docs/ltx2.md index 4861cf0e9..b8ec4cf98 100644 --- a/docs/ltx2.md +++ b/docs/ltx2.md @@ -38,4 +38,16 @@ src="../assets/ltx2/i2v.webm" controls muted + style="max-width: 100%; height: auto;"> + +### LTX-2.3 dev FLF2V + +``` +.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png -o flf2v.webm +``` + + \ No newline at end of file diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index e5aa5073d..9e4e444ec 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -40,6 +40,7 @@ struct DiffusionParams { float vace_strength = 1.f; int audio_length = 0; float frame_rate = 24.f; + const sd::Tensor* video_positions = nullptr; const std::vector* skip_layers = nullptr; }; @@ -766,7 +767,8 @@ struct LTXAVModel : public DiffusionModel { tensor_or_empty(diffusion_params.audio_x), tensor_or_empty(diffusion_params.audio_timesteps), diffusion_params.audio_length, - diffusion_params.frame_rate); + diffusion_params.frame_rate, + tensor_or_empty(diffusion_params.video_positions)); } }; diff --git a/src/ltxv.hpp b/src/ltxv.hpp index a4aeea6f5..fa6c0601d 100644 --- a/src/ltxv.hpp +++ b/src/ltxv.hpp @@ -243,6 +243,56 @@ namespace LTXV { return build_rope_matrix_from_frequencies(freqs, dim); } + __STATIC_INLINE__ std::vector build_video_rope_matrix_from_positions(const sd::Tensor& positions, + int dim, + int num_heads, + float theta, + const std::vector& max_pos, + bool use_middle_indices_grid) { + GGML_ASSERT(max_pos.size() == 3); + GGML_ASSERT(dim % num_heads == 0); + GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4); + GGML_ASSERT(positions.shape()[0] == 2); + GGML_ASSERT(positions.shape()[1] == 3); + if (positions.dim() == 4) { + GGML_ASSERT(positions.shape()[3] == 1); + } + + const int64_t tokens = positions.shape()[2]; + const std::vector indices = generate_freq_grid(theta, 3, dim); + const int half_dim = dim / 2; + const int pad_size = half_dim - static_cast(indices.size()) * 3; + std::vector> freqs(static_cast(tokens), std::vector(half_dim, 0.f)); + + for (int64_t token = 0; token < tokens; token++) { + int out_idx = 0; + for (int i = 0; i < pad_size; i++) { + freqs[token][out_idx++] = 0.f; + } + + float coords[3]; + for (int axis = 0; axis < 3; axis++) { + float start = positions.dim() == 4 ? positions.index(0, axis, token, 0) + : positions.index(0, axis, token); + float end = positions.dim() == 4 ? positions.index(1, axis, token, 0) + : positions.index(1, axis, token); + float coord = use_middle_indices_grid ? 0.5f * (start + end) : start; + coords[axis] = coord / static_cast(max_pos[axis]); + } + + for (float index : indices) { + for (int axis = 0; axis < 3; axis++) { + freqs[token][out_idx++] = index * (coords[axis] * 2.f - 1.f); + } + } + } + + if (num_heads > 1) { + return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads); + } + return build_rope_matrix_from_frequencies(freqs, dim); + } + __STATIC_INLINE__ std::vector build_1d_rope_matrix(int64_t seq_len, int dim, int num_heads = 1, @@ -848,6 +898,31 @@ namespace LTXV { return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast(max_pos_t)); } + __STATIC_INLINE__ std::vector build_video_temporal_rope_matrix_from_positions(const sd::Tensor& positions, + int dim, + int num_heads, + float theta, + int max_pos_t, + bool use_middle_indices_grid) { + GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4); + GGML_ASSERT(positions.shape()[0] == 2); + GGML_ASSERT(positions.shape()[1] >= 1); + if (positions.dim() == 4) { + GGML_ASSERT(positions.shape()[3] == 1); + } + + std::vector coords; + coords.reserve(static_cast(positions.shape()[2])); + for (int64_t token = 0; token < positions.shape()[2]; token++) { + float start = positions.dim() == 4 ? positions.index(0, 0, token, 0) + : positions.index(0, 0, token); + float end = positions.dim() == 4 ? positions.index(1, 0, token, 0) + : positions.index(1, 0, token); + coords.push_back(use_middle_indices_grid ? 0.5f * (start + end) : start); + } + return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast(max_pos_t)); + } + __STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index, int audio_latent_downsample_factor = 4, int hop_length = 160, @@ -1664,7 +1739,8 @@ namespace LTXV { const sd::Tensor& audio_x_tensor = {}, const sd::Tensor& audio_timesteps_tensor = {}, int audio_length = 0, - float frame_rate = 24.f) { + float frame_rate = 24.f, + const sd::Tensor& video_positions_tensor = {}) { auto split_inputs = split_av_latents(x_tensor, audio_length); vx_input_cache = split_inputs.first; if (!audio_x_tensor.empty()) { @@ -1681,19 +1757,31 @@ namespace LTXV { ggml_cgraph* gf = new_graph_custom(LTXAV_GRAPH_SIZE); - float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f; - video_pe_vec = build_video_rope_matrix(vx->ne[0], - vx->ne[1], - vx->ne[2], - static_cast(params.hidden_size), - static_cast(params.num_attention_heads), - video_frame_rate, - params.positional_embedding_theta, - params.positional_embedding_max_pos, - params.vae_scale_factors, - params.causal_temporal_positioning, - params.use_middle_indices_grid); - auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads); + float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f; + int64_t video_token_count = vx->ne[0] * vx->ne[1] * vx->ne[2]; + bool has_video_positions = !video_positions_tensor.empty(); + if (has_video_positions) { + GGML_ASSERT(video_positions_tensor.shape()[2] == video_token_count); + video_pe_vec = build_video_rope_matrix_from_positions(video_positions_tensor, + static_cast(params.hidden_size), + static_cast(params.num_attention_heads), + params.positional_embedding_theta, + params.positional_embedding_max_pos, + params.use_middle_indices_grid); + } else { + video_pe_vec = build_video_rope_matrix(vx->ne[0], + vx->ne[1], + vx->ne[2], + static_cast(params.hidden_size), + static_cast(params.num_attention_heads), + video_frame_rate, + params.positional_embedding_theta, + params.positional_embedding_max_pos, + params.vae_scale_factors, + params.causal_temporal_positioning, + params.use_middle_indices_grid); + } + auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, video_token_count * params.num_attention_heads); ggml_set_name(video_pe, "ltxav_video_pe"); set_backend_tensor_data(video_pe, video_pe_vec.data()); @@ -1712,18 +1800,27 @@ namespace LTXV { set_backend_tensor_data(audio_pe, audio_pe_vec.data()); int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]); - video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0], - vx->ne[1], - vx->ne[2], - static_cast(params.audio_cross_attention_dim), - static_cast(params.audio_num_attention_heads), - video_frame_rate, - params.positional_embedding_theta, - temporal_max_pos, - std::get<0>(params.vae_scale_factors), - params.causal_temporal_positioning, - true); - video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads); + if (has_video_positions) { + video_cross_pe_vec = build_video_temporal_rope_matrix_from_positions(video_positions_tensor, + static_cast(params.audio_cross_attention_dim), + static_cast(params.audio_num_attention_heads), + params.positional_embedding_theta, + temporal_max_pos, + true); + } else { + video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0], + vx->ne[1], + vx->ne[2], + static_cast(params.audio_cross_attention_dim), + static_cast(params.audio_num_attention_heads), + video_frame_rate, + params.positional_embedding_theta, + temporal_max_pos, + std::get<0>(params.vae_scale_factors), + params.causal_temporal_positioning, + true); + } + video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, video_token_count * params.audio_num_attention_heads); ggml_set_name(video_cross_pe, "ltxav_video_cross_pe"); set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data()); @@ -1806,9 +1903,10 @@ namespace LTXV { const sd::Tensor& audio_x = {}, const sd::Tensor& audio_timesteps = {}, int audio_length = 0, - float frame_rate = 24.f) { + float frame_rate = 24.f, + const sd::Tensor& video_positions = {}) { auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate); + return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate, video_positions); }; auto out = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); return out; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index b2558ab66..b51774bec 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1827,7 +1827,8 @@ class StableDiffusionGGML { float vace_strength, int audio_length, float frame_rate, - const sd_cache_params_t* cache_params) { + const sd_cache_params_t* cache_params, + const sd::Tensor& video_positions = {}) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); float cfg_scale = guidance.txt_cfg; float img_cfg_scale = guidance.img_cfg; @@ -1933,6 +1934,7 @@ class StableDiffusionGGML { diffusion_params.vace_strength = vace_strength; diffusion_params.audio_length = audio_length; diffusion_params.frame_rate = frame_rate; + diffusion_params.video_positions = video_positions.empty() ? nullptr : &video_positions; diffusion_params.skip_layers = nullptr; compute_sample_controls(control_image, @@ -3216,16 +3218,99 @@ struct ImageGenerationLatents { sd::Tensor concat_latent; sd::Tensor uncond_concat_latent; sd::Tensor audio_latent; + sd::Tensor video_positions; sd::Tensor control_image; std::vector> ref_images; std::vector> ref_latents; sd::Tensor denoise_mask; sd::Tensor clip_vision_output; sd::Tensor vace_context; - int64_t ref_image_num = 0; - int audio_length = 0; + int64_t ref_image_num = 0; + int64_t video_conditioning_frame_count = 0; + int64_t video_target_frame_count = 0; + int audio_length = 0; }; +static float ltxv_latent_corner_to_pixel_frame(int64_t corner_index, + int temporal_scale, + bool causal_temporal_positioning) { + float pixel_t = static_cast(corner_index * temporal_scale); + if (causal_temporal_positioning) { + pixel_t = std::max(0.f, pixel_t + 1.f - static_cast(temporal_scale)); + } + return pixel_t; +} + +static void set_ltxv_video_position(sd::Tensor* positions, + int64_t token, + float t_start, + float t_end, + float h_start, + float h_end, + float w_start, + float w_end) { + positions->index(0, 0, token, 0) = t_start; + positions->index(1, 0, token, 0) = t_end; + positions->index(0, 1, token, 0) = h_start; + positions->index(1, 1, token, 0) = h_end; + positions->index(0, 2, token, 0) = w_start; + positions->index(1, 2, token, 0) = w_end; +} + +static sd::Tensor build_ltxv_video_positions(int64_t width, + int64_t height, + int64_t target_latent_frames, + int64_t keyframe_latent_frames, + int keyframe_frame_idx, + int keyframe_pixel_frames, + int fps, + int spatial_scale, + int temporal_scale, + bool causal_temporal_positioning) { + GGML_ASSERT(width > 0 && height > 0 && target_latent_frames > 0); + GGML_ASSERT(keyframe_latent_frames > 0); + GGML_ASSERT(fps > 0); + + int64_t total_tokens = width * height * (target_latent_frames + keyframe_latent_frames); + sd::Tensor positions({2, 3, total_tokens, 1}); + int64_t token = 0; + + for (int64_t t = 0; t < target_latent_frames; t++) { + float t_start = ltxv_latent_corner_to_pixel_frame(t, temporal_scale, causal_temporal_positioning) / static_cast(fps); + float t_end = ltxv_latent_corner_to_pixel_frame(t + 1, temporal_scale, causal_temporal_positioning) / static_cast(fps); + for (int64_t h = 0; h < height; h++) { + float h_start = static_cast(h * spatial_scale); + float h_end = static_cast((h + 1) * spatial_scale); + for (int64_t w = 0; w < width; w++) { + float w_start = static_cast(w * spatial_scale); + float w_end = static_cast((w + 1) * spatial_scale); + set_ltxv_video_position(&positions, token++, t_start, t_end, h_start, h_end, w_start, w_end); + } + } + } + + for (int64_t t = 0; t < keyframe_latent_frames; t++) { + float t_start = static_cast(keyframe_frame_idx + t * temporal_scale); + float t_end = static_cast(keyframe_frame_idx + (t + 1) * temporal_scale); + if (keyframe_pixel_frames == 1) { + t_end = t_start + 1.f; + } + t_start /= static_cast(fps); + t_end /= static_cast(fps); + for (int64_t h = 0; h < height; h++) { + float h_start = static_cast(h * spatial_scale); + float h_end = static_cast((h + 1) * spatial_scale); + for (int64_t w = 0; w < width; w++) { + float w_start = static_cast(w * spatial_scale); + float w_end = static_cast((w + 1) * spatial_scale); + set_ltxv_video_position(&positions, token++, t_start, t_end, h_start, h_end, w_start, w_end); + } + } + } + + return positions; +} + static sd::Tensor pack_ltxav_audio_and_video_latents(const sd::Tensor& video_latent, const sd::Tensor& audio_latent) { if (audio_latent.empty()) { @@ -4136,33 +4221,27 @@ static std::optional prepare_video_generation_latents(sd } if (sd_version_is_ltxav(sd_ctx->sd->version)) { - if (!end_image.empty() || sd_vid_gen_params->control_frames_size > 0) { - LOG_ERROR("LTXAV currently supports txt2vid and init_image i2v only; end_image and control_frames are not implemented"); + if (sd_vid_gen_params->control_frames_size > 0) { + LOG_ERROR("LTXAV control_frames are not implemented"); return std::nullopt; } - if (!start_image.empty()) { + if (!start_image.empty() || !end_image.empty()) { if (sd_ctx->sd->vae_decode_only) { - LOG_ERROR("LTXAV init_image i2v requires VAE encoder weights; create the context with vae_decode_only=false"); + LOG_ERROR("LTXAV image conditioning requires VAE encoder weights; create the context with vae_decode_only=false"); return std::nullopt; } - LOG_INFO("IMG2VID"); - - int64_t t1 = ggml_time_ms(); - auto init_img = start_image.reshape({start_image.shape()[0], - start_image.shape()[1], - 1, - start_image.shape()[2], - start_image.shape()[3]}); - auto init_image_latent = sd_ctx->sd->encode_first_stage(init_img); - if (init_image_latent.empty()) { - LOG_ERROR("failed to encode LTXAV init image"); - return std::nullopt; + if (!start_image.empty() && !end_image.empty()) { + LOG_INFO("FLF2V"); + } else if (!start_image.empty()) { + LOG_INFO("IMG2VID"); + } else { + LOG_INFO("END2VID"); } + int64_t t1 = ggml_time_ms(); latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true); - sd::ops::slice_assign(&latents.init_latent, 2, 0, init_image_latent.shape()[2], init_image_latent); float conditioning_strength = std::clamp(request->strength, 0.f, 1.f); float conditioned_mask = 1.0f - conditioning_strength; @@ -4172,7 +4251,94 @@ static std::optional prepare_video_generation_latents(sd 1, 1}, 1.f); - sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], conditioned_mask); + + auto encode_ltxav_condition_image = [&](const sd::Tensor& image, const char* name) -> sd::Tensor { + auto condition_image = image.reshape({image.shape()[0], + image.shape()[1], + 1, + image.shape()[2], + image.shape()[3]}); + auto condition_latent = sd_ctx->sd->encode_first_stage(condition_image); + if (condition_latent.empty()) { + LOG_ERROR("failed to encode LTXAV %s image", name); + } + return condition_latent; + }; + + auto apply_video_condition_by_latent_index = [&](const sd::Tensor& condition_latent, + int64_t latent_idx, + const char* name) -> bool { + int64_t latent_frames = latents.init_latent.shape()[2]; + int64_t condition_frames = condition_latent.shape()[2]; + if (latent_idx < 0 || condition_frames <= 0 || latent_idx + condition_frames > latent_frames) { + LOG_ERROR("invalid LTXAV %s image latent range: start=%" PRId64 ", length=%" PRId64 ", latent_frames=%" PRId64, + name, + latent_idx, + condition_frames, + latent_frames); + return false; + } + + sd::ops::slice_assign(&latents.init_latent, 2, latent_idx, latent_idx + condition_frames, condition_latent); + sd::ops::fill_slice(&latents.denoise_mask, 2, latent_idx, latent_idx + condition_frames, conditioned_mask); + return true; + }; + + auto apply_video_condition_by_keyframe_index = [&](const sd::Tensor& keyframes, + int frame_idx, + const char* name) -> bool { + int64_t keyframe_frames = keyframes.shape()[2]; + if (keyframe_frames <= 0 || keyframes.shape()[0] != latents.init_latent.shape()[0] || + keyframes.shape()[1] != latents.init_latent.shape()[1] || + keyframes.shape()[3] != latents.init_latent.shape()[3]) { + LOG_ERROR("invalid LTXAV %s keyframe latent shape", name); + return false; + } + + latents.video_target_frame_count = latents.init_latent.shape()[2]; + latents.video_conditioning_frame_count = keyframe_frames; + latents.init_latent = sd::ops::concat(latents.init_latent, keyframes, 2); + + auto keyframe_mask = sd::full({keyframes.shape()[0], + keyframes.shape()[1], + keyframes.shape()[2], + 1, + 1}, + conditioned_mask); + latents.denoise_mask = sd::ops::concat(latents.denoise_mask, keyframe_mask, 2); + latents.video_positions = build_ltxv_video_positions(latents.init_latent.shape()[0], + latents.init_latent.shape()[1], + latents.video_target_frame_count, + keyframe_frames, + frame_idx, + 1, + request->fps, + request->vae_scale_factor, + 8, + true); + return true; + }; + + if (!start_image.empty()) { + auto start_image_latent = encode_ltxav_condition_image(start_image, "init"); + if (start_image_latent.empty() || !apply_video_condition_by_latent_index(start_image_latent, 0, "init")) { + return std::nullopt; + } + } + + if (!end_image.empty()) { + auto end_image_latent = encode_ltxav_condition_image(end_image, "end"); + if (end_image_latent.empty()) { + return std::nullopt; + } + + int frame_idx = request->frames - 1; + bool ok = frame_idx == 0 ? apply_video_condition_by_latent_index(end_image_latent, 0, "end") + : apply_video_condition_by_keyframe_index(end_image_latent, frame_idx, "end"); + if (!ok) { + return std::nullopt; + } + } int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); @@ -4528,7 +4694,8 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, request.vace_strength, latents.audio_length, static_cast(request.fps), - request.cache_params); + request.cache_params, + latents.video_positions); int64_t sampling_end = ggml_time_ms(); if (x_t_sampled.empty()) { LOG_ERROR("sampling(high noise) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); @@ -4573,7 +4740,8 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, request.vace_strength, latents.audio_length, static_cast(request.fps), - request.cache_params); + request.cache_params, + latents.video_positions); int64_t sampling_end = ggml_time_ms(); if (sd_ctx->sd->free_params_immediately) { @@ -4602,6 +4770,12 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, } } + if (latents.video_conditioning_frame_count > 0) { + int64_t target_frames = latents.video_target_frame_count > 0 ? latents.video_target_frame_count + : final_latent.shape()[2] - latents.video_conditioning_frame_count; + final_latent = sd::ops::slice(final_latent, 2, 0, target_frames); + } + if (latents.ref_image_num > 0) { final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]); }