Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added assets/ltx2/flf2v.webm
Binary file not shown.
12 changes: 12 additions & 0 deletions docs/ltx2.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,16 @@
src="../assets/ltx2/i2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>

### LTX-2.3 dev FLF2V

```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png -o flf2v.webm
```

<video
src="../assets/ltx2/flf2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>
4 changes: 3 additions & 1 deletion src/diffusion_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ struct DiffusionParams {
float vace_strength = 1.f;
int audio_length = 0;
float frame_rate = 24.f;
const sd::Tensor<float>* video_positions = nullptr;
const std::vector<int>* skip_layers = nullptr;
};

Expand Down Expand Up @@ -766,7 +767,8 @@ struct LTXAVModel : public DiffusionModel {
tensor_or_empty(diffusion_params.audio_x),
tensor_or_empty(diffusion_params.audio_timesteps),
diffusion_params.audio_length,
diffusion_params.frame_rate);
diffusion_params.frame_rate,
tensor_or_empty(diffusion_params.video_positions));
}
};

Expand Down
154 changes: 126 additions & 28 deletions src/ltxv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,56 @@ namespace LTXV {
return build_rope_matrix_from_frequencies(freqs, dim);
}

// Builds the video RoPE matrix from explicit per-token position bounds instead
// of a regular latent grid.
//
// `positions` must be rank 3 or rank 4 with shape()[0] == 2 (index 0 is read as
// the "start" bound, index 1 as the "end" bound), shape()[1] == 3 (three
// positional axes) and shape()[2] == number of tokens; a rank-4 tensor must have
// a trailing extent of 1. `max_pos` supplies one normalisation divisor per axis.
//
// When `use_middle_indices_grid` is set, each coordinate is the midpoint of
// start/end; otherwise the start bound is used. Each normalised coordinate c is
// remapped to (c * 2 - 1) and scaled by every entry of the frequency grid.
// With `num_heads > 1` the per-token frequencies are split across heads before
// the final matrix is assembled.
__STATIC_INLINE__ std::vector<float> build_video_rope_matrix_from_positions(const sd::Tensor<float>& positions,
                                                                            int dim,
                                                                            int num_heads,
                                                                            float theta,
                                                                            const std::vector<int>& max_pos,
                                                                            bool use_middle_indices_grid) {
    GGML_ASSERT(max_pos.size() == 3);
    GGML_ASSERT(dim % num_heads == 0);
    GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4);
    GGML_ASSERT(positions.shape()[0] == 2);
    GGML_ASSERT(positions.shape()[1] == 3);
    const bool is_rank4 = positions.dim() == 4;
    if (is_rank4) {
        GGML_ASSERT(positions.shape()[3] == 1);
    }

    const int64_t token_count        = positions.shape()[2];
    const std::vector<float> freq_grid = generate_freq_grid(theta, 3, dim);
    const int half_dim               = dim / 2;
    const int leading_pad            = half_dim - static_cast<int>(freq_grid.size()) * 3;

    // Rows start zero-filled, so the leading padding slots need no explicit write.
    std::vector<std::vector<float>> freqs(static_cast<size_t>(token_count), std::vector<float>(half_dim, 0.f));

    // Reads one bound (0 = start, 1 = end) for a given axis/token, coping with
    // the optional trailing singleton dimension.
    auto read_bound = [&](int bound, int axis, int64_t token) -> float {
        return is_rank4 ? positions.index(bound, axis, token, 0)
                        : positions.index(bound, axis, token);
    };

    for (int64_t t = 0; t < token_count; ++t) {
        float normalized[3];
        for (int axis = 0; axis < 3; ++axis) {
            const float lo  = read_bound(0, axis, t);
            const float hi  = read_bound(1, axis, t);
            const float raw = use_middle_indices_grid ? 0.5f * (lo + hi) : lo;
            normalized[axis] = raw / static_cast<float>(max_pos[axis]);
        }

        // Real frequencies are written after the padding, frequency-major with
        // the three axes interleaved.
        std::vector<float>& row = freqs[t];
        int cursor              = leading_pad > 0 ? leading_pad : 0;
        for (const float freq : freq_grid) {
            for (const float position : normalized) {
                row[cursor++] = freq * (position * 2.f - 1.f);
            }
        }
    }

    if (num_heads <= 1) {
        return build_rope_matrix_from_frequencies(freqs, dim);
    }
    return build_rope_matrix_from_frequencies(split_frequencies_by_heads(freqs, dim, num_heads), dim / num_heads);
}

__STATIC_INLINE__ std::vector<float> build_1d_rope_matrix(int64_t seq_len,
int dim,
int num_heads = 1,
Expand Down Expand Up @@ -848,6 +898,31 @@ namespace LTXV {
return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, static_cast<float>(max_pos_t));
}

// Builds a 1D RoPE matrix from axis 0 of an explicit per-token position tensor.
//
// `positions` must be rank 3 or rank 4 with shape()[0] == 2 (index 0 is read as
// the "start" bound, index 1 as the "end" bound), shape()[1] >= 1 and
// shape()[2] == number of tokens; a rank-4 tensor must have a trailing extent
// of 1. Only axis 0 is consulted.
//
// Each token contributes one coordinate: the start/end midpoint when
// `use_middle_indices_grid` is set, otherwise the start bound. The coordinates
// are forwarded to build_1d_rope_matrix_from_coords together with `max_pos_t`
// converted to float.
__STATIC_INLINE__ std::vector<float> build_video_temporal_rope_matrix_from_positions(const sd::Tensor<float>& positions,
                                                                                     int dim,
                                                                                     int num_heads,
                                                                                     float theta,
                                                                                     int max_pos_t,
                                                                                     bool use_middle_indices_grid) {
    GGML_ASSERT(positions.dim() == 3 || positions.dim() == 4);
    GGML_ASSERT(positions.shape()[0] == 2);
    GGML_ASSERT(positions.shape()[1] >= 1);
    const bool is_rank4 = positions.dim() == 4;
    if (is_rank4) {
        GGML_ASSERT(positions.shape()[3] == 1);
    }

    // Reads one bound (0 = start, 1 = end) on axis 0 for the given token.
    auto read_bound = [&](int bound, int64_t token) -> float {
        return is_rank4 ? positions.index(bound, 0, token, 0)
                        : positions.index(bound, 0, token);
    };

    const int64_t token_count = positions.shape()[2];
    std::vector<float> coords;
    coords.reserve(static_cast<size_t>(token_count));
    for (int64_t t = 0; t < token_count; ++t) {
        const float lo = read_bound(0, t);
        const float hi = read_bound(1, t);
        if (use_middle_indices_grid) {
            coords.push_back(0.5f * (lo + hi));
        } else {
            coords.push_back(lo);
        }
    }

    const float max_pos = static_cast<float>(max_pos_t);
    return build_1d_rope_matrix_from_coords(coords, dim, num_heads, theta, max_pos);
}

__STATIC_INLINE__ float audio_latent_start_time_sec(int64_t latent_index,
int audio_latent_downsample_factor = 4,
int hop_length = 160,
Expand Down Expand Up @@ -1664,7 +1739,8 @@ namespace LTXV {
const sd::Tensor<float>& audio_x_tensor = {},
const sd::Tensor<float>& audio_timesteps_tensor = {},
int audio_length = 0,
float frame_rate = 24.f) {
float frame_rate = 24.f,
const sd::Tensor<float>& video_positions_tensor = {}) {
auto split_inputs = split_av_latents(x_tensor, audio_length);
vx_input_cache = split_inputs.first;
if (!audio_x_tensor.empty()) {
Expand All @@ -1681,19 +1757,31 @@ namespace LTXV {

ggml_cgraph* gf = new_graph_custom(LTXAV_GRAPH_SIZE);

float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f;
video_pe_vec = build_video_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.vae_scale_factors,
params.causal_temporal_positioning,
params.use_middle_indices_grid);
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.num_attention_heads);
float video_frame_rate = frame_rate > 0.f ? frame_rate : 24.f;
int64_t video_token_count = vx->ne[0] * vx->ne[1] * vx->ne[2];
bool has_video_positions = !video_positions_tensor.empty();
if (has_video_positions) {
GGML_ASSERT(video_positions_tensor.shape()[2] == video_token_count);
video_pe_vec = build_video_rope_matrix_from_positions(video_positions_tensor,
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.use_middle_indices_grid);
} else {
video_pe_vec = build_video_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.hidden_size),
static_cast<int>(params.num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
params.positional_embedding_max_pos,
params.vae_scale_factors,
params.causal_temporal_positioning,
params.use_middle_indices_grid);
}
auto video_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.attention_head_dim / 2, video_token_count * params.num_attention_heads);
ggml_set_name(video_pe, "ltxav_video_pe");
set_backend_tensor_data(video_pe, video_pe_vec.data());

Expand All @@ -1712,18 +1800,27 @@ namespace LTXV {
set_backend_tensor_data(audio_pe, audio_pe_vec.data());

int temporal_max_pos = std::max(params.positional_embedding_max_pos[0], params.audio_positional_embedding_max_pos[0]);
video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
temporal_max_pos,
std::get<0>(params.vae_scale_factors),
params.causal_temporal_positioning,
true);
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, vx->ne[0] * vx->ne[1] * vx->ne[2] * params.audio_num_attention_heads);
if (has_video_positions) {
video_cross_pe_vec = build_video_temporal_rope_matrix_from_positions(video_positions_tensor,
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
params.positional_embedding_theta,
temporal_max_pos,
true);
} else {
video_cross_pe_vec = build_video_temporal_rope_matrix(vx->ne[0],
vx->ne[1],
vx->ne[2],
static_cast<int>(params.audio_cross_attention_dim),
static_cast<int>(params.audio_num_attention_heads),
video_frame_rate,
params.positional_embedding_theta,
temporal_max_pos,
std::get<0>(params.vae_scale_factors),
params.causal_temporal_positioning,
true);
}
video_cross_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, params.audio_attention_head_dim / 2, video_token_count * params.audio_num_attention_heads);
ggml_set_name(video_cross_pe, "ltxav_video_cross_pe");
set_backend_tensor_data(video_cross_pe, video_cross_pe_vec.data());

Expand Down Expand Up @@ -1806,9 +1903,10 @@ namespace LTXV {
const sd::Tensor<float>& audio_x = {},
const sd::Tensor<float>& audio_timesteps = {},
int audio_length = 0,
float frame_rate = 24.f) {
float frame_rate = 24.f,
const sd::Tensor<float>& video_positions = {}) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate);
return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate, video_positions);
};
auto out = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
return out;
Expand Down
Loading
Loading