Hi,

I'm trying to capture the model's prompt cross-attention so I can apply some latent optimization techniques during inference, but `latents.grad` comes back as `None` no matter what I try. Here are the relevant code snippets:
```python
class Optimizer:
    def __init__(
        self,
        loss_fn: LayoutLoss,
        num_refinements: int = 3,
        lr_start: float = 0.01,
        lr_end: float = 0.05,
        betas: tuple[float, float] = (0.4, 0.9),
        weight_decay: float = 0.0,
    ):
        self.loss_fn = loss_fn
        self.num_refinements = num_refinements
        self.lr_start = lr_start
        self.lr_end = lr_end
        self.betas = betas
        self.weight_decay = weight_decay

    def optimize(
        self,
        transformer: LTX2VideoTransformer3DModel,
        latents: torch.Tensor,
        audio_latents: torch.Tensor,
        prompt_embeds: torch.Tensor,
        audio_prompt_embeds: torch.Tensor,
        timestep: torch.Tensor,
        attention_mask: torch.Tensor,
        num_frames: int,
        height: int,
        width: int,
        fps: float,
        audio_num_frames: int,
        video_coords: torch.Tensor,
        audio_coords: torch.Tensor,
        attention_kwargs: Dict[str, Any],
        store: AttentionStore,
        progress_bar: tqdm.tqdm,
    ) -> torch.Tensor:
        latents = latents.clone().detach()
        latents = latents.to(transformer.dtype)

        optimizer = torch.optim.AdamW(
            [latents],
            lr=self.lr_start,
            betas=self.betas,
            weight_decay=self.weight_decay,
        )
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=self.num_refinements, eta_min=self.lr_end
        )

        transformer.zero_grad(set_to_none=True)
        first_loss = None

        with torch.enable_grad():
            for i in range(self.num_refinements):
                latents = latents.requires_grad_(True)
                store.reset()

                latent_model_input = latents.to(transformer.dtype)
                _ = transformer(
                    hidden_states=latent_model_input,
                    audio_hidden_states=audio_latents,
                    encoder_hidden_states=prompt_embeds,
                    audio_encoder_hidden_states=audio_prompt_embeds,
                    timestep=timestep,
                    encoder_attention_mask=attention_mask,
                    audio_encoder_attention_mask=attention_mask,
                    num_frames=num_frames,
                    height=height,
                    width=width,
                    fps=fps,
                    audio_num_frames=audio_num_frames,
                    video_coords=video_coords,
                    audio_coords=audio_coords,
                    attention_kwargs=attention_kwargs,
                    return_dict=False,
                )[0]

                attn = store.get_avg_attention().unsqueeze(0)
                loss = self.loss_fn(attn)

                # Backward
                loss.backward()

                # FIX 4: Verify gradients exist before stepping
                if latents.grad is None:
                    print(f"WARNING: latents.grad is None at iteration {i + 1}!")
                    print("  Gradient flow is broken. Check:")
                    print("  1. AttentionStore doesn't use .clone()")
                    print("  2. No dtype conversion breaks the computation graph")
                    print("  3. Gradient checkpointing is disabled")
                    break

                # Only step if we have gradients
                optimizer.step()
                scheduler.step()

                if i == 0:
                    first_loss = loss.item()

                current_lr = scheduler.get_last_lr()[0]
                progress_bar.set_postfix(
                    loss=f"{first_loss:.2f}→{loss.item():.2f}",
                    grad=f"{latents.grad.norm().item():.2e}",
                    lr=f"{current_lr:.2e}",
                    refine_step=f"{i + 1}/{self.num_refinements}",
                )

        store.reset()
        return latents.detach()


class AttentionStore:
    def __init__(self):
        self.accumulator = None
        self.count = 0
        self.keep_heads = False

    def __call__(self, probs: torch.Tensor) -> torch.Tensor:
        # Drop the unconditional half of a CFG batch, keep the conditional one.
        if probs.shape[0] == 2:
            probs = probs[1:]
        if not self.keep_heads:
            probs = probs.mean(dim=1)
        if self.accumulator is None:
            self.accumulator = probs
        else:
            self.accumulator = self.accumulator + probs
        self.count += 1
        return probs

    def reset(self):
        self.accumulator = None
        self.count = 0

    def get_avg_attention(self) -> torch.Tensor:
        return self.accumulator / self.count


class AttnProcessor:
    r"""
    Processor for implementing attention (SDPA is used by default if you're using PyTorch 2.0) for the LTX-2.0
    model. Compared to the LTX-1.0 model, we allow the RoPE embeddings for the queries and keys to be separate
    so that we can support audio-to-video (a2v) and video-to-audio (v2a) cross attention.

    FIXED: Now uses manual attention output for cross-attention to maintain gradient flow.
    """

    _attention_backend = None
    _parallel_config = None

    def __init__(self, store: AttentionStore, name: str):
        if is_torch_version("<", "2.0"):
            raise ValueError(
                "LTX attention processors require a minimum PyTorch version of 2.0. "
                "Please upgrade your PyTorch installation."
            )
        self.store = store
        self.name = name

    def __call__(
        self,
        attn: "LTX2Attention",
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        query_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        key_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        original_encoder_hidden_states = encoder_hidden_states
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        query = attn.to_q(hidden_states)
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.norm_q(query)
        key = attn.norm_k(key)

        if query_rotary_emb is not None:
            if attn.rope_type == "interleaved":
                query = apply_interleaved_rotary_emb(query, query_rotary_emb)
                key = apply_interleaved_rotary_emb(
                    key, key_rotary_emb if key_rotary_emb is not None else query_rotary_emb
                )
            elif attn.rope_type == "split":
                query = apply_split_rotary_emb(query, query_rotary_emb)
                key = apply_split_rotary_emb(
                    key, key_rotary_emb if key_rotary_emb is not None else query_rotary_emb
                )

        query = query.unflatten(2, (attn.heads, -1))
        key = key.unflatten(2, (attn.heads, -1))
        value = value.unflatten(2, (attn.heads, -1))

        is_cross = (
            original_encoder_hidden_states is not None
            and original_encoder_hidden_states is not hidden_states
        )
        if is_cross:
            # Recompute the cross-attention probabilities manually so they can be
            # stored without going through the fused SDPA kernel.
            q = query.permute(0, 2, 1, 3)
            k = key.permute(0, 2, 1, 3)
            scale_factor = 1.0 / math.sqrt(q.size(-1))
            scores = torch.matmul(q, k.transpose(-1, -2)) * scale_factor
            if attention_mask is not None:
                scores = scores + attention_mask
            probs = F.softmax(scores, dim=-1)
            self.store(probs)

        hidden_states = dispatch_attention_fn(
            query,
            key,
            value,
            attn_mask=attention_mask,
            dropout_p=0.0,
            is_causal=False,
            backend=self._attention_backend,
        )
        hidden_states = hidden_states.flatten(2, 3)
        hidden_states = hidden_states.to(query.dtype)

        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)
        return hidden_states
```
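For completeness, the processors are wired into the transformer roughly along these lines (a minimal sketch, assuming the transformer exposes diffusers' usual `attn_processors` / `set_attn_processor` API; the `"attn2"` name filter is illustrative and depends on the actual module names):

```python
# Sketch: attach an AttnProcessor (sharing one AttentionStore) to every prompt
# cross-attention layer, leaving the remaining layers on their defaults.
store = AttentionStore()
processors = {}
for name, processor in transformer.attn_processors.items():
    if "attn2" in name:  # illustrative filter for the text cross-attention layers
        processors[name] = AttnProcessor(store, name)
    else:
        processors[name] = processor
transformer.set_attn_processor(processors)
```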
I'm hoping someone can point me to the issue. I have a feeling the captured attention maps are not connected to the latents in the forward pass's computation graph.
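To narrow it down, I checked in isolation that the `AttentionStore` itself preserves the graph (a self-contained toy sketch; the linear layers and shapes are stand-ins for the real model, and `AttentionStore` is the class defined above):

```python
import torch
import torch.nn.functional as F

# Stand-ins: x plays the role of the latents, ctx the prompt embeddings.
x = torch.randn(1, 8, 16, requires_grad=True)
ctx = torch.randn(1, 4, 16)

to_q = torch.nn.Linear(16, 16)
to_k = torch.nn.Linear(16, 16)

store = AttentionStore()
scores = to_q(x) @ to_k(ctx).transpose(-1, -2) / 16**0.5
probs = F.softmax(scores.unsqueeze(1), dim=-1)  # (batch, heads=1, q_len, k_len)
store(probs)

# If the store kept the graph intact, this gradient is not None.
loss = store.get_avg_attention().square().mean()
(grad,) = torch.autograd.grad(loss, x, allow_unused=True)
print("grad is None:", grad is None)  # prints False here
```

Since this toy path does produce a gradient, my suspicion is that something inside the real forward (the attention backend, a dtype cast, or gradient checkpointing) cuts the graph before `loss.backward()` reaches the latents.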