
How to use quantization and device_map='balanced' to run Qwen-Image on Kaggle T4 * 2 #12719

@chaowenguo

Description

!python3 -m pip install -U diffusers peft bitsandbytes protobuf

import diffusers, torch, math

qwen = diffusers.QwenImagePipeline.from_pretrained(
    'Qwen/Qwen-Image',
    quantization_config=diffusers.PipelineQuantizationConfig(
        quant_backend='bitsandbytes_4bit',
        quant_kwargs={'load_in_4bit': True, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_compute_dtype': torch.float16},
        components_to_quantize=['transformer', 'text_encoder'],
    ),
    torch_dtype=torch.float16,
    device_map='balanced',
)
print(qwen.hf_device_map)
qwen.scheduler = diffusers.FlowMatchEulerDiscreteScheduler.from_config({
    'base_image_seq_len': 256,
    'base_shift': math.log(3),
    'invert_sigmas': False,
    'max_image_seq_len': 8192,
    'max_shift': math.log(3),
    'num_train_timesteps': 1000,
    'shift': 1,
    'shift_terminal': None,
    'stochastic_sampling': False,
    'time_shift_type': 'exponential',
    'use_beta_sigmas': False,
    'use_dynamic_shifting': True,
    'use_exponential_sigmas': False,
    'use_karras_sigmas': False,
})
qwen.load_lora_weights(
    'lightx2v/Qwen-Image-Lightning',
    weight_name='Qwen-Image-Lightning-4steps-V2.0.safetensors',
    adapter_name='lightning',
)
qwen.set_adapters('lightning', adapter_weights=1)
qwen(prompt='a beautiful girl', height=1280, width=720, num_inference_steps=4, true_cfg_scale=1).images[0].save('a.png')

WARNING:accelerate.big_modeling:Some parameters are on the meta device because they were offloaded to the cpu.
{'text_encoder': 'cpu', 'vae': 0}

Where is the transformer?
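A quick way to check where each component actually ended up (a minimal sketch continuing the snippet above; I am assuming DiffusionPipeline.components exposes every loaded module and that offloaded parameters simply report device meta):

for name, component in qwen.components.items():
    # only torch modules have parameters to inspect
    if isinstance(component, torch.nn.Module):
        try:
            # report the device of the first parameter of each component
            print(name, next(component.parameters()).device)
        except StopIteration:
            print(name, '(no parameters)')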

NotImplementedError: Cannot copy out of meta tensor; no data!

I want to ask how to make the above code work on Kaggle. Why is 16 GB * 2 of VRAM still not enough to run 4-bit quantized Qwen-Image? I want to take full advantage of both GPUs. Do I need max_memory?
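If max_memory is the way to go, is something like the following the intended usage? This is only a sketch of what I would try: the 14GiB caps are my guess to leave headroom on each 16 GB T4, and I am assuming from_pretrained forwards max_memory to accelerate's dispatch.

import diffusers, torch

quant = diffusers.PipelineQuantizationConfig(
    quant_backend='bitsandbytes_4bit',
    quant_kwargs={'load_in_4bit': True, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_compute_dtype': torch.float16},
    components_to_quantize=['transformer', 'text_encoder'],
)
qwen = diffusers.QwenImagePipeline.from_pretrained(
    'Qwen/Qwen-Image',
    quantization_config=quant,
    torch_dtype=torch.float16,
    device_map='balanced',
    # assumption: cap below the physical 16GB so activations and the LoRA still fit
    max_memory={0: '14GiB', 1: '14GiB'},
)
print(qwen.hf_device_map)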

Full error logs:
/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
117
118 return decorate_context

/usr/local/lib/python3.11/dist-packages/diffusers/pipelines/qwenimage/pipeline_qwenimage.py in __call__(self, prompt, negative_prompt, true_cfg_scale, height, width, num_inference_steps, sigmas, guidance_scale, num_images_per_prompt, generator, latents, prompt_embeds, prompt_embeds_mask, negative_prompt_embeds, negative_prompt_embeds_mask, output_type, return_dict, attention_kwargs, callback_on_step_end, callback_on_step_end_tensor_inputs, max_sequence_length)
566 )
567 do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
--> 568 prompt_embeds, prompt_embeds_mask = self.encode_prompt(
569 prompt=prompt,
570 prompt_embeds=prompt_embeds,

/usr/local/lib/python3.11/dist-packages/diffusers/pipelines/qwenimage/pipeline_qwenimage.py in encode_prompt(self, prompt, device, num_images_per_prompt, prompt_embeds, prompt_embeds_mask, max_sequence_length)
252
253 if prompt_embeds is None:
--> 254 prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device)
255
256 prompt_embeds = prompt_embeds[:, :max_sequence_length]

/usr/local/lib/python3.11/dist-packages/diffusers/pipelines/qwenimage/pipeline_qwenimage.py in _get_qwen_prompt_embeds(self, prompt, device, dtype)
203 txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt"
204 ).to(device)
--> 205 encoder_hidden_states = self.text_encoder(
206 input_ids=txt_tokens.input_ids,
207 attention_mask=txt_tokens.attention_mask,

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
1740
1741 # torchrec tests the code consistency with the following code

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1751
1752 result = None

/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
173 output = module._old_forward(*args, **kwargs)
174 else:
--> 175 output = module._old_forward(*args, **kwargs)
176 return module._hf_hook.post_forward(module, output)
177

/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py in wrapper(self, *args, **kwargs)
941
942 try:
--> 943 output = func(self, *args, **kwargs)
944 if is_requested_to_return_tuple or (is_configured_to_return_tuple and is_top_level_module):
945 output = output.to_tuple()

/usr/local/lib/python3.11/dist-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts, **kwargs)
1507 )
1508
-> 1509 outputs = self.model(
1510 input_ids=input_ids,
1511 pixel_values=pixel_values,

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
1740
1741 # torchrec tests the code consistency with the following code

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1751
1752 result = None

/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
173 output = module._old_forward(*args, **kwargs)
174 else:
--> 175 output = module._old_forward(*args, **kwargs)
176 return module._hf_hook.post_forward(module, output)
177

/usr/local/lib/python3.11/dist-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts, **kwargs)
1328 position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
1329
-> 1330 outputs = self.language_model(
1331 input_ids=None,
1332 position_ids=position_ids,

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
1740
1741 # torchrec tests the code consistency with the following code

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1751
1752 result = None

/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
173 output = module._old_forward(*args, **kwargs)
174 else:
--> 175 output = module._old_forward(*args, **kwargs)
176 return module._hf_hook.post_forward(module, output)
177

/usr/local/lib/python3.11/dist-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **kwargs)
918 all_hidden_states += (hidden_states,)
919
--> 920 layer_outputs = decoder_layer(
921 hidden_states,
922 attention_mask=causal_mask_mapping[decoder_layer.attention_type],

/usr/local/lib/python3.11/dist-packages/transformers/modeling_layers.py in __call__(self, *args, **kwargs)
81
82 return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
---> 83 return super().__call__(*args, **kwargs)

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
1740
1741 # torchrec tests the code consistency with the following code

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1751
1752 result = None

/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
173 output = module._old_forward(*args, **kwargs)
174 else:
--> 175 output = module._old_forward(*args, **kwargs)
176 return module._hf_hook.post_forward(module, output)
177

/usr/local/lib/python3.11/dist-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings, **kwargs)
775
776 # Self Attention
--> 777 hidden_states, self_attn_weights, present_key_value = self.self_attn(
778 hidden_states=hidden_states,
779 attention_mask=attention_mask,

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
1740
1741 # torchrec tests the code consistency with the following code

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1751
1752 result = None

/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
173 output = module._old_forward(*args, **kwargs)
174 else:
--> 175 output = module._old_forward(*args, **kwargs)
176 return module._hf_hook.post_forward(module, output)
177

/usr/local/lib/python3.11/dist-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings, **kwargs)
681 bsz, q_len, _ = hidden_states.size()
682
--> 683 query_states = self.q_proj(hidden_states)
684 key_states = self.k_proj(hidden_states)
685 value_states = self.v_proj(hidden_states)

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
1740
1741 # torchrec tests the code consistency with the following code

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1751
1752 result = None

/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
168
169 def new_forward(module, *args, **kwargs):
--> 170 args, kwargs = module._hf_hook.pre_forward(module, *args, **kwargs)
171 if module._hf_hook.no_grad:
172 with torch.no_grad():

/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py in pre_forward(self, module, *args, **kwargs)
358 self.tied_pointers_to_remove.add((value.data_ptr(), self.execution_device))
359
--> 360 set_module_tensor_to_device(
361 module,
362 name,

/usr/local/lib/python3.11/dist-packages/accelerate/utils/modeling.py in set_module_tensor_to_device(module, tensor_name, device, value, dtype, fp16_statistics, tied_params_map, non_blocking, clear_cache)
361 new_value.SCB = new_value.SCB.to("cpu")
362 else:
--> 363 new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(
364 device, non_blocking=non_blocking
365 )

/usr/local/lib/python3.11/dist-packages/bitsandbytes/nn/modules.py in to(self, *args, **kwargs)
338 else:
339 if self.quant_state is not None:
--> 340 self.quant_state.to(device)
341
342 new_param = Params4bit(

/usr/local/lib/python3.11/dist-packages/bitsandbytes/functional.py in to(self, device)
537 def to(self, device):
538 # make sure the quantization state is on the right device
--> 539 self.code = self.code.to(device)
540 self.absmax = self.absmax.to(device)
541 if self.nested:

NotImplementedError: Cannot copy out of meta tensor; no data!

@yiyixuxu @DN6
