# kandi_generate_video.py (forked from kandinskylab/kandinsky-5)
import argparse
import time
import warnings
import logging
import os
import tempfile
import sys
import numpy as np
import torch
from PIL import Image
# Early parse --no_compile to set the flag before importing kandinsky
def _early_parse_no_compile():
    return '--no_compile' in sys.argv
# Early parse SDNQ flags to enable optimal int8 performance BEFORE importing SDNQ
def _early_parse_sdnq_flags():
"""Parse SDNQ-related flags early to set env vars before module import.
CRITICAL: SDNQ module evaluates these environment variables at import time.
Setting them after import has no effect.
"""
use_sdnq = '--use_sdnq' in sys.argv
no_triton_mm = '--no_sdnq_triton_mm' in sys.argv
no_compile = '--no_sdnq_compile' in sys.argv
if use_sdnq:
# Handle Triton int8 matmul kernel setting
if no_triton_mm:
os.environ["SDNQ_USE_TRITON_MM"] = "0"
print("SDNQ: Using torch._int_mm (Triton MM disabled)")
elif os.environ.get("SDNQ_USE_TRITON_MM") is None:
# Enable Triton int8 matmul kernel for optimal tensor core utilization on CUDA
# Default SDNQ only enables Triton MM for RDNA2/ZLUDA, but it's faster on 4090/5090 too
os.environ["SDNQ_USE_TRITON_MM"] = "1"
print("SDNQ: Enabling Triton int8 matmul kernel for optimal CUDA performance")
# Handle torch.compile setting
if no_compile:
os.environ["SDNQ_USE_TORCH_COMPILE"] = "0"
print("SDNQ: torch.compile disabled for faster startup")
elif os.environ.get("SDNQ_USE_TORCH_COMPILE") is None:
# Enable torch.compile for SDNQ dequantization (if Triton is available)
os.environ["SDNQ_USE_TORCH_COMPILE"] = "1"
print("SDNQ: Enabling torch.compile for dequantization kernels")
# Must be called before any SDNQ-related imports
_early_parse_sdnq_flags()
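# Illustrative ordering (not executed here; "sdnq" is a hypothetical module name):
#   os.environ["SDNQ_USE_TRITON_MM"] = "1"   # must run first: read once, at import time
#   import sdnq                              # the env var is evaluated here
# Reversing the two lines would leave the kernel selection at its import-time default.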
# Set global compile flag before importing kandinsky modules
import kandinsky.models.compile_config as compile_config
_no_compile = _early_parse_no_compile()
compile_config.USE_TORCH_COMPILE = not _no_compile
if _no_compile:
print("torch.compile() disabled for faster startup")
from kandinsky import get_T2V_pipeline, get_I2V_pipeline, get_I2V_pipeline_with_block_swap, get_T2V_pipeline_with_block_swap, get_T2I_pipeline
from kandinsky.generation_utils import generate_sample_from_checkpoint, generate_sample_i2v_from_checkpoint, generate_sample_v2v, generate_sample_v2v_join
from kandinsky.i2v_pipeline import (
get_conditioning_frames_from_video,
get_conditioning_frames_from_two_videos,
get_conditioning_latents_from_two_images,
get_conditioning_video_and_image,
encode_video_to_latents,
Kandinsky5DenoisePipeline
)
from kandinsky.generation_utils import generate_sample_denoise
try:
from scripts.latentpreviewer import LatentPreviewer
except ImportError:
LatentPreviewer = None
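# LatentPreviewer is an optional dependency: if the import fails, --preview is
# expected to be a no-op and generation proceeds without intermediate previews.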
def disable_warnings():
warnings.filterwarnings("ignore")
logging.getLogger("torch").setLevel(logging.ERROR)
torch._logging.set_logs(
dynamo=logging.ERROR,
dynamic=logging.ERROR,
aot=logging.ERROR,
inductor=logging.ERROR,
guards=False,
recompiles=False
)
def resize_image_to_resolution(image_path, target_width, target_height, alignment=32):
"""
Resize image to target resolution while maintaining aspect ratio and ensuring
dimensions are multiples of alignment.
Args:
image_path: Path to the input image
target_width: Target width (will be rounded to alignment)
target_height: Target height (will be rounded to alignment)
alignment: Pixel alignment (32 for standard, 128 for NABLA)
Returns:
Path to the resized image (temporary file)
Note:
NABLA attention requires 128-pixel alignment due to fractal flattening.
Standard attention only requires 32-pixel alignment.
"""
try:
img = Image.open(image_path)
original_width, original_height = img.size
# Ensure target dimensions are multiples of alignment
target_width = (target_width // alignment) * alignment
target_height = (target_height // alignment) * alignment
target_width = max(alignment * 2, target_width) # Minimum 2x alignment
target_height = max(alignment * 2, target_height)
# Check if resizing is needed
if original_width == target_width and original_height == target_height:
print(f"Image already at target resolution: {target_width}x{target_height}")
return image_path
print(f"Resizing image from {original_width}x{original_height} to {target_width}x{target_height}")
# Resize the image
resized_img = img.resize((target_width, target_height), Image.LANCZOS)
# Save to temporary file
temp_dir = tempfile.gettempdir()
temp_filename = f"resized_input_{os.path.basename(image_path)}"
temp_path = os.path.join(temp_dir, temp_filename)
# Preserve the image format
resized_img.save(temp_path, format=img.format if img.format else 'PNG')
print(f"Resized image saved to: {temp_path}")
return temp_path
except Exception as e:
print(f"Error resizing image: {e}")
print(f"Using original image: {image_path}")
return image_path
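# Usage sketch (hypothetical path): NABLA runs want 128-pixel alignment, so
#   resized = resize_image_to_resolution("./input.jpg", 768, 512, alignment=128)
# keeps 768x512 as-is (both are multiples of 128) and LANCZOS-resizes anything else.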
def normalize_join_frames(video1, video2, num_frames):
"""
Normalize frames at the join point between two videos to reduce flash/discontinuity.
This function:
1. Computes color statistics (mean, std) for boundary regions
2. Applies color matching to align the second video's colors with the first
3. Applies cross-fade blending in the overlap region
Args:
video1: First video tensor [num_frames, H, W, C] (float, 0-255 range)
video2: Second video tensor [num_frames, H, W, C] (float, 0-255 range)
num_frames: Number of frames to blend at the boundary
Returns:
video1: Unchanged first video
video2: Color-matched and blended second video
"""
if num_frames <= 0:
return video1, video2
# Ensure we have enough frames
num_frames = min(num_frames, video1.shape[0], video2.shape[0])
# Get boundary regions for statistics
v1_end = video1[-num_frames:] # Last N frames of video1
v2_start = video2[:num_frames] # First N frames of video2
# Compute per-channel statistics for color matching
# Using the boundary frames to compute mean and std
v1_mean = v1_end.mean(dim=(0, 1, 2), keepdim=True) # [1, 1, 1, C]
v1_std = v1_end.std(dim=(0, 1, 2), keepdim=True) + 1e-6
v2_mean = v2_start.mean(dim=(0, 1, 2), keepdim=True)
v2_std = v2_start.std(dim=(0, 1, 2), keepdim=True) + 1e-6
# Apply color matching to entire video2
# Transform: (x - mean2) / std2 * std1 + mean1
video2_matched = (video2 - v2_mean) / v2_std * v1_std + v1_mean
video2_matched = video2_matched.clamp(0, 255)
# Apply cross-fade blending in the overlap region
# Modify the first num_frames of video2 to blend with end of video1
for i in range(num_frames):
        # Alpha ramps from num_frames/(num_frames+1) down to 1/(num_frames+1),
        # favoring video1 at the start of the overlap and video2 at the end
        alpha = 1.0 - (i + 1) / (num_frames + 1)
video2_matched[i] = alpha * video1[-(num_frames - i)] + (1 - alpha) * video2_matched[i]
print(f">>> Frame normalization: Applied color matching and {num_frames}-frame cross-fade")
return video1, video2_matched
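# Worked example: with num_frames=3 the loop uses alpha = 0.75, 0.50, 0.25 for
# i = 0, 1, 2, handing off gradually from video1's last frames to video2's first.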
def normalize_join_frames_triple(video1, middle, video2, num_frames):
"""
Normalize frames at both join points in a three-video concatenation.
Used for video join mode: video1 + middle + video2
Args:
video1: First video tensor [num_frames, H, W, C]
middle: Middle generated video tensor [num_frames, H, W, C]
video2: Second video tensor [num_frames, H, W, C]
num_frames: Number of frames to blend at each boundary
Returns:
video1, middle, video2: Normalized tensors with smooth transitions
"""
if num_frames <= 0:
return video1, middle, video2
# First junction: video1 -> middle
video1, middle = normalize_join_frames(video1, middle, num_frames)
# Second junction: middle -> video2
# For this, we need to normalize video2 to match middle's ending
middle, video2 = normalize_join_frames(middle, video2, num_frames)
return video1, middle, video2
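# Note: the second call runs on the already color-matched middle, so video2 is
# chained toward video1's palette rather than matched against the raw middle video.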
def parse_args():
parser = argparse.ArgumentParser(
description="Generate a video using Kandinsky 5"
)
parser.add_argument(
'--local-rank',
type=int,
        help='Local rank for distributed execution'
)
parser.add_argument(
"--config",
type=str,
default="./configs/config_5s_sft.yaml",
help="The config file of the model"
)
parser.add_argument(
"--prompt",
type=str,
default="The dragon soars into the sunset sky.",
help="The prompt to generate video"
)
parser.add_argument(
"--image",
type=str,
default="./assets/test_image.jpg",
help="The input image for image-to-video generation"
)
parser.add_argument(
"--end_image",
type=str,
default=None,
help="Ending image for image-to-video generation. When provided with --image, generates a video transitioning from the start image to the end image."
)
parser.add_argument(
"--video",
type=str,
default=None,
help="Input video for video continuation (overrides --image)"
)
parser.add_argument(
"--video2",
type=str,
default=None,
help="Second input video for video joining mode. When provided with --video, creates a transition between the two videos."
)
parser.add_argument(
"--num_cond_frames",
type=int,
default=4,
help="Number of last frames to use as conditioning for video continuation (or frames from each video in join mode)"
)
parser.add_argument(
"--normalize_frames",
type=int,
default=0,
help="Number of frames to blend at join points (0=disabled). Smoothly transitions color/brightness at video boundaries."
)
parser.add_argument(
"--negative_prompt",
type=str,
default="Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards",
help="Negative prompt for classifier-free guidance"
)
parser.add_argument(
"--clip_prompt",
type=str,
default=None,
help="Separate prompt for CLIP encoder (if not provided, uses main prompt)"
)
parser.add_argument(
"--width",
type=int,
default=768,
help="Width of the video in pixels"
)
parser.add_argument(
"--height",
type=int,
default=512,
help="Height of the video in pixels"
)
parser.add_argument(
"--video_duration",
type=int,
default=5,
help="Duratioin of the video in seconds"
)
parser.add_argument(
"--expand_prompt",
type=int,
default=1,
help="Whether to use prompt expansion."
)
parser.add_argument(
"--no_prompt_template",
action='store_true',
default=False,
help="Disable the system prompt template wrapper. By default, prompts are wrapped with instructions that tell Qwen to describe video details (camera movement, style, etc.). Use this flag to pass your prompt directly without the template."
)
parser.add_argument(
"--sample_steps",
type=int,
default=None,
help="The sampling steps number."
)
parser.add_argument(
"--guidance_weight",
type=float,
default=None,
help="Guidance weight."
)
parser.add_argument(
"--scheduler_scale",
type=float,
default=5.0,
help="Scheduler scale."
)
parser.add_argument(
"--output_filename",
type=str,
default="./test.mp4",
help="Name of the resulting file"
)
parser.add_argument(
"--seed",
type=int,
default=1137,
help="Seed for the random number generator"
)
parser.add_argument(
"--offload",
action='store_true',
default=False,
help="Offload models to save memory or not"
)
parser.add_argument(
"--magcache",
action='store_true',
default=False,
help="Using MagCache (for 50 steps models only)"
)
parser.add_argument(
"--qwen_quantization",
action='store_true',
default=False,
help="Use quantized Qwen2.5-VL model (4-bit quantization)"
)
parser.add_argument(
"--attention_engine",
type=str,
default="auto",
help="Name of the full attention algorithm to use for <=5 second generation",
choices=["flash_attention_2", "flash_attention_3", "sdpa", "sage", "auto"]
)
parser.add_argument(
"--enable_block_swap",
action='store_true',
default=False,
help="Enable block swapping for large models (e.g., 20B) to fit in limited VRAM"
)
parser.add_argument(
"--blocks_in_memory",
type=int,
default=6,
help="Number of transformer blocks to keep in GPU memory when using block swapping"
)
parser.add_argument(
"--dtype",
type=str,
default="bfloat16",
choices=["float32", "float16", "bfloat16", "fp8_scaled"],
help="Data type for model weights (default: bfloat16). Use bfloat16 for best memory efficiency with minimal quality loss. Use fp8_scaled for maximum memory savings (~50%% vs bf16). This sets all dtypes if specific ones are not provided."
)
parser.add_argument(
"--text_encoder_dtype",
type=str,
default=None,
choices=["float32", "float16", "bfloat16", "fp8_scaled"],
help="Data type specifically for text encoder. If not set, uses --dtype value."
)
parser.add_argument(
"--vae_dtype",
type=str,
default=None,
choices=["float32", "float16", "bfloat16", "fp8_scaled"],
help="Data type specifically for VAE. If not set, uses --dtype value."
)
parser.add_argument(
"--computation_dtype",
type=str,
default=None,
choices=["float32", "float16", "bfloat16", "fp8_scaled"],
help="Data type for activations/computations. If not set, uses --dtype value."
)
parser.add_argument(
"--use_mixed_weights",
action='store_true',
default=False,
help="Use mixed precision weights - preserve fp32 for critical layers (norms, embeddings) while using specified dtype for activations. Prevents dtype conversion errors."
)
parser.add_argument(
"--checkpoint_path",
type=str,
default=None,
help="Override DiT model checkpoint path from config. Provide path to your .safetensors file."
)
# INT8 quantization configuration (legacy)
parser.add_argument(
"--use_int8",
action='store_true',
default=False,
help="Use legacy INT8 quantization for linear layers. Consider --use_sdnq for better performance."
)
parser.add_argument(
"--int8_block_size",
type=int,
default=128,
help="Block size for legacy INT8 quantization (must be 128 for Triton kernels, default: 128)"
)
# SDNQ quantization configuration (recommended)
parser.add_argument(
"--use_sdnq",
action='store_true',
default=False,
help="Use SDNQ quantization with auto-tuned Triton kernels (20-40%% faster than legacy INT8)"
)
parser.add_argument(
"--sdnq_weights_dtype",
type=str,
default="int8",
choices=["int8", "fp8", "int4"],
help="SDNQ weight storage dtype (default: int8). int8=best balance, fp8=H100+, int4=experimental"
)
parser.add_argument(
"--sdnq_use_quantized_matmul",
action='store_true',
default=True,
help="Use accelerated quantized matmul (default: True). Disable for debugging."
)
parser.add_argument(
"--no_sdnq_quantized_matmul",
action='store_true',
default=False,
help="Disable SDNQ quantized matmul (forces dequantize+fp matmul path)"
)
parser.add_argument(
"--sdnq_triton_mm",
action='store_true',
default=True,
help="Use Triton int8 matmul kernel (default: True for CUDA). Faster on 4090/5090."
)
parser.add_argument(
"--no_sdnq_triton_mm",
action='store_true',
default=False,
help="Disable Triton int8 matmul kernel (fallback to torch._int_mm)"
)
parser.add_argument(
"--sdnq_compile",
action='store_true',
default=True,
help="Enable torch.compile for SDNQ kernels (default: True). Improves performance after warmup."
)
parser.add_argument(
"--no_sdnq_compile",
action='store_true',
default=False,
help="Disable torch.compile for SDNQ kernels (faster startup, slower inference)"
)
# NABLA sparse attention configuration
parser.add_argument(
"--attention_type",
type=str,
default=None,
choices=["auto", "flash", "nabla"],
help="Attention type: 'flash' for full attention, 'nabla' for sparse attention, 'auto' uses config default."
)
parser.add_argument(
"--nabla_P",
type=float,
default=0.9,
help="NABLA attention: Top-k probability threshold (default: 0.9)"
)
parser.add_argument(
"--nabla_wT",
type=int,
default=11,
help="NABLA attention: Temporal window size (default: 11 for 10s, 7 for 5s)"
)
parser.add_argument(
"--nabla_wW",
type=int,
default=3,
help="NABLA attention: Width window size (default: 3)"
)
parser.add_argument(
"--nabla_wH",
type=int,
default=3,
help="NABLA attention: Height window size (default: 3)"
)
parser.add_argument(
"--nabla_method",
type=str,
default="topcdf",
choices=["topcdf"],
help="NABLA attention: Selection method (default: topcdf)"
)
parser.add_argument(
"--nabla_add_sta",
action='store_true',
default=True,
help="NABLA attention: Add spatial-temporal attention (default: True)"
)
parser.add_argument(
"--preview",
type=int,
default=None,
metavar="N",
help="Enable latent preview every N steps. Generates previews in 'previews' subdirectory."
)
parser.add_argument(
"--preview_suffix",
type=str,
default=None,
help="Unique suffix for preview files to avoid conflicts in concurrent runs."
)
# APG (Adaptive Projected Guidance) for video continuation
parser.add_argument(
"--use_apg",
action='store_true',
default=False,
help="Enable Adaptive Projected Guidance to reduce color drift in video continuation"
)
parser.add_argument(
"--apg_momentum",
type=float,
default=-0.75,
help="Momentum for APG running average (default: -0.75)"
)
parser.add_argument(
"--apg_norm_threshold",
type=float,
default=55.0,
help="Norm threshold for APG guidance clipping (default: 55.0)"
)
# End frame blending for video join modes
parser.add_argument(
"--end_blend_weight",
type=float,
default=0.0,
help="Final blend weight for end frames in v2v join mode. 0.0 = use denoised result (smooth transition), 1.0 = use target latent (may cause jump). Default: 0.0"
)
# VAE temporal chunking configuration
parser.add_argument(
"--vae_temporal_tile_frames",
type=int,
default=None,
help="Temporal chunk size for VAE decode in pixel-space frames (default: 16). Lower values reduce memory usage. Recommended: 12 for moderate memory reduction, 8 for aggressive reduction. Must be divisible by 4."
)
parser.add_argument(
"--vae_temporal_stride_frames",
type=int,
default=None,
help="Temporal stride for VAE decode in pixel-space frames (default: tile_frames - 4). Controls overlap between chunks for smooth blending. If not specified, auto-calculated as tile_frames - 4."
)
parser.add_argument(
"--vae_spatial_tile_height",
type=int,
default=None,
help="Spatial tile height for VAE decode (default: 256). Lower values reduce memory usage but increase processing time."
)
parser.add_argument(
"--vae_spatial_tile_width",
type=int,
default=None,
help="Spatial tile width for VAE decode (default: 256). Lower values reduce memory usage but increase processing time."
)
parser.add_argument(
"--no_compile",
action='store_true',
default=False,
help="Disable torch.compile() for faster startup (2-5 minutes faster) at the cost of slower inference"
)
parser.add_argument(
"--resume_from",
type=str,
default=None,
help="Path to checkpoint file to resume generation from"
)
parser.add_argument(
"--save_latents",
type=str,
default=None,
help="Path to save latents before VAE decoding (e.g., latents.pt). Saves all info needed for later decoding."
)
parser.add_argument(
"--decode_from_file",
type=str,
default=None,
help="Path to load and decode previously saved latents. Skips generation and only runs VAE decoding."
)
# Video denoise mode
parser.add_argument(
"--denoise",
action='store_true',
default=False,
help="Enable video denoise mode. Applies light denoising to smooth video artifacts."
)
parser.add_argument(
"--denoise_strength",
type=float,
default=0.2,
help="Denoise strength (0.1-0.5 typical). Higher = more change. Default: 0.2"
)
# LoRA support
parser.add_argument(
"--lora_path",
type=str,
nargs="*",
default=None,
help="Path(s) to LoRA directories containing config_lora.json and lora.safetensors. Multiple LoRAs can be loaded (e.g., --lora_path ./lora1 ./lora2)"
)
parser.add_argument(
"--lora_weight",
type=float,
nargs="*",
default=None,
help="Weight(s) for each LoRA (0.0-1.0). Must match number of --lora_path entries. Default: 1.0 for each"
)
parser.add_argument(
"--lora_trigger",
type=str,
nargs="*",
default=None,
help="Override trigger word(s) for each LoRA. If not specified, auto-detected from LoRA metadata"
)
# UltraViCo: Attention decay for long video extrapolation
parser.add_argument(
"--ultravico",
action='store_true',
default=False,
help="Enable UltraViCo attention decay for long video generation. Helps prevent quality degradation and content repetition when generating videos longer than training length."
)
parser.add_argument(
"--ultravico_alpha",
type=float,
default=0.9,
help="UltraViCo: Decay factor for out-of-window attention (0.85-0.95 recommended). Lower = stronger decay. Default: 0.9"
)
parser.add_argument(
"--ultravico_training_frames",
type=int,
default=None,
help="UltraViCo: Training window in latent frames. Auto-detected from config if not set (5s=31, 10s=61)."
)
parser.add_argument(
"--ultravico_suppress_harmonics",
action='store_true',
default=False,
help="UltraViCo: Enable stronger suppression at harmonic positions. Use if you see content repetition/looping."
)
parser.add_argument(
"--ultravico_beta",
type=float,
default=0.6,
help="UltraViCo: Decay factor for harmonic risk positions (only with --ultravico_suppress_harmonics). Default: 0.6"
)
args = parser.parse_args()
return args
if __name__ == "__main__":
disable_warnings()
args = parse_args()
# Log prompt template status
if args.no_prompt_template:
print(">>> Prompt template DISABLED - using raw prompts without system instruction wrapper")
# Convert string dtype to torch dtype
dtype_map = {
"float32": torch.float32,
"float16": torch.float16,
"bfloat16": torch.bfloat16,
"fp8_scaled": torch.bfloat16, # FP8 uses bfloat16 as compute dtype
}
# Track which components should use FP8
use_fp8 = args.dtype == "fp8_scaled"
use_fp8_text_encoder = args.text_encoder_dtype == "fp8_scaled" if args.text_encoder_dtype else use_fp8
use_fp8_vae = args.vae_dtype == "fp8_scaled" if args.vae_dtype else use_fp8
use_fp8_computation = args.computation_dtype == "fp8_scaled" if args.computation_dtype else use_fp8
# SDNQ quantization settings
use_sdnq = args.use_sdnq
sdnq_weights_dtype = args.sdnq_weights_dtype
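    # store_true flags cannot unset a True default, so the effective setting is
    # "on unless the paired --no_sdnq_quantized_matmul flag is passed".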
sdnq_use_quantized_matmul = args.sdnq_use_quantized_matmul and not args.no_sdnq_quantized_matmul
# If SDNQ is enabled, disable legacy INT8 and FP8 (they are mutually exclusive)
if use_sdnq:
if args.use_int8:
print("Note: --use_sdnq takes priority over --use_int8. Legacy INT8 disabled.")
if use_fp8_computation:
print("Note: --use_sdnq takes priority over fp8_scaled. Legacy FP8 disabled.")
use_fp8_computation = False
model_dtype = dtype_map[args.dtype]
# Set individual component dtypes (fall back to model_dtype if not specified)
text_encoder_dtype = dtype_map[args.text_encoder_dtype] if args.text_encoder_dtype else model_dtype
vae_dtype = dtype_map[args.vae_dtype] if args.vae_dtype else model_dtype
computation_dtype = dtype_map[args.computation_dtype] if args.computation_dtype else model_dtype
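    # Example: "--dtype bfloat16 --vae_dtype float16" yields a bf16 DiT and text
    # encoder with an fp16 VAE; any component dtype left unset inherits --dtype.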
# Build attention config override if attention_type is specified
attention_config = None
if args.attention_type and args.attention_type != "auto":
attention_config = {
"type": args.attention_type,
"causal": False,
"local": False,
"glob": False,
"window": 3,
}
if args.attention_type == "nabla":
attention_config.update({
"P": args.nabla_P,
"wT": args.nabla_wT,
"wW": args.nabla_wW,
"wH": args.nabla_wH,
"add_sta": args.nabla_add_sta,
"method": args.nabla_method,
})
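    # With "--attention_type nabla" and default flags, attention_config resembles:
    #   {"type": "nabla", "causal": False, "local": False, "glob": False, "window": 3,
    #    "P": 0.9, "wT": 11, "wW": 3, "wH": 3, "add_sta": True, "method": "topcdf"}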
# Initialize UltraViCo if enabled (for long video extrapolation)
if args.ultravico:
from kandinsky.models.ultravico import UltraViCoConfig, set_ultravico_config
# Auto-detect training frames from config name if not specified
training_frames = args.ultravico_training_frames
if training_frames is None:
if "10s" in args.config:
training_frames = 61 # 10s = 61 latent frames
else:
training_frames = 31 # 5s = 31 latent frames (default)
ultravico_config = UltraViCoConfig(
enabled=True,
training_frames=training_frames,
alpha=args.ultravico_alpha,
beta=args.ultravico_beta,
suppress_harmonics=args.ultravico_suppress_harmonics,
gamma=4,
)
set_ultravico_config(ultravico_config)
print(f"UltraViCo enabled: training_frames={training_frames}, alpha={args.ultravico_alpha}, "
f"suppress_harmonics={args.ultravico_suppress_harmonics}")
# Determine model type from config filename
is_t2i = "t2i" in args.config.lower()
is_i2v = "i2v" in args.config.lower()
is_t2v_pro = "t2v" in args.config.lower() and ("pro" in args.config.lower() or "20b" in args.config.lower())
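    # e.g. a config named "config_5s_i2v.yaml" selects the I2V branch below, while
    # "config_10s_t2v_pro.yaml" also sets is_t2v_pro (file names are illustrative).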
if is_t2i:
# Use T2I pipeline for text-to-image generation
pipe = get_T2I_pipeline(
device_map={"dit": "cuda:0", "vae": "cuda:0",
"text_embedder": "cuda:0"},
conf_path=args.config,
offload=args.offload,
magcache=args.magcache,
quantized_qwen=args.qwen_quantization,
attention_engine=args.attention_engine,
)
elif is_i2v:
if args.enable_block_swap:
# Use block swapping pipeline for large I2V models
pipe = get_I2V_pipeline_with_block_swap(
device_map={"dit": "cuda:0", "vae": "cuda:0",
"text_embedder": "cuda:0"},
conf_path=args.config,
checkpoint_path_override=args.checkpoint_path,
attention_config_override=attention_config,
offload=args.offload,
magcache=args.magcache,
quantized_qwen=args.qwen_quantization,
attention_engine=args.attention_engine,
blocks_in_memory=args.blocks_in_memory,
enable_block_swap=True,
dtype=model_dtype,
use_mixed_weights=args.use_mixed_weights,
text_encoder_dtype=text_encoder_dtype,
vae_dtype=vae_dtype,
computation_dtype=computation_dtype,
use_int8=args.use_int8,
int8_block_size=args.int8_block_size,
use_fp8=use_fp8_computation,
use_fp8_text_encoder=use_fp8_text_encoder,
use_sdnq=use_sdnq,
sdnq_weights_dtype=sdnq_weights_dtype,
sdnq_use_quantized_matmul=sdnq_use_quantized_matmul,
vae_temporal_tile_frames=args.vae_temporal_tile_frames,
vae_temporal_stride_frames=args.vae_temporal_stride_frames,
vae_spatial_tile_height=args.vae_spatial_tile_height,
vae_spatial_tile_width=args.vae_spatial_tile_width,
)
else:
# Use standard I2V pipeline
pipe = get_I2V_pipeline(
device_map={"dit": "cuda:0", "vae": "cuda:0",
"text_embedder": "cuda:0"},
conf_path=args.config,
checkpoint_path_override=args.checkpoint_path,
attention_config_override=attention_config,
offload=args.offload,
magcache=args.magcache,
quantized_qwen=args.qwen_quantization,
attention_engine=args.attention_engine,
dtype=model_dtype,
use_mixed_weights=args.use_mixed_weights,
text_encoder_dtype=text_encoder_dtype,
vae_dtype=vae_dtype,
computation_dtype=computation_dtype,
use_int8=args.use_int8,
int8_block_size=args.int8_block_size,
use_fp8=use_fp8_computation,
use_fp8_text_encoder=use_fp8_text_encoder,
use_sdnq=use_sdnq,
sdnq_weights_dtype=sdnq_weights_dtype,
sdnq_use_quantized_matmul=sdnq_use_quantized_matmul,
vae_temporal_tile_frames=args.vae_temporal_tile_frames,
vae_temporal_stride_frames=args.vae_temporal_stride_frames,
vae_spatial_tile_height=args.vae_spatial_tile_height,
vae_spatial_tile_width=args.vae_spatial_tile_width,
)
else: # T2V
if is_t2v_pro and args.enable_block_swap:
# Use block swapping pipeline for T2V Pro (20B model)
pipe = get_T2V_pipeline_with_block_swap(
device_map={"dit": "cuda:0", "vae": "cuda:0",
"text_embedder": "cuda:0"},
resolution=512,
conf_path=args.config,
checkpoint_path_override=args.checkpoint_path,
attention_config_override=attention_config,
offload=args.offload,
magcache=args.magcache,
quantized_qwen=args.qwen_quantization,
attention_engine=args.attention_engine,
blocks_in_memory=args.blocks_in_memory,
enable_block_swap=True,
dtype=model_dtype,
use_mixed_weights=args.use_mixed_weights,
text_encoder_dtype=text_encoder_dtype,
vae_dtype=vae_dtype,
computation_dtype=computation_dtype,
use_int8=args.use_int8,
int8_block_size=args.int8_block_size,
use_fp8=use_fp8_computation,
use_fp8_text_encoder=use_fp8_text_encoder,
use_sdnq=use_sdnq,
sdnq_weights_dtype=sdnq_weights_dtype,
sdnq_use_quantized_matmul=sdnq_use_quantized_matmul,
vae_temporal_tile_frames=args.vae_temporal_tile_frames,
vae_temporal_stride_frames=args.vae_temporal_stride_frames,
vae_spatial_tile_height=args.vae_spatial_tile_height,
vae_spatial_tile_width=args.vae_spatial_tile_width,
)
else:
# Use standard T2V pipeline
pipe = get_T2V_pipeline(
device_map={"dit": "cuda:0", "vae": "cuda:0",
"text_embedder": "cuda:0"},
conf_path=args.config,
checkpoint_path_override=args.checkpoint_path,
attention_config_override=attention_config,
offload=args.offload,
magcache=args.magcache,
quantized_qwen=args.qwen_quantization,
attention_engine=args.attention_engine,
dtype=model_dtype,
use_mixed_weights=args.use_mixed_weights,
text_encoder_dtype=text_encoder_dtype,
vae_dtype=vae_dtype,
computation_dtype=computation_dtype,
use_int8=args.use_int8,
int8_block_size=args.int8_block_size,
use_fp8=use_fp8_computation,
use_fp8_text_encoder=use_fp8_text_encoder,
use_sdnq=use_sdnq,
sdnq_weights_dtype=sdnq_weights_dtype,
sdnq_use_quantized_matmul=sdnq_use_quantized_matmul,
vae_temporal_tile_frames=args.vae_temporal_tile_frames,
vae_temporal_stride_frames=args.vae_temporal_stride_frames,
vae_spatial_tile_height=args.vae_spatial_tile_height,
vae_spatial_tile_width=args.vae_spatial_tile_width,
)
# Load LoRA adapters if specified
if args.lora_path is not None and len(args.lora_path) > 0:
print(f"\n>>> Loading {len(args.lora_path)} LoRA adapter(s)...")
# Set default weights if not specified
lora_weights = args.lora_weight if args.lora_weight else [1.0] * len(args.lora_path)
if len(lora_weights) != len(args.lora_path):
raise ValueError(f"Number of --lora_weight ({len(lora_weights)}) must match --lora_path ({len(args.lora_path)})")
# Set default triggers if not specified
lora_triggers = args.lora_trigger if args.lora_trigger else [None] * len(args.lora_path)
if len(lora_triggers) != len(args.lora_path):
raise ValueError(f"Number of --lora_trigger ({len(lora_triggers)}) must match --lora_path ({len(args.lora_path)})")
for i, lora_item in enumerate(args.lora_path):
# Detect LoRA format: folder (official PEFT) or single file (musubi tuner)
if os.path.isdir(lora_item):
# Official K5 LoRA format: folder with config_lora.json + lora.safetensors
config_path = os.path.join(lora_item, "config_lora.json")
weights_path = os.path.join(lora_item, "lora.safetensors")
if not os.path.exists(config_path):
raise FileNotFoundError(f"LoRA config not found: {config_path}")
if not os.path.exists(weights_path):
raise FileNotFoundError(f"LoRA weights not found: {weights_path}")
adapter_name = os.path.basename(lora_item) or f"lora_{i}"
print(f">>> Loading LoRA {i+1}/{len(args.lora_path)} (PEFT format): {lora_item}")
print(f" Adapter name: {adapter_name}, Weight: {lora_weights[i]}")
pipe.load_adapter(
adapter_config=config_path,
adapter_path=weights_path,
adapter_name=adapter_name,
trigger=lora_triggers[i]
)
if lora_weights[i] != 1.0:
print(f" Note: LoRA weight {lora_weights[i]} specified but PEFT adapter system uses full weight.")
elif os.path.isfile(lora_item) and lora_item.endswith(".safetensors"):
# Musubi tuner format: single .safetensors file
print(f">>> Loading LoRA {i+1}/{len(args.lora_path)} (musubi format): {lora_item}")
print(f" Weight: {lora_weights[i]}")
pipe.load_musubi_lora(
lora_path=lora_item,
multiplier=lora_weights[i],
trigger=lora_triggers[i]
)
else:
raise ValueError(f"Invalid LoRA path: {lora_item}. Must be a directory or .safetensors file.")
print(f">>> All LoRA adapters loaded successfully\n")
if args.output_filename is None:
# Determine file extension based on generation mode
if is_t2i:
ext = ".png"
else:
ext = ".mp4"
args.output_filename = "./" + args.prompt.replace(" ", "_") + ext
# Set up file-based signal checking for early stop