model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False
          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
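
A minimal sketch of how a config in this format is typically consumed, assuming the `sgm` package from the generative-models repo is installed and exposes `sgm.util.instantiate_from_config` alongside OmegaConf; the YAML path below is a placeholder, not a path taken from this document.

```python
import torch
from omegaconf import OmegaConf
from sgm.util import instantiate_from_config  # assumed helper, as in the sgm/ldm codebases

# Hypothetical path to a file containing the YAML above.
config = OmegaConf.load("configs/inference/sd_xl_base.yaml")

# `target` names the class to build (here sgm.models.diffusion.DiffusionEngine)
# and `params` is passed to its constructor; nested configs such as
# network_config, conditioner_config, and first_stage_config are expected to be
# instantiated inside that constructor. Weights are randomly initialized unless
# a checkpoint is loaded separately.
model = instantiate_from_config(config.model)
model = model.eval()

if torch.cuda.is_available():
    model = model.half().cuda()
```
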