Commit 7010bb4

akleine and leejet authored
feat: support for SDXS-512 model (#1180)
* feat: add U-Net specials of SDXS
* docs: update distilled_sd.md for SDXS-512
* feat: for SDXS use AutoencoderTiny as the primary VAE
* docs: update distilled_sd.md for SDXS-512
* fix: SDXS code cleaning after review by stduhpf
* format code
* fix sdxs with --taesd-preview-only

Co-authored-by: leejet <leejet714@gmail.com>
1 parent 48d3161 commit 7010bb4

File tree: 6 files changed (+70, -12 lines changed)


docs/distilled_sd.md

Lines changed: 29 additions & 1 deletion
```diff
@@ -83,7 +83,7 @@ python convert_diffusers_to_original_stable_diffusion.py \
 The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
 
 
-### Another available .ckpt file:
+##### Another available .ckpt file:
 
 * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
 
```
````diff
@@ -97,3 +97,31 @@ for key, value in ckpt['state_dict'].items():
     ckpt['state_dict'][key] = value.contiguous()
 torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
 ```
+
+
+### SDXS-512
+
+Another very tiny and **incredibly fast** model is SDXS by IDKiro et al., which the authors describe as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed further blocks from the U-Net and, unlike other SD1 models, SDXS uses an adjusted _AutoencoderTiny_ instead of the default _AutoencoderKL_ for the VAE.
+
+##### 1. Download the diffusers model from Hugging Face using Python:
+
+```python
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
+pipe.save_pretrained(save_directory="sdxs")
+```
````
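A quick sanity check (not part of the commit): the downloaded pipeline should expose the tiny VAE rather than the usual KL autoencoder. Assuming the standard diffusers pipeline attributes:

```python
# Sanity check: SDXS ships AutoencoderTiny as its VAE (pipe.vae is the
# standard diffusers attribute; AutoencoderTiny is the expected class name).
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
print(type(pipe.vae).__name__)  # expected: AutoencoderTiny
```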
The new section continues with the conversion step:

````diff
+##### 2. Create a safetensors file
+
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
+```
````
````diff
+
+##### 3. Run the model as follows:
+
+```bash
+~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
+--cfg-scale 1 --steps 1
+```
+
+Both options `--cfg-scale 1` and `--steps 1` are mandatory here: SDXS is distilled into a one-step model, so multi-step sampling and classifier-free guidance are not supported.
````
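For comparison, the same one-step constraint applies when running SDXS through diffusers directly. A sketch (not part of the commit; the `num_inference_steps=1` and `guidance_scale=0` settings follow the SDXS model card):

```python
# Rough diffusers equivalent of the sd-cli invocation above.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "IDKiro/sdxs-512-dreamshaper", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    "portrait of a lovely cat",
    num_inference_steps=1,  # SDXS is a one-step model
    guidance_scale=0,       # disable classifier-free guidance
).images[0]
image.save("cat.png")
```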

model.cpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -1038,6 +1038,7 @@ SDVersion ModelLoader::get_sd_version() {
     int64_t patch_embedding_channels = 0;
     bool has_img_emb = false;
     bool has_middle_block_1 = false;
+    bool has_output_block_71 = false;
 
     for (auto& [name, tensor_storage] : tensor_storage_map) {
         if (!(is_xl)) {
@@ -1094,6 +1095,9 @@ SDVersion ModelLoader::get_sd_version() {
             tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) {
             has_middle_block_1 = true;
         }
+        if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos) {
+            has_output_block_71 = true;
+        }
         if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
             tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
             tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@@ -1155,6 +1159,9 @@ SDVersion ModelLoader::get_sd_version() {
         return VERSION_SD1_PIX2PIX;
     }
     if (!has_middle_block_1) {
+        if (!has_output_block_71) {
+            return VERSION_SDXS;
+        }
         return VERSION_SD1_TINY_UNET;
     }
     return VERSION_SD1;
```
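The heuristic reads: a checkpoint with a tiny U-Net (no middle block 1) that is also missing `output_blocks.7.1` is classified as SDXS. A minimal Python pre-check mirroring that logic (a sketch, not code from the repo; it matches only the name patterns visible in this hunk, while the real loader checks more naming variants):

```python
# Scan tensor names in a .safetensors checkpoint and report whether the
# SDXS branch above would be taken (assumption: the original-format
# middle-block tensor names contain "middle_block.1.").
from safetensors import safe_open

def looks_like_sdxs(path: str) -> bool:
    has_middle_block_1 = False
    has_output_block_71 = False
    with safe_open(path, framework="pt") as f:
        for name in f.keys():
            if "unet.mid_block.resnets.1." in name or "middle_block.1." in name:
                has_middle_block_1 = True
            if "model.diffusion_model.output_blocks.7.1" in name:
                has_output_block_71 = True
    return not has_middle_block_1 and not has_output_block_71

print(looks_like_sdxs("sdxs.safetensors"))  # expected: True for SDXS-512
```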

model.h

Lines changed: 2 additions & 1 deletion
```diff
@@ -28,6 +28,7 @@ enum SDVersion {
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SD2_TINY_UNET,
+    VERSION_SDXS,
     VERSION_SDXL,
     VERSION_SDXL_INPAINT,
     VERSION_SDXL_PIX2PIX,
@@ -50,7 +51,7 @@ enum SDVersion {
 };
 
 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
        return true;
    }
    return false;
```

stable-diffusion.cpp

Lines changed: 18 additions & 8 deletions
```diff
@@ -31,6 +31,7 @@ const char* model_version_to_str[] = {
     "SD 2.x",
     "SD 2.x Inpaint",
     "SD 2.x Tiny UNet",
+    "SDXS",
     "SDXL",
     "SDXL Inpaint",
     "SDXL Instruct-Pix2Pix",
@@ -407,6 +408,11 @@ class StableDiffusionGGML {
             vae_decode_only = false;
         }
 
+        bool tae_preview_only = sd_ctx_params->tae_preview_only;
+        if (version == VERSION_SDXS) {
+            tae_preview_only = false;
+        }
+
         if (sd_ctx_params->circular_x || sd_ctx_params->circular_y) {
             LOG_INFO("Using circular padding for convolutions");
         }
@@ -591,7 +597,7 @@ class StableDiffusionGGML {
             vae_backend = backend;
         }
 
-        if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
+        if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
             if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
                 first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
                                                                         offload_params_to_cpu,
@@ -629,8 +635,7 @@ class StableDiffusionGGML {
                 first_stage_model->get_param_tensors(tensors, "first_stage_model");
             }
         }
-
-        if (use_tiny_autoencoder) {
+        if (use_tiny_autoencoder || version == VERSION_SDXS) {
            if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
                tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
                                                                         offload_params_to_cpu,
@@ -645,6 +650,10 @@ class StableDiffusionGGML {
                                                                         "decoder.layers",
                                                                         vae_decode_only,
                                                                         version);
+                if (version == VERSION_SDXS) {
+                    tae_first_stage->alloc_params_buffer();
+                    tae_first_stage->get_param_tensors(tensors, "first_stage_model");
+                }
            }
            if (sd_ctx_params->vae_conv_direct) {
                LOG_INFO("Using Conv2d direct in the tae model");
@@ -782,14 +791,15 @@ class StableDiffusionGGML {
            unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
        }
        size_t vae_params_mem_size = 0;
-        if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
+        if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
            vae_params_mem_size = first_stage_model->get_params_buffer_size();
        }
-        if (use_tiny_autoencoder) {
-            if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
+        if (use_tiny_autoencoder || version == VERSION_SDXS) {
+            if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path, n_threads)) {
                return false;
            }
-            vae_params_mem_size = tae_first_stage->get_params_buffer_size();
+            use_tiny_autoencoder = true;  // now the processing is identical for VERSION_SDXS
+            vae_params_mem_size = tae_first_stage->get_params_buffer_size();
        }
        size_t control_net_params_mem_size = 0;
        if (control_net) {
@@ -945,7 +955,7 @@ class StableDiffusionGGML {
        }
 
        ggml_free(ctx);
-        use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
+        use_tiny_autoencoder = use_tiny_autoencoder && !tae_preview_only;
        return true;
    }
```
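Untangling the conditionals: after this change SDXS always decodes through the tiny autoencoder, its weights are loaded from the checkpoint itself rather than from a `--taesd` file, and `tae_preview_only` is forced off because SDXS has no full VAE to fall back to. A condensed paraphrase (a sketch with hypothetical helper names, not code from the repo; WAN/Qwen branches omitted):

```python
# Which first-stage models get built, per the hunks above.
def vae_selection(use_tiny_autoencoder: bool, is_sdxs: bool, tae_preview_only: bool):
    if is_sdxs:
        tae_preview_only = False  # SDXS has no full VAE to preview against
    build_full_vae = not (use_tiny_autoencoder or is_sdxs) or tae_preview_only
    build_tae = use_tiny_autoencoder or is_sdxs
    return build_full_vae, build_tae

# (use_tiny_autoencoder, is_sdxs, tae_preview_only) -> (full VAE, TAE)
for flags in [(False, False, False), (True, False, False),
              (True, False, True), (False, True, False)]:
    print(flags, "->", vae_selection(*flags))
```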

tae.hpp

Lines changed: 10 additions & 1 deletion
```diff
@@ -505,7 +505,8 @@ struct TinyAutoEncoder : public GGMLRunner {
                          struct ggml_tensor** output,
                          struct ggml_context* output_ctx = nullptr) = 0;
 
-    virtual bool load_from_file(const std::string& file_path, int n_threads) = 0;
+    virtual bool load_from_file(const std::string& file_path, int n_threads)                                      = 0;
+    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
 };
 
 struct TinyImageAutoEncoder : public TinyAutoEncoder {
@@ -555,6 +556,10 @@ struct TinyImageAutoEncoder : public TinyAutoEncoder {
         return success;
     }
 
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        taesd.get_param_tensors(tensors, prefix);
+    }
+
     struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
         z = to_backend(z);
@@ -624,6 +629,10 @@ struct TinyVideoAutoEncoder : public TinyAutoEncoder {
         return success;
     }
 
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        taehv.get_param_tensors(tensors, prefix);
+    }
+
     struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
         z = to_backend(z);
```

unet.hpp

Lines changed: 4 additions & 1 deletion
```diff
@@ -215,10 +215,13 @@ class UnetModelBlock : public GGMLBlock {
         } else if (sd_version_is_unet_edit(version)) {
             in_channels = 8;
         }
-        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) {
+        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
             num_res_blocks = 1;
             channel_mult = {1, 2, 4};
             tiny_unet = true;
+            if (version == VERSION_SDXS) {
+                attention_resolutions = {4, 2};  // here just like SDXL
+            }
         }
 
         // dims is always 2
```
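The comment "here just like SDXL" refers to where attention lives: with `attention_resolutions = {4, 2}`, attention blocks appear only at downsample factors 4 and 2, skipping the full-resolution level, as in SDXL. A sketch of the conventional interpretation of this list in SD U-Nets (an assumption about the surrounding code, not an excerpt from unet.hpp):

```python
# Attention is inserted at a U-Net level when the current downsample
# factor ds is listed in attention_resolutions (conventional SD behavior).
channel_mult = [1, 2, 4]
attention_resolutions = [4, 2]  # SDXS, matching SDXL

ds = 1
for level, mult in enumerate(channel_mult):
    attn = "yes" if ds in attention_resolutions else "no"
    print(f"level {level} (channels x{mult}): ds={ds}, attention={attn}")
    if level != len(channel_mult) - 1:
        ds *= 2  # downsampling between levels
# -> level 0: no attention; levels 1 and 2 (ds=2, ds=4): attention
```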
