[Feature] Support ggml backend detection #1127

@wh-kuromai

Feature Summary

Support ggml backend detection

Detailed Description

This feature proposes adding CMake build options to build the shared library (stable-diffusion.dll) against ggml as a shared library rather than statically linking it, and to select the ggml backend at runtime instead of at compile time.

Proposed Build Options

  • SD_USE_SHARED_GGML

    • Links ggml as a shared library instead of statically linking it
  • SD_ALL

    • Enables all supported ggml backends
    • Automatically loads all backends via ggml_backend_load_all()
    • Selects the best available backend at runtime using ggml_backend_init_best()
    • Backend-specific behavior (e.g. CUDA / Vulkan / CPU) is selected at runtime rather than at compile time (see the sketch after this list)
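
For context, here is a minimal sketch of the device enumeration that ggml_backend_load_all() makes possible at runtime. It is illustrative only (not part of the proposed patch) and assumes the ggml device API from ggml-backend.h (ggml_backend_dev_count(), ggml_backend_dev_get(), ggml_backend_dev_name(), ggml_backend_dev_description()):

```cpp
// Illustrative only (not part of the proposed patch): list the devices that
// ggml_backend_load_all() makes visible, then pick one the same way the patch does.
#include <cstdio>
#include "ggml-backend.h"

int main() {
    ggml_backend_load_all();  // load all available backends (CUDA, Vulkan, CPU, ...)

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s (%s)\n", i,
               ggml_backend_dev_name(dev),
               ggml_backend_dev_description(dev));
    }

    // Same call the patch uses: select the best available backend at runtime.
    ggml_backend_t backend = ggml_backend_init_best();
    printf("selected: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
```
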
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ea1c47..5bc17a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
 # general
 #option(SD_BUILD_TESTS                "sd: build tests"    ${SD_STANDALONE})
 option(SD_BUILD_EXAMPLES             "sd: build examples" ${SD_STANDALONE})
+option(SD_ALL                        "sd: all backends" OFF)
 option(SD_CUDA                       "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
@@ -35,8 +36,18 @@ option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (
 option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
 option(SD_BUILD_SHARED_GGML_LIB      "sd: build ggml as a separate shared lib" OFF)
 option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
+option(SD_USE_SHARED_GGML            "sd: use GGML shared library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)
 
+if(SD_ALL)
+    message("-- Use all backend stable-diffusion")
+    set(GGML_CUDA ON)
+    set(SD_BUILD_SHARED_LIBS ON)
+    set(SD_BUILD_SHARED_GGML_LIB ON)
+    set(SD_USE_SHARED_GGML ON)
+    add_definitions(-DSD_USE_ALL)
+endif()
+
 if(SD_CUDA)
     message("-- Use CUDA as backend stable-diffusion")
     set(GGML_CUDA ON)
@@ -169,15 +180,25 @@ if (NOT TARGET ggml)
             message(FATAL_ERROR "System-installed GGML library not found.")
         endif()
         add_library(ggml ALIAS ggml::ggml)
-    else()
+    elseif(NOT SD_USE_SHARED_GGML)
         add_subdirectory(ggml)
     endif()
 endif()
 
 add_subdirectory(thirdparty)
 
-target_link_libraries(${SD_LIB} PUBLIC ggml zip)
-target_include_directories(${SD_LIB} PUBLIC . thirdparty)
+
+
+if(SD_USE_SHARED_GGML)
+    target_link_libraries(${SD_LIB} PRIVATE ggml zip)
+    target_include_directories(${SD_LIB} 
+      PUBLIC .
+      PRIVATE ./ggml/include thirdparty)
+else()
+    target_link_libraries(${SD_LIB} PUBLIC ggml zip)
+    target_include_directories(${SD_LIB} PUBLIC . thirdparty)
+endif()
+
 target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
 
 
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index fcaa92c..cf9b13f 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1377,7 +1377,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm(struct ggml_context* c
 }
 
 __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor* tensor, void* data, size_t offset, size_t size) {
-#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL)
+#if defined(SD_USE_ALL) || defined(SD_USE_CUDA) || defined(SD_USE_SYCL)
     if (!ggml_backend_is_cpu(backend)) {
         ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
         ggml_backend_synchronize(backend);
diff --git a/qwen_image.hpp b/qwen_image.hpp
index eeb823d..133e94c 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -99,6 +99,12 @@ namespace Qwen {
 #ifdef SD_USE_VULKAN
             force_prec_f32 = true;
 #endif
+
+#ifdef SD_USE_ALL
+            if (backend_dev_type == DEV_TYPE_VULKAN) {
+                force_prec_f32 = true;
+            }
+#endif
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example when using CUDA but the weights are k-quants (not all prompts).
             blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 17fe3fd..8ffee39 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -64,6 +64,8 @@ const char* sampling_methods_str[] = {
     "TCD",
 };
 
+dev_type_t backend_dev_type = DEV_TYPE_OTHER;
+
 /*================================================== Helper Functions ================================================*/
 
 void calculate_alphas_cumprod(float* alphas_cumprod,
@@ -155,6 +157,15 @@ public:
     }
 
     void init_backend() {
+#ifdef SD_USE_ALL
+        ggml_backend_load_all();
+        backend = ggml_backend_init_best();
+        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+        const char * name = ggml_backend_dev_name(dev);
+        if(name != NULL && strstr(name, "Vulkan")) {
+            backend_dev_type = DEV_TYPE_VULKAN;
+        }
+#endif
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index adb65a1..572ce70 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -142,6 +142,14 @@ enum lora_apply_mode_t {
     LORA_APPLY_MODE_COUNT,
 };
 
+
+enum dev_type_t {
+    DEV_TYPE_VULKAN,
+    DEV_TYPE_OTHER,
+};
+
+extern enum dev_type_t backend_dev_type;
+
 typedef struct {
     bool enabled;
     int tile_size_x;
diff --git a/upscaler.cpp b/upscaler.cpp
index 29ac981..bfe2ad7 100644
--- a/upscaler.cpp
+++ b/upscaler.cpp
@@ -24,6 +24,16 @@ struct UpscalerGGML {
                         bool offload_params_to_cpu,
                         int n_threads) {
         ggml_log_set(ggml_log_callback_default, nullptr);
+#ifdef SD_USE_ALL
+        ggml_backend_load_all();
+        backend = ggml_backend_init_best();
+        LOG_DEBUG("Using backend = %s", ggml_backend_name(backend));
+        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+        const char * name = ggml_backend_dev_name(dev);
+        if(name != NULL && strstr(name, "Vulkan")) {
+            backend_dev_type = DEV_TYPE_VULKAN;
+        }
+#endif
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
diff --git a/z_image.hpp b/z_image.hpp
index bc554f1..86c907f 100644
--- a/z_image.hpp
+++ b/z_image.hpp
@@ -96,6 +96,13 @@ namespace ZImage {
 #ifdef SD_USE_VULKAN
             force_prec_f32 = true;
 #endif
+
+#ifdef SD_USE_ALL
+            if (backend_dev_type == DEV_TYPE_VULKAN) {
+                force_prec_f32 = true;
+            }
+#endif
+
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example, when using CUDA but the weights are k-quants.
             blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);

Alternatives you considered

No response

Additional context

No response
