[Feature] Support ggml backend detection #1127

@wh-kuromai

Feature Summary

Support ggml backend detection

Detailed Description

This feature proposes adding CMake build options to build the shared library (stable-diffusion.dll) against ggml as a shared library rather than statically linking it, and to select the ggml backend at runtime instead of at compile time.

Proposed Build Options

  • SD_USE_SHARED_GGML

    • Links ggml as a shared library instead of statically linking it
  • SD_ALL

    • Enables all supported ggml backends
    • Automatically loads all backends via ggml_backend_load_all()
    • Selects the best available backend at runtime using ggml_backend_init_best()
    • Backend-specific behavior (e.g. CUDA / Vulkan / CPU) is selected at runtime rather than at compile time (see the sketch after this list)
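
For context, here is a minimal sketch of the device enumeration that ggml_backend_load_all() makes possible at runtime. It is illustrative only (not part of the proposed patch) and assumes the ggml device API from ggml-backend.h (ggml_backend_dev_count(), ggml_backend_dev_get(), ggml_backend_dev_name(), ggml_backend_dev_description()):

```cpp
// Illustrative only (not part of the proposed patch): list the devices that
// ggml_backend_load_all() makes visible, then pick one the same way the patch does.
#include <cstdio>
#include "ggml-backend.h"

int main() {
    ggml_backend_load_all();  // load all available backends (CUDA, Vulkan, CPU, ...)

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s (%s)\n", i,
               ggml_backend_dev_name(dev),
               ggml_backend_dev_description(dev));
    }

    // Same call the patch uses: select the best available backend at runtime.
    ggml_backend_t backend = ggml_backend_init_best();
    printf("selected: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
```
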
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ea1c47..5bc17a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
 # general
 #option(SD_BUILD_TESTS                "sd: build tests"    ${SD_STANDALONE})
 option(SD_BUILD_EXAMPLES             "sd: build examples" ${SD_STANDALONE})
+option(SD_ALL                        "sd: all backends" OFF)
 option(SD_CUDA                       "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
@@ -35,8 +36,18 @@ option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (
 option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
 option(SD_BUILD_SHARED_GGML_LIB      "sd: build ggml as a separate shared lib" OFF)
 option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
+option(SD_USE_SHARED_GGML            "sd: use GGML shared library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)
 
+if(SD_ALL)
+    message("-- Use all backend stable-diffusion")
+    set(GGML_CUDA ON)
+    set(SD_BUILD_SHARED_LIBS ON)
+    set(SD_BUILD_SHARED_GGML_LIB ON)
+    set(SD_USE_SHARED_GGML ON)
+    add_definitions(-DSD_USE_ALL)
+endif()
+
 if(SD_CUDA)
     message("-- Use CUDA as backend stable-diffusion")
     set(GGML_CUDA ON)
@@ -169,15 +180,25 @@ if (NOT TARGET ggml)
             message(FATAL_ERROR "System-installed GGML library not found.")
         endif()
         add_library(ggml ALIAS ggml::ggml)
-    else()
+    elseif(NOT SD_USE_SHARED_GGML)
         add_subdirectory(ggml)
     endif()
 endif()
 
 add_subdirectory(thirdparty)
 
-target_link_libraries(${SD_LIB} PUBLIC ggml zip)
-target_include_directories(${SD_LIB} PUBLIC . thirdparty)
+
+
+if(SD_USE_SHARED_GGML)
+    target_link_libraries(${SD_LIB} PRIVATE ggml zip)
+    target_include_directories(${SD_LIB} 
+      PUBLIC .
+      PRIVATE ./ggml/include thirdparty)
+else()
+    target_link_libraries(${SD_LIB} PUBLIC ggml zip)
+    target_include_directories(${SD_LIB} PUBLIC . thirdparty)
+endif()
+
 target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
 
 
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index fcaa92c..cf9b13f 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1377,7 +1377,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm(struct ggml_context* c
 }
 
 __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor* tensor, void* data, size_t offset, size_t size) {
-#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL)
+#if defined(SD_USE_ALL) || defined(SD_USE_CUDA) || defined(SD_USE_SYCL)
     if (!ggml_backend_is_cpu(backend)) {
         ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
         ggml_backend_synchronize(backend);
diff --git a/qwen_image.hpp b/qwen_image.hpp
index eeb823d..133e94c 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -99,6 +99,12 @@ namespace Qwen {
 #ifdef SD_USE_VULKAN
             force_prec_f32 = true;
 #endif
+
+#ifdef SD_USE_ALL
+            if (backend_dev_type == DEV_TYPE_VULKAN) {
+                force_prec_f32 = true;
+            }
+#endif
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example when using CUDA but the weights are k-quants (not all prompts).
             blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 17fe3fd..8ffee39 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -64,6 +64,8 @@ const char* sampling_methods_str[] = {
     "TCD",
 };
 
+dev_type_t backend_dev_type = DEV_TYPE_OTHER;
+
 /*================================================== Helper Functions ================================================*/
 
 void calculate_alphas_cumprod(float* alphas_cumprod,
@@ -155,6 +157,15 @@ public:
     }
 
     void init_backend() {
+#ifdef SD_USE_ALL
+        ggml_backend_load_all();
+        backend = ggml_backend_init_best();
+        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+        const char * name = ggml_backend_dev_name(dev);
+        if(name != NULL && strstr(name, "Vulkan")) {
+            backend_dev_type = DEV_TYPE_VULKAN;
+        }
+#endif
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index adb65a1..572ce70 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -142,6 +142,14 @@ enum lora_apply_mode_t {
     LORA_APPLY_MODE_COUNT,
 };
 
+
+enum dev_type_t {
+    DEV_TYPE_VULKAN,
+    DEV_TYPE_OTHER,
+};
+
+extern enum dev_type_t backend_dev_type;
+
 typedef struct {
     bool enabled;
     int tile_size_x;
diff --git a/upscaler.cpp b/upscaler.cpp
index 29ac981..bfe2ad7 100644
--- a/upscaler.cpp
+++ b/upscaler.cpp
@@ -24,6 +24,16 @@ struct UpscalerGGML {
                         bool offload_params_to_cpu,
                         int n_threads) {
         ggml_log_set(ggml_log_callback_default, nullptr);
+#ifdef SD_USE_ALL
+        ggml_backend_load_all();
+        backend = ggml_backend_init_best();
+        LOG_DEBUG("Using backend = %s", ggml_backend_name(backend));
+        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+        const char * name = ggml_backend_dev_name(dev);
+        if(name != NULL && strstr(name, "Vulkan")) {
+            backend_dev_type = DEV_TYPE_VULKAN;
+        }
+#endif
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
diff --git a/z_image.hpp b/z_image.hpp
index bc554f1..86c907f 100644
--- a/z_image.hpp
+++ b/z_image.hpp
@@ -96,6 +96,13 @@ namespace ZImage {
 #ifdef SD_USE_VULKAN
             force_prec_f32 = true;
 #endif
+
+#ifdef SD_USE_ALL
+            if (backend_dev_type == DEV_TYPE_VULKAN) {
+                force_prec_f32 = true;
+            }
+#endif
+
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example, when using CUDA but the weights are k-quants.
             blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);

Alternatives you considered

No response

Additional context

No response
