Feature Summary
Support ggml backend detection
Detailed Description
This feature proposes adding a CMake build option to build the shared library (stable-diffusion.dll) without statically linking ggml, linking against ggml as a shared library instead.
Proposed Build Options
- SD_USE_SHARED_GGML - Links ggml as a shared library instead of static linking
- SD_ALL - Enables all supported ggml backends
  - Automatically loads all backends via ggml_backend_load_all()
  - Selects the best available backend at runtime using ggml_backend_init_best()
  - Backend-specific behavior (e.g. CUDA / Vulkan / CPU) is selected at runtime rather than at compile time (see the sketch below)
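The runtime flow behind SD_ALL is roughly the sketch below. It only relies on the public ggml backend API (ggml_backend_load_all, ggml_backend_init_best, ggml_backend_get_device, ggml_backend_dev_name); the dev_type_t / backend_dev_type names mirror what this patch proposes and are not existing stable-diffusion symbols.

// Minimal standalone sketch of the proposed runtime backend selection.
// Assumes ggml was built with its backends available for dynamic loading;
// dev_type_t / backend_dev_type mirror the additions proposed below.
#include <cstdio>
#include <cstring>
#include "ggml-backend.h"

enum dev_type_t { DEV_TYPE_VULKAN, DEV_TYPE_OTHER };
static dev_type_t backend_dev_type = DEV_TYPE_OTHER;

int main() {
    ggml_backend_load_all();                            // load every available backend (CUDA, Vulkan, CPU, ...)
    ggml_backend_t backend = ggml_backend_init_best();  // pick the best one at runtime
    if (backend == NULL) {
        fprintf(stderr, "no usable ggml backend found\n");
        return 1;
    }

    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    const char* name = ggml_backend_dev_name(dev);
    // Remember the device type so Vulkan-specific behavior (e.g. force_prec_f32)
    // can be applied at runtime instead of via #ifdef SD_USE_VULKAN.
    if (name != NULL && strstr(name, "Vulkan")) {
        backend_dev_type = DEV_TYPE_VULKAN;
    }

    printf("selected backend: %s (device: %s)\n", ggml_backend_name(backend), name ? name : "unknown");
    ggml_backend_free(backend);
    return 0;
}

Proposed patch: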
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ea1c47..5bc17a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
# general
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
+option(SD_ALL "sd: all backends" OFF)
option(SD_CUDA "sd: cuda backend" OFF)
option(SD_HIPBLAS "sd: rocm backend" OFF)
option(SD_METAL "sd: metal backend" OFF)
@@ -35,8 +36,18 @@ option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
+option(SD_USE_SHARED_GGML "sd: use GGML shared library" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
+if(SD_ALL)
+ message("-- Use all backends for stable-diffusion")
+ set(GGML_CUDA ON)
+ set(SD_BUILD_SHARED_LIBS ON)
+ set(SD_BUILD_SHARED_GGML_LIB ON)
+ set(SD_USE_SHARED_GGML ON)
+ add_definitions(-DSD_USE_ALL)
+endif()
+
if(SD_CUDA)
message("-- Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON)
@@ -169,15 +180,25 @@ if (NOT TARGET ggml)
message(FATAL_ERROR "System-installed GGML library not found.")
endif()
add_library(ggml ALIAS ggml::ggml)
- else()
+ elseif(NOT SD_USE_SHARED_GGML)
add_subdirectory(ggml)
endif()
endif()
add_subdirectory(thirdparty)
-target_link_libraries(${SD_LIB} PUBLIC ggml zip)
-target_include_directories(${SD_LIB} PUBLIC . thirdparty)
+
+
+if(SD_USE_SHARED_GGML)
+ target_link_libraries(${SD_LIB} PRIVATE ggml zip)
+ target_include_directories(${SD_LIB}
+ PUBLIC .
+ PRIVATE ./ggml/include thirdparty)
+else()
+ target_link_libraries(${SD_LIB} PUBLIC ggml zip)
+ target_include_directories(${SD_LIB} PUBLIC . thirdparty)
+endif()
+
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index fcaa92c..cf9b13f 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1377,7 +1377,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm(struct ggml_context* c
}
__STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor* tensor, void* data, size_t offset, size_t size) {
-#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL)
+#if defined(SD_USE_ALL) || defined(SD_USE_CUDA) || defined(SD_USE_SYCL)
if (!ggml_backend_is_cpu(backend)) {
ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
ggml_backend_synchronize(backend);
diff --git a/qwen_image.hpp b/qwen_image.hpp
index eeb823d..133e94c 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -99,6 +99,12 @@ namespace Qwen {
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
+
+#ifdef SD_USE_ALL
+ if (backend_dev_type == DEV_TYPE_VULKAN) {
+ force_prec_f32 = true;
+ }
+#endif
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example when using CUDA but the weights are k-quants (not all prompts).
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 17fe3fd..8ffee39 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -64,6 +64,8 @@ const char* sampling_methods_str[] = {
"TCD",
};
+dev_type_t backend_dev_type = DEV_TYPE_OTHER;
+
/*================================================== Helper Functions ================================================*/
void calculate_alphas_cumprod(float* alphas_cumprod,
@@ -155,6 +157,15 @@ public:
}
void init_backend() {
+#ifdef SD_USE_ALL
+ ggml_backend_load_all();
+ backend = ggml_backend_init_best();
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ const char * name = ggml_backend_dev_name(dev);
+ if(name != NULL && strstr(name, "Vulkan")) {
+ backend_dev_type = DEV_TYPE_VULKAN;
+ }
+#endif
#ifdef SD_USE_CUDA
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index adb65a1..572ce70 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -142,6 +142,14 @@ enum lora_apply_mode_t {
LORA_APPLY_MODE_COUNT,
};
+
+enum dev_type_t {
+ DEV_TYPE_VULKAN,
+ DEV_TYPE_OTHER,
+};
+
+extern dev_type_t backend_dev_type;
+
typedef struct {
bool enabled;
int tile_size_x;
diff --git a/upscaler.cpp b/upscaler.cpp
index 29ac981..bfe2ad7 100644
--- a/upscaler.cpp
+++ b/upscaler.cpp
@@ -24,6 +24,16 @@ struct UpscalerGGML {
bool offload_params_to_cpu,
int n_threads) {
ggml_log_set(ggml_log_callback_default, nullptr);
+#ifdef SD_USE_ALL
+ ggml_backend_load_all();
+ backend = ggml_backend_init_best();
+ LOG_DEBUG("Using backend = %s", ggml_backend_name(backend));
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ const char * name = ggml_backend_dev_name(dev);
+ if(name != NULL && strstr(name, "Vulkan")) {
+ backend_dev_type = DEV_TYPE_VULKAN;
+ }
+#endif
#ifdef SD_USE_CUDA
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
diff --git a/z_image.hpp b/z_image.hpp
index bc554f1..86c907f 100644
--- a/z_image.hpp
+++ b/z_image.hpp
@@ -96,6 +96,13 @@ namespace ZImage {
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
+
+#ifdef SD_USE_ALL
+ if (backend_dev_type == DEV_TYPE_VULKAN) {
+ force_prec_f32 = true;
+ }
+#endif
+
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using CUDA but the weights are k-quants.
blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);Alternatives you considered
No response
Additional context
No response