leejet · leejet · Jan 4, 2026 · Jan 4, 2026 · Jan 4, 2026 · Jan 4, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,6 +8,11 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+if (MSVC)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+    add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
+endif()
+
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 

diff --git a/cache_dit.hpp b/cache_dit.hpp
@@ -117,7 +117,7 @@ struct TaylorSeerState {
                 continue;
             if (o > 0)
                 factorial *= static_cast<float>(o);
-            float coeff = std::pow(static_cast<float>(elapsed), o) / factorial;
+            float coeff = ::powf(static_cast<float>(elapsed), static_cast<float>(o)) / factorial;
             for (size_t i = 0; i < size; i++) {
                 output[i] += coeff * dY_prev[o][i];
             }

diff --git a/clip.hpp b/clip.hpp
@@ -296,7 +296,7 @@ class CLIPTokenizer {
                     size_t max_length = 0,
                     bool padding      = false) {
         if (max_length > 0 && padding) {
-            size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2));
+            size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.0 / (max_length - 2)));
             if (n == 0) {
                 n = 1;
             }
@@ -525,10 +525,10 @@ struct CLIPLayer : public GGMLBlock {
 
 struct CLIPEncoder : public GGMLBlock {
 protected:
-    int64_t n_layer;
+    int n_layer;
 
 public:
-    CLIPEncoder(int64_t n_layer,
+    CLIPEncoder(int n_layer,
                 int64_t d_model,
                 int64_t n_head,
                 int64_t intermediate_size,
@@ -623,10 +623,10 @@ class CLIPEmbeddings : public GGMLBlock {
 class CLIPVisionEmbeddings : public GGMLBlock {
 protected:
     int64_t embed_dim;
-    int64_t num_channels;
-    int64_t patch_size;
-    int64_t image_size;
-    int64_t num_patches;
+    int num_channels;
+    int patch_size;
+    int image_size;
+    int num_patches;
     int64_t num_positions;
 
     void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
@@ -641,9 +641,9 @@ class CLIPVisionEmbeddings : public GGMLBlock {
 
 public:
     CLIPVisionEmbeddings(int64_t embed_dim,
-                         int64_t num_channels = 3,
-                         int64_t patch_size   = 14,
-                         int64_t image_size   = 224)
+                         int num_channels = 3,
+                         int patch_size   = 14,
+                         int image_size   = 224)
         : embed_dim(embed_dim),
           num_channels(num_channels),
           patch_size(patch_size),

diff --git a/common.hpp b/common.hpp
@@ -80,7 +80,7 @@ class ResBlock : public GGMLBlock {
                                        std::pair<int, int> padding) {
         GGML_ASSERT(dims == 2 || dims == 3);
         if (dims == 3) {
-            return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
+            return std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0}));
         } else {
             return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
         }
@@ -544,9 +544,9 @@ class AlphaBlender : public GGMLBlock {
 
 class VideoResBlock : public ResBlock {
 public:
-    VideoResBlock(int channels,
-                  int emb_channels,
-                  int out_channels,
+    VideoResBlock(int64_t channels,
+                  int64_t emb_channels,
+                  int64_t out_channels,
                   std::pair<int, int> kernel_size = {3, 3},
                   int64_t video_kernel_size       = 3,
                   int dims                        = 2)  // always 2

diff --git a/conditioner.hpp b/conditioner.hpp
@@ -303,11 +303,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 int class_token = clean_input_ids[class_token_index[0]];
                 class_idx       = tokens_acc + class_token_index[0];
                 std::vector<int> clean_input_ids_tmp;
-                for (uint32_t i = 0; i < class_token_index[0]; i++)
+                for (int i = 0; i < class_token_index[0]; i++)
                     clean_input_ids_tmp.push_back(clean_input_ids[i]);
-                for (uint32_t i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
+                for (int i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
                     clean_input_ids_tmp.push_back(class_token);
-                for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
+                for (int i = class_token_index[0] + 1; i < static_cast<int>(clean_input_ids.size()); i++)
                     clean_input_ids_tmp.push_back(clean_input_ids[i]);
                 clean_input_ids.clear();
                 clean_input_ids = clean_input_ids_tmp;
@@ -322,7 +322,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
         tokenizer.pad_tokens(tokens, weights, max_length, padding);
         int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
-        for (uint32_t i = 0; i < tokens.size(); i++) {
+        for (int i = 0; i < static_cast<int>(tokens.size()); i++) {
             // if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
             if (class_idx + 1 <= i && i < class_idx + 1 + offset)  // photomaker V2 has num_tokens(=2)*num_input_imgs
                                                                    // hardcode for now
@@ -1584,7 +1584,7 @@ struct T5CLIPEmbedder : public Conditioner {
                                         chunk_hidden_states->ne[0],
                                         ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
 
-        modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);
+        modify_mask_to_attend_padding(t5_attn_mask, static_cast<int>(ggml_nelements(t5_attn_mask)), mask_pad);
 
         return {hidden_states, t5_attn_mask, nullptr};
     }
@@ -1723,8 +1723,8 @@ struct LLMEmbedder : public Conditioner {
                 double factor        = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
                 int height           = image.height;
                 int width            = image.width;
-                int h_bar            = static_cast<int>(std::round(height / factor)) * factor;
-                int w_bar            = static_cast<int>(std::round(width / factor)) * factor;
+                int h_bar            = static_cast<int>(std::round(height / factor) * factor);
+                int w_bar            = static_cast<int>(std::round(width / factor) * factor);
 
                 if (static_cast<double>(h_bar) * w_bar > max_pixels) {
                     double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
@@ -1752,7 +1752,7 @@ struct LLMEmbedder : public Conditioner {
                 ggml_tensor* image_embed = nullptr;
                 llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
                 image_embeds.emplace_back(image_embed_idx, image_embed);
-                image_embed_idx += 1 + image_embed->ne[1] + 6;
+                image_embed_idx += 1 + static_cast<int>(image_embed->ne[1]) + 6;
 
                 img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>";  // [24669, 220, index, 25, 220, 151652]
                 int64_t num_image_tokens = image_embed->ne[1];
@@ -1799,9 +1799,9 @@ struct LLMEmbedder : public Conditioner {
 
             prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
 
-            prompt_attn_range.first = prompt.size();
+            prompt_attn_range.first = static_cast<int>(prompt.size());
             prompt += conditioner_params.text;
-            prompt_attn_range.second = prompt.size();
+            prompt_attn_range.second = static_cast<int>(prompt.size());
 
             prompt += "[/INST]";
         } else if (version == VERSION_OVIS_IMAGE) {

diff --git a/denoiser.hpp b/denoiser.hpp
@@ -245,7 +245,7 @@ struct SGMUniformScheduler : SigmaScheduler {
         int t_max                    = TIMESTEPS - 1;
         int t_min                    = 0;
         std::vector<float> timesteps = linear_space(static_cast<float>(t_max), static_cast<float>(t_min), n + 1);
-        for (int i = 0; i < n; i++) {
+        for (uint32_t i = 0; i < n; i++) {
             result.push_back(t_to_sigma_func(timesteps[i]));
         }
         result.push_back(0.0f);
@@ -259,11 +259,11 @@ struct LCMScheduler : SigmaScheduler {
         result.reserve(n + 1);
         const int original_steps = 50;
         const int k              = TIMESTEPS / original_steps;
-        for (int i = 0; i < n; i++) {
+        for (uint32_t i = 0; i < n; i++) {
             // the rounding ensures we match the training schedule of the LCM model
             int index    = (i * original_steps) / n;
             int timestep = (original_steps - index) * k - 1;
-            result.push_back(t_to_sigma(timestep));
+            result.push_back(t_to_sigma(static_cast<float>(timestep)));
         }
         result.push_back(0.0f);
         return result;
@@ -525,8 +525,8 @@ struct CompVisVDenoiser : public CompVisDenoiser {
 };
 
 struct EDMVDenoiser : public CompVisVDenoiser {
-    float min_sigma = 0.002;
-    float max_sigma = 120.0;
+    float min_sigma = 0.002f;
+    float max_sigma = 120.0f;
 
     EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0)
         : min_sigma(min_sigma), max_sigma(max_sigma) {
@@ -537,7 +537,7 @@ struct EDMVDenoiser : public CompVisVDenoiser {
     }
 
     float sigma_to_t(float s) override {
-        return 0.25 * std::log(s);
+        return 0.25f * std::log(s);
     }
 
     float sigma_min() override {
@@ -569,7 +569,7 @@ struct DiscreteFlowDenoiser : public Denoiser {
 
     void set_parameters() {
         for (int i = 1; i < TIMESTEPS + 1; i++) {
-            sigmas[i - 1] = t_to_sigma(i);
+            sigmas[i - 1] = t_to_sigma(static_cast<float>(i));
         }
     }
 
@@ -612,7 +612,7 @@ struct DiscreteFlowDenoiser : public Denoiser {
 };
 
 float flux_time_shift(float mu, float sigma, float t) {
-    return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma));
+    return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma));
 }
 
 struct FluxFlowDenoiser : public Denoiser {
@@ -632,7 +632,7 @@ struct FluxFlowDenoiser : public Denoiser {
     void set_parameters(float shift) {
         set_shift(shift);
         for (int i = 0; i < TIMESTEPS; i++) {
-            sigmas[i] = t_to_sigma(i);
+            sigmas[i] = t_to_sigma(static_cast<float>(i));
         }
     }
 
@@ -1327,15 +1327,12 @@ static bool sample_k_diffusion(sample_method_t method,
                 // - pred_sample_direction -> "direction pointing to
                 //   x_t"
                 // - pred_prev_sample -> "x_t-1"
-                int timestep =
-                    roundf(TIMESTEPS -
-                           i * ((float)TIMESTEPS / steps)) -
-                    1;
+                int timestep = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;
                 // 1. get previous step value (=t-1)
-                int prev_timestep = timestep - TIMESTEPS / steps;
+                int prev_timestep = timestep - TIMESTEPS / static_cast<int>(steps);
                 // The sigma here is chosen to cause the
                 // CompVisDenoiser to produce t = timestep
-                float sigma = compvis_sigmas[timestep];
+                float sigma = static_cast<float>(compvis_sigmas[timestep]);
                 if (i == 0) {
                     // The function add_noise intializes x to
                     // Diffusers' latents * sigma (as in Diffusers'
@@ -1392,10 +1389,10 @@ static bool sample_k_diffusion(sample_method_t method,
                     }
                 }
                 // 2. compute alphas, betas
-                float alpha_prod_t = alphas_cumprod[timestep];
+                float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
                 // Note final_alpha_cumprod = alphas_cumprod[0] due to
                 // trailing timestep spacing
-                float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
+                float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
                 float beta_prod_t       = 1 - alpha_prod_t;
                 // 3. compute predicted original sample from predicted
                 // noise also called "predicted x_0" of formula (12)
@@ -1442,8 +1439,8 @@ static bool sample_k_diffusion(sample_method_t method,
                         // Two step inner loop without an explicit
                         // tensor
                         float pred_sample_direction =
-                            std::sqrt(1 - alpha_prod_t_prev -
-                                      std::pow(std_dev_t, 2)) *
+                            ::sqrtf(1 - alpha_prod_t_prev -
+                                    ::powf(std_dev_t, 2)) *
                             vec_model_output[j];
                         vec_x[j] = std::sqrt(alpha_prod_t_prev) *
                                        vec_pred_original_sample[j] +
@@ -1518,7 +1515,7 @@ static bool sample_k_diffusion(sample_method_t method,
                 // Begin k-diffusion specific workaround for
                 // evaluating F_theta(x; ...) from D(x, sigma), same
                 // as in DDIM (and see there for detailed comments)
-                float sigma = compvis_sigmas[timestep];
+                float sigma = static_cast<float>(compvis_sigmas[timestep]);
                 if (i == 0) {
                     float* vec_x = (float*)x->data;
                     for (int j = 0; j < ggml_nelements(x); j++) {
@@ -1557,14 +1554,14 @@ static bool sample_k_diffusion(sample_method_t method,
                 // is different from the notation alpha_t in
                 // DPM-Solver. In fact, we have alpha_{t_n} =
                 // \sqrt{\hat{alpha_n}}, [...]"
-                float alpha_prod_t = alphas_cumprod[timestep];
+                float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
                 float beta_prod_t  = 1 - alpha_prod_t;
                 // Note final_alpha_cumprod = alphas_cumprod[0] since
                 // TCD is always "trailing"
-                float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
+                float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
                 // The subscript _s are the only portion in this
                 // section (2) unique to TCD
-                float alpha_prod_s = alphas_cumprod[timestep_s];
+                float alpha_prod_s = static_cast<float>(alphas_cumprod[timestep_s]);
                 float beta_prod_s  = 1 - alpha_prod_s;
                 // 3. Compute the predicted noised sample x_s based on
                 // the model parameterization

diff --git a/examples/cli/avi_writer.h b/examples/cli/avi_writer.h
@@ -172,9 +172,9 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
 
         // Write '00dc' chunk (video frame)
         fwrite("00dc", 4, 1, f);
-        write_u32_le(f, jpeg_data.size);
+        write_u32_le(f, (uint32_t)jpeg_data.size);
         index[i].offset = ftell(f) - 8;
-        index[i].size   = jpeg_data.size;
+        index[i].size   = (uint32_t)jpeg_data.size;
         fwrite(jpeg_data.buf, 1, jpeg_data.size, f);
 
         // Align to even byte size

diff --git a/examples/common/common.hpp b/examples/common/common.hpp
@@ -1386,10 +1386,10 @@ struct SDGenerationParams {
                 if (!item.empty()) {
                     try {
                         custom_sigmas.push_back(std::stof(item));
-                    } catch (const std::invalid_argument& e) {
+                    } catch (const std::invalid_argument&) {
                         LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str());
                         return -1;
-                    } catch (const std::out_of_range& e) {
+                    } catch (const std::out_of_range&) {
                         LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str());
                         return -1;
                     }

diff --git a/examples/server/main.cpp b/examples/server/main.cpp
@@ -44,7 +44,7 @@ inline bool is_base64(unsigned char c) {
 }
 
 std::vector<uint8_t> base64_decode(const std::string& encoded_string) {
-    int in_len = encoded_string.size();
+    int in_len = static_cast<int>(encoded_string.size());
     int i      = 0;
     int j      = 0;
     int in_    = 0;
@@ -617,7 +617,7 @@ int main(int argc, const char** argv) {
                 int img_h           = height;
                 uint8_t* raw_pixels = load_image_from_memory(
                     reinterpret_cast<const char*>(bytes.data()),
-                    bytes.size(),
+                    static_cast<int>(bytes.size()),
                     img_w, img_h,
                     width, height, 3);
 
@@ -635,7 +635,7 @@ int main(int argc, const char** argv) {
                 int mask_h        = height;
                 uint8_t* mask_raw = load_image_from_memory(
                     reinterpret_cast<const char*>(mask_bytes.data()),
-                    mask_bytes.size(),
+                    static_cast<int>(mask_bytes.size()),
                     mask_w, mask_h,
                     width, height, 1);
                 mask_image = {(uint32_t)mask_w, (uint32_t)mask_h, 1, mask_raw};