Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

# MSVC-only: add two preprocessor definitions that silence deprecation warnings:
#   _CRT_SECURE_NO_WARNINGS  - CRT warnings about the "unsafe" C library functions
#   _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING - the C++17 <codecvt> deprecation warning
if (MSVC)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
endif()

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

Expand Down
2 changes: 1 addition & 1 deletion cache_dit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ struct TaylorSeerState {
continue;
if (o > 0)
factorial *= static_cast<float>(o);
float coeff = std::pow(static_cast<float>(elapsed), o) / factorial;
float coeff = ::powf(static_cast<float>(elapsed), static_cast<float>(o)) / factorial;
for (size_t i = 0; i < size; i++) {
output[i] += coeff * dY_prev[o][i];
}
Expand Down
20 changes: 10 additions & 10 deletions clip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ class CLIPTokenizer {
size_t max_length = 0,
bool padding = false) {
if (max_length > 0 && padding) {
size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2));
size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.0 / (max_length - 2)));
if (n == 0) {
n = 1;
}
Expand Down Expand Up @@ -525,10 +525,10 @@ struct CLIPLayer : public GGMLBlock {

struct CLIPEncoder : public GGMLBlock {
protected:
int64_t n_layer;
int n_layer;

public:
CLIPEncoder(int64_t n_layer,
CLIPEncoder(int n_layer,
int64_t d_model,
int64_t n_head,
int64_t intermediate_size,
Expand Down Expand Up @@ -623,10 +623,10 @@ class CLIPEmbeddings : public GGMLBlock {
class CLIPVisionEmbeddings : public GGMLBlock {
protected:
int64_t embed_dim;
int64_t num_channels;
int64_t patch_size;
int64_t image_size;
int64_t num_patches;
int num_channels;
int patch_size;
int image_size;
int num_patches;
int64_t num_positions;

void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
Expand All @@ -641,9 +641,9 @@ class CLIPVisionEmbeddings : public GGMLBlock {

public:
CLIPVisionEmbeddings(int64_t embed_dim,
int64_t num_channels = 3,
int64_t patch_size = 14,
int64_t image_size = 224)
int num_channels = 3,
int patch_size = 14,
int image_size = 224)
: embed_dim(embed_dim),
num_channels(num_channels),
patch_size(patch_size),
Expand Down
8 changes: 4 additions & 4 deletions common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class ResBlock : public GGMLBlock {
std::pair<int, int> padding) {
GGML_ASSERT(dims == 2 || dims == 3);
if (dims == 3) {
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
return std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0}));
} else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
}
Expand Down Expand Up @@ -544,9 +544,9 @@ class AlphaBlender : public GGMLBlock {

class VideoResBlock : public ResBlock {
public:
VideoResBlock(int channels,
int emb_channels,
int out_channels,
VideoResBlock(int64_t channels,
int64_t emb_channels,
int64_t out_channels,
std::pair<int, int> kernel_size = {3, 3},
int64_t video_kernel_size = 3,
int dims = 2) // always 2
Expand Down
20 changes: 10 additions & 10 deletions conditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,11 +303,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
int class_token = clean_input_ids[class_token_index[0]];
class_idx = tokens_acc + class_token_index[0];
std::vector<int> clean_input_ids_tmp;
for (uint32_t i = 0; i < class_token_index[0]; i++)
for (int i = 0; i < class_token_index[0]; i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
for (uint32_t i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
for (int i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
clean_input_ids_tmp.push_back(class_token);
for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
for (int i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
clean_input_ids.clear();
clean_input_ids = clean_input_ids_tmp;
Expand All @@ -322,7 +322,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

tokenizer.pad_tokens(tokens, weights, max_length, padding);
int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
for (uint32_t i = 0; i < tokens.size(); i++) {
for (int i = 0; i < tokens.size(); i++) {
// if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs
// hardcode for now
Expand Down Expand Up @@ -1584,7 +1584,7 @@ struct T5CLIPEmbedder : public Conditioner {
chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);

modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);
modify_mask_to_attend_padding(t5_attn_mask, static_cast<int>(ggml_nelements(t5_attn_mask)), mask_pad);

return {hidden_states, t5_attn_mask, nullptr};
}
Expand Down Expand Up @@ -1723,8 +1723,8 @@ struct LLMEmbedder : public Conditioner {
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
int height = image.height;
int width = image.width;
int h_bar = static_cast<int>(std::round(height / factor)) * factor;
int w_bar = static_cast<int>(std::round(width / factor)) * factor;
int h_bar = static_cast<int>(std::round(height / factor) * factor);
int w_bar = static_cast<int>(std::round(width / factor) * factor);

if (static_cast<double>(h_bar) * w_bar > max_pixels) {
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
Expand Down Expand Up @@ -1752,7 +1752,7 @@ struct LLMEmbedder : public Conditioner {
ggml_tensor* image_embed = nullptr;
llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
image_embeds.emplace_back(image_embed_idx, image_embed);
image_embed_idx += 1 + image_embed->ne[1] + 6;
image_embed_idx += 1 + static_cast<int>(image_embed->ne[1]) + 6;

img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
int64_t num_image_tokens = image_embed->ne[1];
Expand Down Expand Up @@ -1799,9 +1799,9 @@ struct LLMEmbedder : public Conditioner {

prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";

prompt_attn_range.first = prompt.size();
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = prompt.size();
prompt_attn_range.second = static_cast<int>(prompt.size());

prompt += "[/INST]";
} else if (version == VERSION_OVIS_IMAGE) {
Expand Down
43 changes: 20 additions & 23 deletions denoiser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ struct SGMUniformScheduler : SigmaScheduler {
int t_max = TIMESTEPS - 1;
int t_min = 0;
std::vector<float> timesteps = linear_space(static_cast<float>(t_max), static_cast<float>(t_min), n + 1);
for (int i = 0; i < n; i++) {
for (uint32_t i = 0; i < n; i++) {
result.push_back(t_to_sigma_func(timesteps[i]));
}
result.push_back(0.0f);
Expand All @@ -259,11 +259,11 @@ struct LCMScheduler : SigmaScheduler {
result.reserve(n + 1);
const int original_steps = 50;
const int k = TIMESTEPS / original_steps;
for (int i = 0; i < n; i++) {
for (uint32_t i = 0; i < n; i++) {
// the rounding ensures we match the training schedule of the LCM model
int index = (i * original_steps) / n;
int timestep = (original_steps - index) * k - 1;
result.push_back(t_to_sigma(timestep));
result.push_back(t_to_sigma(static_cast<float>(timestep)));
}
result.push_back(0.0f);
return result;
Expand Down Expand Up @@ -525,8 +525,8 @@ struct CompVisVDenoiser : public CompVisDenoiser {
};

struct EDMVDenoiser : public CompVisVDenoiser {
float min_sigma = 0.002;
float max_sigma = 120.0;
float min_sigma = 0.002f;
float max_sigma = 120.0f;

EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0)
: min_sigma(min_sigma), max_sigma(max_sigma) {
Expand All @@ -537,7 +537,7 @@ struct EDMVDenoiser : public CompVisVDenoiser {
}

float sigma_to_t(float s) override {
return 0.25 * std::log(s);
return 0.25f * std::log(s);
}

float sigma_min() override {
Expand Down Expand Up @@ -569,7 +569,7 @@ struct DiscreteFlowDenoiser : public Denoiser {

void set_parameters() {
for (int i = 1; i < TIMESTEPS + 1; i++) {
sigmas[i - 1] = t_to_sigma(i);
sigmas[i - 1] = t_to_sigma(static_cast<float>(i));
}
}

Expand Down Expand Up @@ -612,7 +612,7 @@ struct DiscreteFlowDenoiser : public Denoiser {
};

// Evaluates exp(mu) / (exp(mu) + (1/t - 1)^sigma) and returns it as float.
// There is no guard for t == 0 (the 1/t term divides by t).
// NOTE(review): the two return lines below are the before/after pair of this
// diff hunk -- the first mixes double literals into the std:: calls, the second
// is the float-only ::expf/::powf form. Presumably only one of them exists in
// the real file at any given revision; confirm against the repository.
float flux_time_shift(float mu, float sigma, float t) {
return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma));
return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma));
}

struct FluxFlowDenoiser : public Denoiser {
Expand All @@ -632,7 +632,7 @@ struct FluxFlowDenoiser : public Denoiser {
void set_parameters(float shift) {
set_shift(shift);
for (int i = 0; i < TIMESTEPS; i++) {
sigmas[i] = t_to_sigma(i);
sigmas[i] = t_to_sigma(static_cast<float>(i));
}
}

Expand Down Expand Up @@ -1327,15 +1327,12 @@ static bool sample_k_diffusion(sample_method_t method,
// - pred_sample_direction -> "direction pointing to
// x_t"
// - pred_prev_sample -> "x_t-1"
int timestep =
roundf(TIMESTEPS -
i * ((float)TIMESTEPS / steps)) -
1;
int timestep = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;
// 1. get previous step value (=t-1)
int prev_timestep = timestep - TIMESTEPS / steps;
int prev_timestep = timestep - TIMESTEPS / static_cast<int>(steps);
// The sigma here is chosen to cause the
// CompVisDenoiser to produce t = timestep
float sigma = compvis_sigmas[timestep];
float sigma = static_cast<float>(compvis_sigmas[timestep]);
if (i == 0) {
// The function add_noise initializes x to
// Diffusers' latents * sigma (as in Diffusers'
Expand Down Expand Up @@ -1392,10 +1389,10 @@ static bool sample_k_diffusion(sample_method_t method,
}
}
// 2. compute alphas, betas
float alpha_prod_t = alphas_cumprod[timestep];
float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
// Note final_alpha_cumprod = alphas_cumprod[0] due to
// trailing timestep spacing
float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
float beta_prod_t = 1 - alpha_prod_t;
// 3. compute predicted original sample from predicted
// noise also called "predicted x_0" of formula (12)
Expand Down Expand Up @@ -1442,8 +1439,8 @@ static bool sample_k_diffusion(sample_method_t method,
// Two step inner loop without an explicit
// tensor
float pred_sample_direction =
std::sqrt(1 - alpha_prod_t_prev -
std::pow(std_dev_t, 2)) *
::sqrtf(1 - alpha_prod_t_prev -
::powf(std_dev_t, 2)) *
vec_model_output[j];
vec_x[j] = std::sqrt(alpha_prod_t_prev) *
vec_pred_original_sample[j] +
Expand Down Expand Up @@ -1518,7 +1515,7 @@ static bool sample_k_diffusion(sample_method_t method,
// Begin k-diffusion specific workaround for
// evaluating F_theta(x; ...) from D(x, sigma), same
// as in DDIM (and see there for detailed comments)
float sigma = compvis_sigmas[timestep];
float sigma = static_cast<float>(compvis_sigmas[timestep]);
if (i == 0) {
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
Expand Down Expand Up @@ -1557,14 +1554,14 @@ static bool sample_k_diffusion(sample_method_t method,
// is different from the notation alpha_t in
// DPM-Solver. In fact, we have alpha_{t_n} =
// \sqrt{\hat{alpha_n}}, [...]"
float alpha_prod_t = alphas_cumprod[timestep];
float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
float beta_prod_t = 1 - alpha_prod_t;
// Note final_alpha_cumprod = alphas_cumprod[0] since
// TCD is always "trailing"
float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
// The subscript _s are the only portion in this
// section (2) unique to TCD
float alpha_prod_s = alphas_cumprod[timestep_s];
float alpha_prod_s = static_cast<float>(alphas_cumprod[timestep_s]);
float beta_prod_s = 1 - alpha_prod_s;
// 3. Compute the predicted noised sample x_s based on
// the model parameterization
Expand Down
4 changes: 2 additions & 2 deletions examples/cli/avi_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,9 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int

// Write '00dc' chunk (video frame)
fwrite("00dc", 4, 1, f);
write_u32_le(f, jpeg_data.size);
write_u32_le(f, (uint32_t)jpeg_data.size);
index[i].offset = ftell(f) - 8;
index[i].size = jpeg_data.size;
index[i].size = (uint32_t)jpeg_data.size;
fwrite(jpeg_data.buf, 1, jpeg_data.size, f);

// Align to even byte size
Expand Down
4 changes: 2 additions & 2 deletions examples/common/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1386,10 +1386,10 @@ struct SDGenerationParams {
if (!item.empty()) {
try {
custom_sigmas.push_back(std::stof(item));
} catch (const std::invalid_argument& e) {
} catch (const std::invalid_argument&) {
LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str());
return -1;
} catch (const std::out_of_range& e) {
} catch (const std::out_of_range&) {
LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str());
return -1;
}
Expand Down
6 changes: 3 additions & 3 deletions examples/server/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ inline bool is_base64(unsigned char c) {
}

std::vector<uint8_t> base64_decode(const std::string& encoded_string) {
int in_len = encoded_string.size();
int in_len = static_cast<int>(encoded_string.size());
int i = 0;
int j = 0;
int in_ = 0;
Expand Down Expand Up @@ -617,7 +617,7 @@ int main(int argc, const char** argv) {
int img_h = height;
uint8_t* raw_pixels = load_image_from_memory(
reinterpret_cast<const char*>(bytes.data()),
bytes.size(),
static_cast<int>(bytes.size()),
img_w, img_h,
width, height, 3);

Expand All @@ -635,7 +635,7 @@ int main(int argc, const char** argv) {
int mask_h = height;
uint8_t* mask_raw = load_image_from_memory(
reinterpret_cast<const char*>(mask_bytes.data()),
mask_bytes.size(),
static_cast<int>(mask_bytes.size()),
mask_w, mask_h,
width, height, 1);
mask_image = {(uint32_t)mask_w, (uint32_t)mask_h, 1, mask_raw};
Expand Down
Loading
Loading