Skip to content

Commit 2fdf02e

Browse files
author
etiennebacher
committed
Merge branch 'rc_datawizard_0.13.0' of https://github.com/easystats/datawizard into rc_datawizard_0.13.0
2 parents f090bde + be756b6 commit 2fdf02e

File tree

9 files changed

+176
-46
lines changed

9 files changed

+176
-46
lines changed

NEWS.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,22 @@ BREAKING CHANGES
99

1010
* Removed deprecated arguments `group` and `na.rm` in multiple functions. Use `by` and `remove_na` instead (#546).
1111

12+
* The default value for the argument `dummy_factors` in `to_numeric()` has
13+
changed from `TRUE` to `FALSE` (#544).
14+
1215
CHANGES
1316

1417
* The `pattern` argument in `data_rename()` can also be a named vector. In this
1518
case, names are used as values for the `replacement` argument (i.e. `pattern`
1619
can be a character vector using `<new name> = "<old name>"`).
1720

21+
* `categorize()` gains a new `breaks` argument, to decide whether breaks are
22+
inclusive or exclusive (#548).
23+
24+
* The `labels` argument in `categorize()` gets two new options, `"range"` and
25+
`"observed"`, to use the range of categorized values as labels (i.e. factor
26+
levels) (#548).
27+
1828
* Minor additions to `reshape_ci()` to work with forthcoming changes in the
1929
`{bayestestR}` package.
2030

R/categorize.R

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,18 @@
3131
#' for numeric variables, the minimum of the original input is preserved. For
3232
#' factors, the default minimum is `1`. For `split = "equal_range"`, the
3333
#' default minimum is always `1`, unless specified otherwise in `lowest`.
34+
#' @param breaks Character, indicating whether breaks for categorizing data are
35+
#' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or
36+
#' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_
37+
#' group or interval to begin). Use `labels = "range"` to make this behaviour
38+
#' easier to see.
3439
#' @param labels Character vector of value labels. If not `NULL`, `categorize()`
3540
#' will returns factors instead of numeric variables, with `labels` used
36-
#' for labelling the factor levels. Can also be `"mean"` or `"median"` for a
37-
#' factor with labels as the mean/median of each groups.
41+
#' for labelling the factor levels. Can also be `"mean"`, `"median"`,
42+
#' `"range"` or `"observed"` for a factor with labels as the mean/median,
43+
#' the requested range (even if not all values of that range are present in
44+
#' the data) or observed range (range of the actual recoded values) of each
45+
#' group. See 'Examples'.
3846
#' @param append Logical or string. If `TRUE`, recoded or converted variables
3947
#' get new column names and are appended (column bind) to `x`, thus returning
4048
#' both the original and the recoded variables. The new columns get a suffix,
@@ -53,7 +61,7 @@
5361
#'
5462
#' # Splits and breaks (cut-off values)
5563
#'
56-
#' Breaks are in general _exclusive_, this means that these values indicate
64+
#' Breaks are by default _exclusive_, this means that these values indicate
5765
#' the lower bound of the next group or interval to begin. Take a simple
5866
#' example, a numeric variable with values from 1 to 9. The median would be 5,
5967
#' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
@@ -63,6 +71,9 @@
6371
#' from 1 to 3 belong to the first interval and are recoded into 1 (because
6472
#' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
6573
#'
74+
#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which
75+
#' case
76+
#'
6677
#' # Recoding into groups with equal size or range
6778
#'
6879
#' `split = "equal_length"` and `split = "equal_range"` try to divide the
@@ -119,6 +130,13 @@
119130
#' x <- sample(1:10, size = 30, replace = TRUE)
120131
#' categorize(x, "equal_length", n_groups = 3, labels = "mean")
121132
#' categorize(x, "equal_length", n_groups = 3, labels = "median")
133+
#'
134+
#' # cut numeric into groups with the requested range as a label name
135+
#' # each category has the same range, and labels indicate this range
136+
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
137+
#' # in this example, each category has the same range, but labels only refer
138+
#' # to the ranges of the actual values (present in the data) inside each group
139+
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
122140
#' @export
123141
categorize <- function(x, ...) {
124142
UseMethod("categorize")
@@ -142,6 +160,7 @@ categorize.numeric <- function(x,
142160
n_groups = NULL,
143161
range = NULL,
144162
lowest = 1,
163+
breaks = "exclusive",
145164
labels = NULL,
146165
verbose = TRUE,
147166
...) {
@@ -152,6 +171,9 @@ categorize.numeric <- function(x,
152171
if (identical(split, "equal_length")) split <- "length"
153172
if (identical(split, "equal_range")) split <- "range"
154173

174+
# check for valid values
175+
breaks <- match.arg(breaks, c("exclusive", "inclusive"))
176+
155177
# save
156178
original_x <- x
157179

@@ -169,9 +191,9 @@ categorize.numeric <- function(x,
169191
}
170192

171193
if (is.numeric(split)) {
172-
breaks <- split
194+
category_splits <- split
173195
} else {
174-
breaks <- switch(split,
196+
category_splits <- switch(split,
175197
median = stats::median(x),
176198
mean = mean(x),
177199
length = n_groups,
@@ -182,15 +204,18 @@ categorize.numeric <- function(x,
182204
}
183205

184206
# complete ranges, including minimum and maximum
185-
if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x)))
207+
if (!identical(split, "length")) {
208+
category_splits <- unique(c(min(x), category_splits, max(x)))
209+
}
186210

187211
# recode into groups
188212
out <- droplevels(cut(
189213
x,
190-
breaks = breaks,
214+
breaks = category_splits,
191215
include.lowest = TRUE,
192-
right = FALSE
216+
right = identical(breaks, "inclusive")
193217
))
218+
cut_result <- out
194219
levels(out) <- 1:nlevels(out)
195220

196221
# fix lowest value, add back into original vector
@@ -201,7 +226,7 @@ categorize.numeric <- function(x,
201226
original_x[!is.na(original_x)] <- out
202227

203228
# turn into factor?
204-
.original_x_to_factor(original_x, x, labels, out, verbose, ...)
229+
.original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...)
205230
}
206231

207232

@@ -223,6 +248,7 @@ categorize.data.frame <- function(x,
223248
n_groups = NULL,
224249
range = NULL,
225250
lowest = 1,
251+
breaks = "exclusive",
226252
labels = NULL,
227253
append = FALSE,
228254
ignore_case = FALSE,
@@ -260,6 +286,7 @@ categorize.data.frame <- function(x,
260286
n_groups = n_groups,
261287
range = range,
262288
lowest = lowest,
289+
breaks = breaks,
263290
labels = labels,
264291
verbose = verbose,
265292
...
@@ -276,6 +303,7 @@ categorize.grouped_df <- function(x,
276303
n_groups = NULL,
277304
range = NULL,
278305
lowest = 1,
306+
breaks = "exclusive",
279307
labels = NULL,
280308
append = FALSE,
281309
ignore_case = FALSE,
@@ -319,6 +347,7 @@ categorize.grouped_df <- function(x,
319347
n_groups = n_groups,
320348
range = range,
321349
lowest = lowest,
350+
breaks = breaks,
322351
labels = labels,
323352
select = select,
324353
exclude = exclude,
@@ -375,20 +404,26 @@ categorize.grouped_df <- function(x,
375404
}
376405

377406

378-
.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) {
407+
.original_x_to_factor <- function(original_x, x, cut_result, labels, out, verbose, ...) {
379408
if (!is.null(labels)) {
380409
if (length(labels) == length(unique(out))) {
381410
original_x <- as.factor(original_x)
382411
levels(original_x) <- labels
383-
} else if (length(labels) == 1 && labels %in% c("mean", "median")) {
412+
} else if (length(labels) == 1 && labels %in% c("mean", "median", "range", "observed")) {
384413
original_x <- as.factor(original_x)
385414
no_na_x <- original_x[!is.na(original_x)]
386-
if (labels == "mean") {
387-
labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
388-
} else {
389-
labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
390-
}
391-
levels(original_x) <- insight::format_value(labels, ...)
415+
out <- switch(labels,
416+
mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x,
417+
median = stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x,
418+
# labels basically like what "cut()" returns
419+
range = levels(cut_result),
420+
# range based on the values that are actually present in the data
421+
{
422+
temp <- stats::aggregate(x, list(no_na_x), FUN = range, na.rm = TRUE)$x
423+
apply(temp, 1, function(i) paste0("(", paste(as.vector(i), collapse = "-"), ")"))
424+
}
425+
)
426+
levels(original_x) <- insight::format_value(out, ...)
392427
} else if (isTRUE(verbose)) {
393428
insight::format_warning(
394429
"Argument `labels` and levels of the recoded variable are not of the same length.",

R/to_numeric.R

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@
1717
#' @inheritParams extract_column_names
1818
#' @inheritParams categorize
1919
#'
20-
#' @note By default, `to_numeric()` converts factors into "binary" dummies, i.e.
20+
#' @note When factors should be converted into multiple "binary" dummies, i.e.
2121
#' each factor level is converted into a separate column filled with a binary
22-
#' 0-1 value. If only one column is required, use `dummy_factors = FALSE`. If
23-
#' you want to preserve the original factor levels (in case these represent
24-
#' numeric values), use `preserve_levels = TRUE`.
22+
#' 0-1 value, set `dummy_factors = TRUE`. If you want to preserve the original
23+
#' factor levels (in case these represent numeric values), use
24+
#' `preserve_levels = TRUE`.
2525
#'
2626
#' @section Selection of variables - `select` argument:
2727
#' For most functions that have a `select` argument the complete input data
@@ -34,12 +34,12 @@
3434
#'
3535
#' @examples
3636
#' to_numeric(head(ToothGrowth))
37-
#' to_numeric(head(ToothGrowth), dummy_factors = FALSE)
37+
#' to_numeric(head(ToothGrowth), dummy_factors = TRUE)
3838
#'
3939
#' # factors
4040
#' x <- as.factor(mtcars$gear)
41-
#' to_numeric(x, dummy_factors = FALSE)
42-
#' to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE)
41+
#' to_numeric(x)
42+
#' to_numeric(x, preserve_levels = TRUE)
4343
#' # same as:
4444
#' coerce_to_numeric(x)
4545
#'
@@ -69,7 +69,7 @@ to_numeric.default <- function(x, verbose = TRUE, ...) {
6969
to_numeric.data.frame <- function(x,
7070
select = NULL,
7171
exclude = NULL,
72-
dummy_factors = TRUE,
72+
dummy_factors = FALSE,
7373
preserve_levels = FALSE,
7474
lowest = NULL,
7575
append = FALSE,
@@ -191,7 +191,7 @@ to_numeric.POSIXlt <- to_numeric.Date
191191

192192
#' @export
193193
to_numeric.factor <- function(x,
194-
dummy_factors = TRUE,
194+
dummy_factors = FALSE,
195195
preserve_levels = FALSE,
196196
lowest = NULL,
197197
verbose = TRUE,

man/categorize.Rd

Lines changed: 24 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/to_numeric.Rd

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)