3131# ' for numeric variables, the minimum of the original input is preserved. For
3232# ' factors, the default minimum is `1`. For `split = "equal_range"`, the
3333# ' default minimum is always `1`, unless specified otherwise in `lowest`.
34+ # ' @param breaks Character, indicating whether breaks for categorizing data are
35+ # ' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or
36+ # ' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_
37+ # ' group or interval to begin). Use `labels = "range"` to make this behaviour
38+ # ' easier to see.
3439# ' @param labels Character vector of value labels. If not `NULL`, `categorize()`
3540# ' will returns factors instead of numeric variables, with `labels` used
36- # ' for labelling the factor levels. Can also be `"mean"` or `"median"` for a
37- # ' factor with labels as the mean/median of each groups.
41+ # ' for labelling the factor levels. Can also be `"mean"`, `"median"`,
42+ # ' `"range"` or `"observed"` for a factor with labels as the mean/median,
43+ # ' the requested range (even if not all values of that range are present in
44+ # ' the data) or observed range (range of the actual recoded values) of each
45+ # ' group. See 'Examples'.
3846# ' @param append Logical or string. If `TRUE`, recoded or converted variables
3947# ' get new column names and are appended (column bind) to `x`, thus returning
4048# ' both the original and the recoded variables. The new columns get a suffix,
5361# '
5462# ' # Splits and breaks (cut-off values)
5563# '
56- # ' Breaks are in general _exclusive_, this means that these values indicate
64+ # ' Breaks are by default _exclusive_, this means that these values indicate
5765# ' the lower bound of the next group or interval to begin. Take a simple
5866# ' example, a numeric variable with values from 1 to 9. The median would be 5,
5967# ' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
6371# ' from 1 to 3 belong to the first interval and are recoded into 1 (because
6472# ' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
6573# '
74+ # ' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which
75+ # ' case
76+ # '
6677# ' # Recoding into groups with equal size or range
6778# '
6879# ' `split = "equal_length"` and `split = "equal_range"` try to divide the
119130# ' x <- sample(1:10, size = 30, replace = TRUE)
120131# ' categorize(x, "equal_length", n_groups = 3, labels = "mean")
121132# ' categorize(x, "equal_length", n_groups = 3, labels = "median")
133+ # '
134+ # ' # cut numeric into groups with the requested range as a label name
135+ # ' # each category has the same range, and labels indicate this range
136+ # ' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
137+ # ' # in this example, each category has the same range, but labels only refer
138+ # ' # to the ranges of the actual values (present in the data) inside each group
139+ # ' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
122140# ' @export
123141categorize <- function (x , ... ) {
124142 UseMethod(" categorize" )
@@ -142,6 +160,7 @@ categorize.numeric <- function(x,
142160 n_groups = NULL ,
143161 range = NULL ,
144162 lowest = 1 ,
163+ breaks = " exclusive" ,
145164 labels = NULL ,
146165 verbose = TRUE ,
147166 ... ) {
@@ -152,6 +171,9 @@ categorize.numeric <- function(x,
152171 if (identical(split , " equal_length" )) split <- " length"
153172 if (identical(split , " equal_range" )) split <- " range"
154173
174+ # check for valid values
175+ breaks <- match.arg(breaks , c(" exclusive" , " inclusive" ))
176+
155177 # save
156178 original_x <- x
157179
@@ -169,9 +191,9 @@ categorize.numeric <- function(x,
169191 }
170192
171193 if (is.numeric(split )) {
172- breaks <- split
194+ category_splits <- split
173195 } else {
174- breaks <- switch (split ,
196+ category_splits <- switch (split ,
175197 median = stats :: median(x ),
176198 mean = mean(x ),
177199 length = n_groups ,
@@ -182,15 +204,18 @@ categorize.numeric <- function(x,
182204 }
183205
184206 # complete ranges, including minimum and maximum
185- if (! identical(split , " length" )) breaks <- unique(c(min(x ), breaks , max(x )))
207+ if (! identical(split , " length" )) {
208+ category_splits <- unique(c(min(x ), category_splits , max(x )))
209+ }
186210
187211 # recode into groups
188212 out <- droplevels(cut(
189213 x ,
190- breaks = breaks ,
214+ breaks = category_splits ,
191215 include.lowest = TRUE ,
192- right = FALSE
216+ right = identical( breaks , " inclusive " )
193217 ))
218+ cut_result <- out
194219 levels(out ) <- 1 : nlevels(out )
195220
196221 # fix lowest value, add back into original vector
@@ -201,7 +226,7 @@ categorize.numeric <- function(x,
201226 original_x [! is.na(original_x )] <- out
202227
203228 # turn into factor?
204- .original_x_to_factor(original_x , x , labels , out , verbose , ... )
229+ .original_x_to_factor(original_x , x , cut_result , labels , out , verbose , ... )
205230}
206231
207232
@@ -223,6 +248,7 @@ categorize.data.frame <- function(x,
223248 n_groups = NULL ,
224249 range = NULL ,
225250 lowest = 1 ,
251+ breaks = " exclusive" ,
226252 labels = NULL ,
227253 append = FALSE ,
228254 ignore_case = FALSE ,
@@ -260,6 +286,7 @@ categorize.data.frame <- function(x,
260286 n_groups = n_groups ,
261287 range = range ,
262288 lowest = lowest ,
289+ breaks = breaks ,
263290 labels = labels ,
264291 verbose = verbose ,
265292 ...
@@ -276,6 +303,7 @@ categorize.grouped_df <- function(x,
276303 n_groups = NULL ,
277304 range = NULL ,
278305 lowest = 1 ,
306+ breaks = " exclusive" ,
279307 labels = NULL ,
280308 append = FALSE ,
281309 ignore_case = FALSE ,
@@ -319,6 +347,7 @@ categorize.grouped_df <- function(x,
319347 n_groups = n_groups ,
320348 range = range ,
321349 lowest = lowest ,
350+ breaks = breaks ,
322351 labels = labels ,
323352 select = select ,
324353 exclude = exclude ,
@@ -375,20 +404,26 @@ categorize.grouped_df <- function(x,
375404}
376405
377406
378- .original_x_to_factor <- function (original_x , x , labels , out , verbose , ... ) {
407+ .original_x_to_factor <- function (original_x , x , cut_result , labels , out , verbose , ... ) {
379408 if (! is.null(labels )) {
380409 if (length(labels ) == length(unique(out ))) {
381410 original_x <- as.factor(original_x )
382411 levels(original_x ) <- labels
383- } else if (length(labels ) == 1 && labels %in% c(" mean" , " median" )) {
412+ } else if (length(labels ) == 1 && labels %in% c(" mean" , " median" , " range " , " observed " )) {
384413 original_x <- as.factor(original_x )
385414 no_na_x <- original_x [! is.na(original_x )]
386- if (labels == " mean" ) {
387- labels <- stats :: aggregate(x , list (no_na_x ), FUN = mean , na.rm = TRUE )$ x
388- } else {
389- labels <- stats :: aggregate(x , list (no_na_x ), FUN = stats :: median , na.rm = TRUE )$ x
390- }
391- levels(original_x ) <- insight :: format_value(labels , ... )
415+ out <- switch (labels ,
416+ mean = stats :: aggregate(x , list (no_na_x ), FUN = mean , na.rm = TRUE )$ x ,
417+ median = stats :: aggregate(x , list (no_na_x ), FUN = stats :: median , na.rm = TRUE )$ x ,
418+ # labels basically like what "cut()" returns
419+ range = levels(cut_result ),
420+ # range based on the values that are actually present in the data
421+ {
422+ temp <- stats :: aggregate(x , list (no_na_x ), FUN = range , na.rm = TRUE )$ x
423+ apply(temp , 1 , function (i ) paste0(" (" , paste(as.vector(i ), collapse = " -" ), " )" ))
424+ }
425+ )
426+ levels(original_x ) <- insight :: format_value(out , ... )
392427 } else if (isTRUE(verbose )) {
393428 insight :: format_warning(
394429 " Argument `labels` and levels of the recoded variable are not of the same length." ,
0 commit comments