Using limits in scale_x_continuous with geom_histogram removes values even when they are well within the limits

37 Views Asked by At

I want to set wide limits on the x-axis of a histogram for compatibility with previous plots in the same document. I thought I should be able to do this by setting the limits in scale_x_continuous(), as the limits I want are outside the range of the data. However, doing this, even with absurdly high limits, censors data with a message about missing (rather than out-of-limits) data. I can solve my problem with coord_cartesian(xlim = c(-3, 5)) to get what I want, but I want to understand why scale_x_continuous(limits = c(-3, 5)) behaves the way it does. I think the warning message could be improved, and the help around oob in scale_x_continuous(), which suggested that oob might have solved things for me, did not seem to enable me to use the oob argument!

Here is a reprex which I think shows the whole story.

library(tidyverse)

tibScores <- structure(list(scores = c(1.3221868551862, 0.641571901750246, 
                                       0.679997504675969, 1.69269563506049, 0.0863798463356552, 1.17140778928003, 
                                       0.575259325846973, 1.76102607380206, 0.816982263242756, 1.36699542376874, 
                                       0.652688919189357, -0.0556963218639883, 0.447405816980878, 1.66074084421999, 
                                       2.1616205500135, 0.536966252715301, 0.863735587473018, -0.0838092754807627, 
                                       1.44139911944675, 0.28123124435063, 0.726886572797169, 1.28139850105958, 
                                       -1.06200317504591, 1.33947497571541, 1.25321267994842, 2.26719146654814, 
                                       0.0391905158098528, -0.325762556819256, 1.32308861983067, 0.67539246225335, 
                                       3.15411661921818, 2.06629001625931, 0.565898473951559, 1.53933053768839, 
                                       0.391723502639786, 8.20432000704363e-05, 2.06594609314108, 1.30682345061009, 
                                       0.605298461881756, 1.58497799066082, 1.68990269481407, 0.476040346123826, 
                                       0.844876986409903, -0.164609368672023, 2.20658953135608, 0.998615148039355, 
                                       1.15569174851264, 2.02756244449021, 2.47305476162932, 0.728964653094446, 
                                       1.48997692828698, 4.38774920562831, 2.00346155661968, 1.49193102247697, 
                                       2.28013856141223, 2.76792989289636, 3.26522037161048, 1.7030572330355, 
                                       1.58105144823292, 2.54872692975434, 2.91883589732548, 2.01003029310646, 
                                       2.6215879604812, 1.9739800810684, 2.35983308929714, 1.65115537149953, 
                                       0.895557980659816, 2.67904664720373, 1.61291641056518, 1.80793828366117, 
                                       3.14825826246364, 2.66897888275127, 1.49863234127131, 0.37011916381626, 
                                       2.67629185248689, 3.39374742622388, 2.52608956918892, 0.336324241851188, 
                                       1.87099745872943, 4.10493845361578), 
                            grp = c("NHS", "NHS", "NHS", 
                                    "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", 
                                    "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", 
                                    "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", 
                                    "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", "NHS", 
                                    "NHS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", 
                                    "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", 
                                    "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", 
                                    "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS", "HS")), 
                       row.names = c(NA, -80L), 
                       class = c("tbl_df", "tbl", "data.frame"))

# Baseline: per-group density histograms drawn on top of each other
# (position = "identity"), with the default x-axis limits.
ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  )

### but, for comparison with a preceding plot, I want x limits at -3 and 5 so
# Same histogram, now with integer axis breaks from -3 to 5.
# Only the breaks are set here; no limits are passed to the scale.
ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  ) +
  scale_x_continuous(name = "Scores", breaks = -3:5)

### but then I remember that the breaks don't set the limits so:
# Same histogram with both breaks and limits set on the x scale.
# This is the call that triggers the "Removed 4 rows" warning quoted below.
ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  ) +
  scale_x_continuous(
    name = "Scores",
    limits = c(-3, 5),
    breaks = -3:5
  )
### but now:
# Warning message:
#   Removed 4 rows containing missing values (`geom_bar()`). 
### Why four points censored out?  There are no missing values so that message is misleading
### and it must be about the limits.

### so I sleuth around and find https://stackoverflow.com/questions/32505298/explain-ggplot2-warning-removed-k-rows-containing-missing-values
### and this gets me what I want but I still don't understand the censoring when I use scale_x_continuous with limits.
# Set the visible x range through the coordinate system instead of the scale:
# breaks stay on the scale, while coord_cartesian(xlim = ...) sets the range shown.
# (A stray, unmatched `limits = c(-3, 5))` fragment that followed this call and
# made the snippet unparseable has been removed.)
ggplot(data = tibScores,
       aes(x = scores, fill = grp)) +
  geom_histogram(aes(y = after_stat(density)),
                 position = "identity",
                 colour = "black",
                 alpha = .8,
                 bins = 30) +
  scale_x_continuous("Scores",
                     breaks = -3:5) +
  coord_cartesian(xlim = c(-3, 5))

# Baseline: per-group density histograms drawn on top of each other
# (position = "identity"), with the default x-axis limits.
ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  )

### but, for comparison with a preceding plot, I want x limits at -3 and 5 so
# Same histogram, now with integer axis breaks from -3 to 5.
# Only the breaks are set here; no limits are passed to the scale.
ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  ) +
  scale_x_continuous(name = "Scores", breaks = -3:5)

### but then I remember that the breaks don't set the limits so I do this
# Same histogram with both breaks and limits set on the x scale.
# This is the call that triggers the "Removed 4 rows" warning quoted below.
ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  ) +
  scale_x_continuous(
    name = "Scores",
    limits = c(-3, 5),
    breaks = -3:5
  )
### but that gets me:
# Warning message:
#   Removed 4 rows containing missing values (`geom_bar()`). 
### pushing the limits up to c(-3, 8) still gets the censoring
### there are no missing values so to me the warning is misleading
### and I really can't understand the behaviour

### I also discover oob (which is in the scale_x_continuous help, I'd just never used it)
# Attempt to keep out-of-bounds values by passing `keep` as the oob handler.
# NOTE: this call is kept exactly as attempted; it raises the error quoted
# below. With the tidyverse attached, the bare name `keep` resolves to
# purrr::keep rather than a scales oob function.
ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  ) +
  scale_x_continuous(
    name = "Scores",
    oob = keep,
    limits = c(-3, 5),
    breaks = -3:5
  )

### gives me:
# Error in `oob()`:
#   ℹ In index: 1.
# Caused by error:
#   ! `.p()` must return a single `TRUE` or `FALSE`, not NULL.
# Run `rlang::last_trace()` to see where the error occurred.
### and I'm sorry, but the help and its links there don't help me!

### but I do find coord_cartesian(xlim = ...)

# Breaks are set on the scale; the visible x range is set separately through
# coord_cartesian(xlim = ...) rather than through scale limits.
ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  ) +
  scale_x_continuous(name = "Scores", breaks = -3:5) +
  coord_cartesian(xlim = c(-3, 5))

### and that works, but I am still baffled by the censoring I get when I use scale_x_continuous(limits = ...)

I hope someone can clarify things for me. TIA, Chris

1

There is 1 best solution below

1
stefan On

First, the issue with using oob = keep is that it should be oob = scales::oob_keep. You are passing purrr::keep to the oob= argument. Hence you get an error.

Second, the difference when setting the limits via coord_cartesian is that the latter will not alter the data, i.e. it will simply zoom on the desired range, whereas setting the limits via the scale will alter the data.

Third, I agree that the docs and/or warnings might be improved. So, feel free to propose an improvement as a feature request or ... .

The general issue is that for the outermost bars the xmin/xmax values fall outside of the range of the data, i.e. the outermost bins are centered at the minimum/maximum value of your data. While the default limits account for that, this fact is often forgotten when setting the limits manually. And in the case of a histogram, the outermost bars then get dropped.

This can be seen clearly by means of a more minimal example (code below). The top left shows a histogram with the default limits where I added two vertical lines for the data range. This shows that the outermost bars fall outside of the data range. As a consequence, when fixing the limits to the data range these bars get dropped as shown in the top right. Besides setting the limits via coord_cartesian this could be fixed by using oob_keep as shown in the bottom left. Or as another option you might consider using boundary= to align the bars with the boundary of the bins. Note however, that this changes the binning.

Now to your real data. Here the issue is a bit special, as of course all data and even the outermost bars of your histogram clearly fall inside the limits you have set. However, as I have already said, when you set the limits via the scale you alter the data. And in your case the issue is not that data is removed but that data is added (I wasn't aware of that either), i.e. by expanding the limits some invisible zero-height bars are added to the data and the plot. And even though they are invisible, what I have said before also applies to your case: the outermost bars get dropped.

To make this visible I used after_stat and after_scale to make the invisible bars visible:

# Diagnostic version of the histogram: bins whose density is not above zero are
# drawn at height .5 with their fill set to NA after scaling, so the otherwise
# invisible empty bins show up as outlined bars. Red vertical lines mark
# x = -3 and x = 5.
pp <- ggplot(tibScores, mapping = aes(x = scores, fill = grp)) +
  geom_histogram(
    mapping = aes(
      y = after_stat(if_else(density > 0, density, .5)),
      fill = stage(
        start = grp,
        after_scale = if_else(density > 0, fill, NA_character_)
      )
    ),
    bins = 30,
    alpha = .8,
    colour = "black",
    position = "identity"
  ) +
  geom_vline(xintercept = c(-3, 5), color = "red") +
  labs(subtitle = "Default")

# Variant of `pp` with breaks and limits fixed on the x scale.
pp1 <- pp +
  scale_x_continuous(name = "Scores", limits = c(-3, 5), breaks = -3:5) +
  labs(subtitle = "Fixing the limits will censor or drop outermost bars")

# Stack the default plot above the fixed-limits plot in a single column.
# wrap_plots() is namespace-qualified because patchwork has not been attached
# at this point in the page, so a bare call would fail with
# "could not find function".
list(pp, pp1) |>
  patchwork::wrap_plots(ncol = 1)

enter image description here CODE

library(ggplot2)

# Fix the RNG seed so the 100 uniform draws on [0, 1] are reproducible.
set.seed(1)

# One-column data frame holding the simulated values.
dat <- data.frame(x = runif(n = 100, min = 0, max = 1))

# Shared base: the data mapped to x, plus red reference lines at 0 and 1.
p <- ggplot(data = dat, mapping = aes(x = x)) +
  geom_vline(xintercept = c(0, 1), color = "red")

# Histogram with default bins and default limits.
p0 <- p + geom_histogram() + labs(subtitle = "Default")

# Limits fixed on the scale at c(0, 1).
p1 <- p0 +
  scale_x_continuous(limits = c(0, 1)) +
  labs(subtitle = "Fixing the limits will censor\nor drop outermost bars")

# Same limits, with scales::oob_keep as the out-of-bounds handler.
p2 <- p0 +
  scale_x_continuous(oob = scales::oob_keep, limits = c(0, 1)) +
  labs(subtitle = "oob_keep will keep them")

# Same limits, with the bins aligned to a boundary at 0 instead of centered.
p3 <- p +
  geom_histogram(boundary = 0) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(subtitle = "boundary = 0: Do not center the bars")

library(patchwork)

# Combine the four comparison plots into one patchwork layout.
wrap_plots(list(p0, p1, p2, p3))