diff --git a/DESCRIPTION b/DESCRIPTION
index 8513edda5..5cf7e7173 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: ggeffects
 Type: Package
 Encoding: UTF-8
 Title: Create Tidy Data Frames of Marginal Effects for 'ggplot' from Model Outputs
-Version: 1.2.3.9
+Version: 1.2.3.10
 Authors@R: c(
       person("Daniel", "Lüdecke", role = c("aut", "cre"), email = "d.luedecke@uke.de", comment = c(ORCID = "0000-0002-8895-3206")),
       person("Frederik", "Aust", role = "ctb", comment = c(ORCID = "0000-0003-4900-788X")),
diff --git a/NEWS.md b/NEWS.md
index 512a8abc5..72bfbdb3a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -14,6 +14,10 @@
 
 * `tibbles` are always converted into data frames, to avoid issues.
 
+* `hypothesis_test()` gains a `by` argument, to specify a variable that is used
+  to group the comparisons or contrasts. This is useful for models with interaction
+  terms.
+
 ## Bug fixes
 
 * Plotting residuals did not work when model object passed to `ggpredict()`
diff --git a/R/hypothesis_test.R b/R/hypothesis_test.R
index 7e933cc0a..e183944b6 100644
--- a/R/hypothesis_test.R
+++ b/R/hypothesis_test.R
@@ -14,6 +14,11 @@
 #'   contrasts or comparisons for the *slopes* of this numeric predictor are
 #'   computed (possibly grouped by the levels of further categorical focal
 #'   predictors).
+#' @param by Character vector specifying the names of predictors to condition on.
+#'   Hypothesis test is then carried out for focal terms by each level of `by`
+#'   variables. This is useful especially for interaction terms, where we want
+#'   to test the interaction within "groups". `by` is only relevant for
+#'   categorical predictors.
 #' @param scale Character string, indicating the scale on which the contrasts
 #'   or comparisons are represented. Can be `"response"` (default), which would
 #'   return contrasts on the response scale (e.g. for logistic regression, as
@@ -122,6 +127,9 @@
 #'   # p-value adjustment
 #'   hypothesis_test(m, c("c161sex", "c172code"), p_adjust = "tukey")
 #'
+#'   # not all comparisons, only by specific group levels
+#'   hypothesis_test(m, "c172code", by = "c161sex")
+#'
 #'   # specific comparisons
 #'   hypothesis_test(m, c("c161sex", "c172code"), test = "b2 = b1")
 #'
@@ -139,6 +147,7 @@ hypothesis_test <- function(model, ...) {
 #' @export
 hypothesis_test.default <- function(model,
                                     terms = NULL,
+                                    by = NULL,
                                     test = "pairwise",
                                     equivalence = NULL,
                                     scale = "response",
@@ -205,7 +214,12 @@ hypothesis_test.default <- function(model,
   need_average_predictions <- insight::is_mixed_model(model)
   msg_intervals <- FALSE
 
-  # we want contrasts or comparisons for these focal predictors...
+  # by-variables are included in terms
+  if (!is.null(by)) {
+    terms <- unique(c(terms, by))
+  }
+
+  # we want contrasts or comparisons for these focal predictors...  
   focal <- .clean_terms(terms)
 
   # check if we have a mixed model - in this case, we need to ensure that our
@@ -222,6 +236,27 @@ hypothesis_test.default <- function(model,
   }
   grid <- data_grid(model, terms, ...)
 
+  # check for valid by-variable
+  if (!is.null(by)) {
+    # all by-terms need to be in data grid
+    if (!all(by %in% colnames(grid))) {
+      insight::format_error(
+        paste0("Variable(s) `", toString(by[!by %in% colnames(grid)]), "` not found in data grid.")
+      )
+    }
+    # by-terms must be categorical
+    by_factors <- vapply(grid[by], is.factor, TRUE)
+    if (!all(by_factors)) {
+      insight::format_error(
+        "All variables in `by` must be categorical.",
+        paste0(
+          "The following variables in `by` are not categorical: ",
+          toString(paste0("`", by[!by_factors], "`"))
+        )
+      )
+    }
+  }
+
   by_arg <- NULL
   # for models with ordinal/categorical outcome, we need focal terms and
   # response variable as "by" argument
@@ -580,11 +615,6 @@ hypothesis_test.default <- function(model,
   out$conf.high <- .comparisons$conf.high
   out$p.value <- .comparisons$p.value
 
-  # p-value adjustment?
-  if (!is.null(p_adjust)) {
-    out <- .p_adjust(out, p_adjust, .comparisons$statistic, grid, focal, df, verbose)
-  }
-
   # for pairwise comparisons, we may have comparisons inside one level when we
   # have multiple focal terms, like "1-1" and "a-b". In this case, the comparison
   # of 1 to 1 ("1-1") is just the contrast for the level "1", we therefore can
@@ -602,6 +632,37 @@ hypothesis_test.default <- function(model,
     }
   }
 
+  # filter by-variables?
+  if (!is.null(by)) {
+    for (by_factor in by) {
+      # values in "by" are character vectors, which are saved as "level-level".
+      # we now extract the unique values, and filter the data frame
+      unique_values <- unique(grid[[by_factor]])
+      by_levels <- paste0(unique_values, "-", unique_values)
+      keep_rows <- out[[by_factor]] %in% c(by_levels, unique_values)
+      # filter final data frame
+      out <- out[keep_rows, , drop = FALSE]
+      # but we also need to filter the ".comparisons" data frame
+      .comparisons <- .comparisons[keep_rows, , drop = FALSE]
+      # finally, replace "level-level" just by "level"
+      for (i in seq_along(by_levels)) {
+        out[[by_factor]] <- gsub(
+          by_levels[i],
+          unique_values[i],
+          out[[by_factor]],
+          fixed = TRUE
+        )
+      }
+    }
+    # remove by-terms from focal terms
+    focal <- focal[!focal %in% by]
+  }
+
+  # p-value adjustment?
+  if (!is.null(p_adjust)) {
+    out <- .p_adjust(out, p_adjust, .comparisons$statistic, grid, focal, df, verbose)
+  }
+
   # add back response levels?
   if ("group" %in% colnames(.comparisons)) {
     out <- cbind(
@@ -638,6 +699,7 @@ hypothesis_test.default <- function(model,
 #' @rdname hypothesis_test
 #' @export
 hypothesis_test.ggeffects <- function(model,
+                                      by = NULL,
                                       test = "pairwise",
                                       equivalence = NULL,
                                       p_adjust = NULL,
@@ -655,6 +717,7 @@ hypothesis_test.ggeffects <- function(model,
   hypothesis_test.default(
     model,
     terms = focal,
+    by = by,
     test = test,
     equivalence = equivalence,
     p_adjust = p_adjust,
diff --git a/man/hypothesis_test.Rd b/man/hypothesis_test.Rd
index 74175be84..69361f8f6 100644
--- a/man/hypothesis_test.Rd
+++ b/man/hypothesis_test.Rd
@@ -11,6 +11,7 @@ hypothesis_test(model, ...)
 \method{hypothesis_test}{default}(
   model,
   terms = NULL,
+  by = NULL,
   test = "pairwise",
   equivalence = NULL,
   scale = "response",
@@ -25,6 +26,7 @@ hypothesis_test(model, ...)
 
 \method{hypothesis_test}{ggeffects}(
   model,
+  by = NULL,
   test = "pairwise",
   equivalence = NULL,
   p_adjust = NULL,
@@ -51,6 +53,12 @@ contrasts or comparisons for the \emph{slopes} of this numeric predictor are
 computed (possibly grouped by the levels of further categorical focal
 predictors).}
 
+\item{by}{Character vector specifying the names of predictors to condition on.
+Hypothesis test is then only carried out within each level of \code{by} variables,
+and not by all combinations of levels of \code{by} variables. This is useful
+especially for interaction terms, where we want to test the interaction
+within "groups". \code{by} is only relevant for categorical predictors.}
+
 \item{test}{Hypothesis to test. By default, pairwise-comparisons are
 conducted. See section \emph{Introduction into contrasts and pairwise comparisons}.}
 
diff --git a/tests/testthat/test-hypothesis_test.R b/tests/testthat/test-hypothesis_test.R
index 3def02cd9..62807d7ba 100644
--- a/tests/testthat/test-hypothesis_test.R
+++ b/tests/testthat/test-hypothesis_test.R
@@ -38,7 +38,7 @@ if (suppressWarnings(requiet("testthat") && requiet("ggeffects") && requiet("mar
     expect_equal(
       out1$p.value,
       c(
-        0.074, 0.3503, 0.5295, 0.7504, 0.5729, 0.009, 0.0258, 0.1497, 
+        0.074, 0.3503, 0.5295, 0.7504, 0.5729, 0.009, 0.0258, 0.1497,
         0.2033, 0.8163, 0.2247, 0.137, 0.3676, 0.2538, 0.8215
       ),
       tolerance = 1e-3,
@@ -47,7 +47,7 @@ if (suppressWarnings(requiet("testthat") && requiet("ggeffects") && requiet("mar
     expect_equal(
       out2$p.value,
       c(
-        0.4704, 0.9366, 0.9887, 0.9996, 0.9931, 0.0927, 0.2215, 0.6985, 
+        0.4704, 0.9366, 0.9887, 0.9996, 0.9931, 0.0927, 0.2215, 0.6985,
         0.7976, 0.9999, 0.8276, 0.6689, 0.9453, 0.862, 0.9999
       ),
       tolerance = 1e-3,
@@ -69,7 +69,7 @@ if (suppressWarnings(requiet("testthat") && requiet("ggeffects") && requiet("mar
 
   data(iris)
   model2 <- lm(Sepal.Width ~ Sepal.Length * Species, data = iris)
-  test_that("ggpredict, lm", {
+  test_that("hypothesis_test, interaction", {
     out <- hypothesis_test(model2, c("Sepal.Length", "Species"))
     expect_identical(colnames(out), c(
       "Sepal.Length", "Species", "Contrast", "conf.low", "conf.high",
@@ -82,6 +82,41 @@ if (suppressWarnings(requiet("testthat") && requiet("ggeffects") && requiet("mar
     expect_identical(out$Sepal.Length, c("slope", "slope", "slope"))
   })
 
+  test_that("hypothesis_test, by-argument", {
+    skip_if_not_installed("datawizard")
+    data(efc)
+    efc$c161sex <- datawizard::to_factor(efc$c161sex)
+    efc$c172code <- datawizard::to_factor(efc$c172code)
+
+    mfilter <- lm(neg_c_7 ~ c161sex * c172code + e42dep + c12hour, data = efc)
+    prfilter <- ggpredict(mfilter, "c172code")
+
+    out <- hypothesis_test(prfilter, by = "c161sex")
+    expect_identical(nrow(out), 6L)
+    expect_identical(
+      out$c172code,
+      c(
+        "low level of education-intermediate level of education",
+        "low level of education-high level of education",
+        "intermediate level of education-high level of education",
+        "low level of education-intermediate level of education",
+        "low level of education-high level of education",
+        "intermediate level of education-high level of education"
+      )
+    )
+    expect_equal(out$p.value, c(0.3962, 0.6512, 0.7424, 0.9491, 0.0721, 0.0288), tolerance = 1e-3)
+
+    out <- hypothesis_test(prfilter, by = "c161sex", p_adjust = "tukey")
+    expect_equal(out$p.value, c(0.6727, 0.8934, 0.9422, 0.9978, 0.1699, 0.0734), tolerance = 1e-3)
+
+    prfilter <- ggpredict(mfilter, c("c172code", "c161sex"))
+    out <- hypothesis_test(prfilter, p_adjust = "tukey")
+    out <- out[out$c161sex %in% c("Male-Male", "Female-Female"), , drop = FALSE]
+    expect_equal(out$p.value, c(0.9581, 0.9976, 0.9995, 1, 0.4657, 0.2432), tolerance = 1e-3)
+
+    expect_error(hypothesis_test(mfilter, "c161sex", by = "c12hour"), regex = "categorical")
+  })
+
   if (suppressWarnings(requiet("lme4"))) {
     model3 <- suppressMessages(lmer(outcome ~ groups * episode + sex + (1 | ID), data = d))
     test_that("hypothesis_test, categorical, pairwise", {