A Guide to the BKP R package • BKP

Installation

You can install the stable version of BKP from CRAN with:

install.packages("BKP")

Or install the development version from GitHub with:

# install.packages("pak")
pak::pak("Jiangyan-Zhao/BKP")

BKP model

library(BKP)
library(tgp)

Example 1

Let $x\in[-2,2]$ , and suppose the true Bernoulli probability function is given by $\pi_{1}(x) = \frac{1}{1+e^{-3x}}.$

# Define true success probability function
true_pi_fun <- function(x) {
  1/(1+exp(-3*x))
}
X <- seq(-2, 2, length = 1000)
true_pi <- true_pi_fun(X)
plot(X, true_pi, type = "l", lwd = 2, xlab = "x", ylab = "Probability")

We aim to fit the BKP model based on seven input locations that are uniformly distributed over $[-2,2]$ , with each location associated with a binomial observation having a maximum trial count of 100.

set.seed(123)
# Data generation
n <- 7
Xbounds <- matrix(c(-2,2), nrow = 1)
X <- lhs(n = n, rect = Xbounds)
true_pi <- true_pi_fun(X)
m <- sample(100, n, replace = TRUE)
y <- rbinom(n, size = m, prob = true_pi)

# Fit BKP model
BKP_model_1D_1 <- fit.BKP(X, y, m, Xbounds = Xbounds)

# Plot results
plot(BKP_model_1D_1)

# Add true probability function
Xnew = matrix(seq(-2, 2, length = 100), ncol=1)
true_pi <- true_pi_fun(Xnew)
lines(Xnew,true_pi, col = "black", lwd = 2)
legend(x = -2.24,y = 0.89, lwd = 2, bty = "n", col = "black",
       legend = "Ture Probability", inset = 0.02)

Example 2

The first example is essentially a generalized linear model with a smooth logit link, and thus poses limited modeling complexity. To demonstrate the capability of the BKP model in handling more challenging classification structures, we consider a second example with a highly nonlinear underlying probability surface. Define the true Bernoulli probability as $\pi_{2}(x) = \frac{1}{2}\left[ 1+e^{-x^{2}}\cos \left( 10\frac{1-e^{-x}}{1+e^{-x}} \right) \right],$ where $x \in [-2,2]$ . This example involves rapid local oscillations and strong nonlinearity, making it substantially more difficult to fit than Example 1. Here, we increase the number of locations to 30.

# Define true success probability function
true_pi_fun <- function(x) {
  (1 + exp(-x^2) * cos(10 * (1 - exp(-x)) / (1 + exp(-x)))) / 2
}
X <- seq(-2, 2, length = 1000)
true_pi <- true_pi_fun(X)
plot(X, true_pi, type = "l", lwd = 2,
     xlab = "x", ylab = "Probability")

set.seed(123)
# Data generation
n <- 30
Xbounds <- matrix(c(-2,2), nrow = 1)
X <- lhs(n = n, rect = Xbounds)
true_pi <- true_pi_fun(X)
m <- sample(100, n, replace = TRUE)
y <- rbinom(n, size = m, prob = true_pi)

# Fit BKP model
BKP_model_1D_2 <- fit.BKP(X, y, m, Xbounds = Xbounds)

# Plot results
plot(BKP_model_1D_2)

# Add true probability function
Xnew = matrix(seq(-2, 2, length = 100), ncol=1)
true_pi <- true_pi_fun(Xnew)
lines(Xnew,true_pi, col = "black", lwd = 2)
legend(x = -2.24,y = 0.89, lwd = 2, bty = "n", col = "black",
       legend = "Ture Probability", inset = 0.02)

Example 3

Let $\vec{x} \in [0,1]^2$ , and define the latent surface using a re-scaled version of the Goldstein–Price function: $\begin{align*} f(\vec{x}) &= \frac{\log\left[\left\{1+a(\vec{x})\right\}\left\{30+b(\vec{x})\right\} - 8.6928\right]}{2.4269}, \quad \text{with} \\ a(\vec{x}) &= \left(4 x_1 + 4 x_2 - 3 \right)^2 \cdot \Big\{75 - 56 (x_1 + x_2) + 3(4 x_1 - 2)^2 \\ &\quad + 6(4 x_1 - 2)(4 x_2 - 2) + 3(4 x_2 - 2)^2\Big\}, \\ b(\vec{x}) &= \left(8 x_1 - 12 x_2 +2 \right)^2 \cdot \Big\{-14 - 128 x_1 + 12(4 x_1 - 2)^2 \\ &\quad + 192 x_2 - 36(4 x_1 - 2)(4 x_2 - 2) + 27(4 x_2 - 2)^2\Big\}. \end{align*}$

The true Bernoulli probability surface is then defined by $\pi_3(\vec{x}) = \Phi\{f(\vec{x})\},$ where $\Phi(\cdot)$ is the cumulative distribution function of the standard normal distribution.

# Define 2D latent function and probability transformation
true_pi_fun <- function(X) {
  if(is.null(nrow(X))) X <- matrix(X, nrow=1)
  m <- 8.6928
  s <- 2.4269
  x1 <- 4*X[,1]- 2
  x2 <- 4*X[,2]- 2
  a <- 1 + (x1 + x2 + 1)^2 *
    (19- 14*x1 + 3*x1^2- 14*x2 + 6*x1*x2 + 3*x2^2)
  b <- 30 + (2*x1- 3*x2)^2 *
    (18- 32*x1 + 12*x1^2 + 48*x2- 36*x1*x2 + 27*x2^2)
  f <- log(a*b)
  f <- (f- m)/s
  return(pnorm(f))  # Transform to probability
}
x1 <- seq(0, 1, length.out = 100)
x2 <- seq(0, 1, length.out = 100)
X <- expand.grid(x1 = x1, x2 = x2)
true_pi <- true_pi_fun(X)
df <- data.frame(x1 = X$x1, x2 = X$x2, True = true_pi)
print(BKP:::my_2D_plot_fun("True", title = "True Probability", data = df))

To construct the training data, we generate a LHD of size $100$ over $[0,1]^2$ . Each location is associated with a binomial observation whose number of trials is randomly drawn from $\{1, \dots, 100\}$ .

set.seed(123)
# Data generation
n <- 100
Xbounds <- matrix(c(0, 0, 1, 1), nrow = 2)
X <- lhs(n = n, rect = Xbounds)
true_pi <- true_pi_fun(X)
m <- sample(100, n, replace = TRUE)
y <- rbinom(n, size = m, prob = true_pi)

# Fit BKP model
BKP_model_2D <- fit.BKP(X, y, m, Xbounds=Xbounds)

# Plot results
plot(BKP_model_2D)

Example 4

We next consider a binary classification task using the Two Spirals dataset, a well-known benchmark consisting of two intertwined spirals in a bounded two-dimensional input space.

We generate $n = 200$ observations using the mlbench.spirals function from the R package mlbench, with two complete rotations and additive Gaussian noise of standard deviation sd = 0.05. The inputs $\vec{x}$ are constrained to the domain $[-1.7, 1.7]^2$ , and the binary class labels are encoded as 0 and 1. We fit the BKP model using a fixed prior specification with r_0 = 0.1 and p_0 = 0.5.

library(mlbench) 
set.seed(123)
# Data
n <- 200
data <- mlbench.spirals(n, cycles = 2, sd = 0.05)
X <- data$x
y <- as.numeric(data$classes) - 1  # Convert to 0/1 for BKP
m <- rep(1, n)
Xbounds <- rbind(c(-1.7, 1.7), c(-1.7, 1.7))

# Fit model
BKP_model_Class <- fit.BKP(
  X, y, m, Xbounds = Xbounds,
  prior = "fixed", r0 = 0.1, loss = "log_loss")

# Plot results
plot(BKP_model_Class)

DKP model

Example 5

Consider a one-dimensional three-class classification problem. The input is defined on the interval $x \in [-2, 2]$ , and the true class probability vector is given by $\vec{\pi}(x) = \left[\frac{\pi_{1}(x)}{2}, \frac{\pi_{2}(x)}{2},1-\frac{\pi_{1}(x)}{2}-\frac{\pi_{2}(x)}{2}\right]^\top,$ where $\pi_{1}(x)$ and $\pi_{2}(x)$ are smooth functions defined in Example 1 and 2, respectively. We generate $n = 30$ input locations using Latin hypercube sampling over the interval $[-2, 2]$ . At each location, the response is a multinomial vector with probability $\vec{\pi}(x)$ and a random total count sampled from $\{1, \dots, 150\}$ .

set.seed(123)
# Define true class probability function (3-class)
true_pi_fun <- function(X) {
  p1 <- 1/(1+exp(-3*X))
  p2 <- (1 + exp(-X^2) * cos(10 * (1 - exp(-X)) / (1 + exp(-X)))) / 2
  return(matrix(c(p1/2, p2/2, 1 - (p1+p2)/2), nrow = length(p1)))
}
# Data points
n <- 30
Xbounds <- matrix(c(-2, 2), nrow = 1)
X <- lhs(n = n, rect = Xbounds)
true_pi <- true_pi_fun(X)
m <- sample(150, n, replace = TRUE)
Y <- t(sapply(1:n, function(i) rmultinom(1, size = m[i], prob = true_pi[i, ])))

# Fit DKP model
DKP_model_1D <- fit.DKP(X, Y, Xbounds = Xbounds)

# Plot results
plot(DKP_model_1D)

# Add true probability surface
Xnew <- matrix(seq(-2,2, length = 100), ncol=1)
true_pi <- true_pi_fun(Xnew)
plot(Xnew, true_pi[, 1], type = "l", col = "black",
     xlab = "x", ylab = "Probability", ylim = c(0, 1),
     main = "True Probability", lwd = 2)
lines(Xnew, true_pi[, 2], col = "red", lwd = 2)
lines(Xnew, true_pi[, 3], col = "blue", lwd = 2)
legend("topright",
       bty = "n",
       legend = c("Class 1", "Class 2", "Class 3"),
       col = c("black", "red", "blue"), lty = 1, lwd = 2)

Example 6

Consider a two-dimensional three-class classification problem. Let $\vec{x} = [x_{1}, x_{2}]^\top \in [0,1]^2$ , and define the true class probability function as $\vec{\pi}(\vec{x}) = \left[ \frac{\pi_{3}(\vec{x})}{2}, \; \frac{\pi_{4}(\vec{x})}{2}, \; 1 - \frac{\pi_{3}(\vec{x})}{2} - \frac{\pi_{4}(\vec{x})}{2} \right]^\top,$ where $\pi_{3}(\vec{x})$ is defined in Example 3, and $\pi_{4}(\vec{x}) = \sin(\pi x_1) \cos\{\pi(x_2 - 0.5)\}.$ We generate $n = 100$ input points using Latin hypercube sampling over $[0,1]^2$ . At each location, the response is a multinomial vector with probability $\vec{\pi}(\vec{x})$ and a total count randomly sampled from $\{1, \dots, 150\}$ .

set.seed(123)
# Define 2D latent function and probability transformation (3-class)
true_pi_fun_1 <- function(X) {
  if(is.null(nrow(X))) X <- matrix(X, nrow=1)
  m <- 8.6928
  s <- 2.4269
  x1 <- 4*X[,1]- 2
  x2 <- 4*X[,2]- 2
  a <- 1 + (x1 + x2 + 1)^2 *
    (19- 14*x1 + 3*x1^2- 14*x2 + 6*x1*x2 + 3*x2^2)
  b <- 30 + (2*x1- 3*x2)^2 *
    (18- 32*x1 + 12*x1^2 + 48*x2- 36*x1*x2 + 27*x2^2)
  f <- log(a*b)
  f <- (f- m)/s
  return(pnorm(f))  # Transform to probability
}
true_pi_fun <- function(X){
  p1 <- true_pi_fun_1(X)
  p2 <- sin(pi * X[,1]) * cos(pi * (X[,2] - 0.5))
  return(matrix(c(p1/2, p2/2, 1 - (p1+p2)/2), nrow = length(p1)))
}
n <- 100
Xbounds <- matrix(c(0, 0, 1, 1), nrow = 2)
X <- lhs(n = n, rect = Xbounds)
true_pi <- true_pi_fun(X)
m <- sample(150, n, replace = TRUE)
Y <- t(sapply(1:n, function(i) rmultinom(1, size = m[i], prob = true_pi[i, ])))

# Fit DKP model
DKP_model_2D <- fit.DKP(X, Y, Xbounds=Xbounds)

# Plot results
plot(DKP_model_2D)

Example 7

Consider another three-class classification task based on the well-known Iris dataset. This classic benchmark dataset contains measurements of three iris species (setosa, versicolor, and virginica) each described by four features: sepal length, sepal width, petal length, and petal width. Due to the overlap in feature space, particularly between versicolor and virginica, the class boundaries are not linearly separable, making this dataset a standard testbed for evaluating multi-class classification algorithms. For visualization purposes, we restrict our analysis to the first two features: sepal length and sepal width.

set.seed(123)
# Data
data(iris)
X <- as.matrix(iris[, 1:2])
Xbounds <- rbind(c(4.2, 8), c(1.9, 4.5))
labels <- iris$Species
Y <- model.matrix(~ labels - 1) # expand factors to a set of dummy variables

# Fit model
DKP_model_Class <- fit.DKP(
  X, Y, Xbounds = Xbounds, loss = "log_loss",
  prior = "fixed", r0 = 0.1, p0 = rep(1/3, 3))

# Plot results
plot(DKP_model_Class)