---
title: "Bayesian Supervised Learning in StochTree"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Bayesian-Supervised-Learning}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
bibliography: vignettes.bib
editor_options:
  markdown:
    wrap: 72
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

This vignette demonstrates how to use the `bart()` function for Bayesian supervised learning (@chipman2010bart). To begin, we load the `stochtree` package.

```{r setup}
library(stochtree)
```

# Demo 1: Step Function

## Simulation

Here, we generate data from a simple step function.

```{r data}
# Generate the data
n <- 500
p_x <- 10
snr <- 3
X <- matrix(runif(n*p_x), ncol = p_x)
f_X <- (
    ((0 <= X[,1]) & (0.25 > X[,1])) * (-7.5) + 
    ((0.25 <= X[,1]) & (0.5 > X[,1])) * (-2.5) + 
    ((0.5 <= X[,1]) & (0.75 > X[,1])) * (2.5) + 
    ((0.75 <= X[,1]) & (1 > X[,1])) * (7.5)
)
noise_sd <- sd(f_X) / snr
y <- f_X + rnorm(n, 0, 1) * noise_sd

# Split data into test and train sets
test_set_pct <- 0.2
n_test <- round(test_set_pct * n)
n_train <- n - n_test
test_inds <- sort(sample(1:n, n_test, replace = FALSE))
train_inds <- (1:n)[!((1:n) %in% test_inds)]
X_test <- as.data.frame(X[test_inds,])
X_train <- as.data.frame(X[train_inds,])
y_test <- y[test_inds]
y_train <- y[train_inds]
```

## Sampling and Analysis

### Warmstart

We first sample from an ensemble model of $y \mid X$ using "warm-start" initialization samples (@he2023stochastic). This is the default in `stochtree`.

```{r}
num_gfr <- 10
num_burnin <- 0
num_mcmc <- 100
num_samples <- num_gfr + num_burnin + num_mcmc
general_params <- list(sample_sigma2_global = TRUE)
mean_forest_params <- list(sample_sigma2_leaf = TRUE, num_trees = 100)
bart_model_warmstart <- stochtree::bart(
    X_train = X_train, y_train = y_train, X_test = X_test, 
    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
    general_params = general_params, mean_forest_params = mean_forest_params
)
```

Inspect the MCMC samples:

```{r bart_warmstart_plot}
plot(bart_model_warmstart$sigma2_global_samples, ylab = "sigma^2")
abline(h = noise_sd^2, col = "red", lty = 2, lwd = 2.5)
plot(rowMeans(bart_model_warmstart$y_hat_test), y_test, pch = 16, 
     cex = 0.75, xlab = "pred", ylab = "actual")
abline(0, 1, col = "red", lty = 2, lwd = 2.5)
```

### BART MCMC without Warmstart

Next, we sample from this ensemble model without any warm-start initialization.

```{r}
num_gfr <- 0
num_burnin <- 100
num_mcmc <- 100
num_samples <- num_gfr + num_burnin + num_mcmc
general_params <- list(sample_sigma2_global = TRUE)
mean_forest_params <- list(sample_sigma2_leaf = TRUE, num_trees = 100)
bart_model_root <- stochtree::bart(
    X_train = X_train, y_train = y_train, X_test = X_test, 
    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
    general_params = general_params, mean_forest_params = mean_forest_params
)
```

Inspect the MCMC samples:

```{r bart_root_plot}
plot(bart_model_root$sigma2_global_samples, ylab = "sigma^2")
abline(h = noise_sd^2, col = "red", lty = 2, lwd = 2.5)
plot(rowMeans(bart_model_root$y_hat_test), y_test, pch = 16, 
     cex = 0.75, xlab = "pred", ylab = "actual")
abline(0, 1, col = "red", lty = 2, lwd = 2.5)
```
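As a quick numerical check on the two samplers, we can compare the out-of-sample root mean squared error of each model's posterior mean predictions. This uses only quantities created above (`y_hat_test` stores one column per retained draw, as in the plots).

```{r rmse_comparison}
# Out-of-sample RMSE of the posterior mean predictions for each sampler
rmse_warmstart <- sqrt(mean((rowMeans(bart_model_warmstart$y_hat_test) - y_test)^2))
rmse_root <- sqrt(mean((rowMeans(bart_model_root$y_hat_test) - y_test)^2))
c(warmstart = rmse_warmstart, root = rmse_root)
```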
# Demo 2: Partitioned Linear Model

## Simulation

Here, we generate data from a simple partitioned linear model.

```{r data_plm}
# Generate the data
n <- 500
p_x <- 10
p_w <- 1
snr <- 3
X <- matrix(runif(n*p_x), ncol = p_x)
leaf_basis <- matrix(runif(n*p_w), ncol = p_w)
f_XW <- (
    ((0 <= X[,1]) & (0.25 > X[,1])) * (-7.5*leaf_basis[,1]) + 
    ((0.25 <= X[,1]) & (0.5 > X[,1])) * (-2.5*leaf_basis[,1]) + 
    ((0.5 <= X[,1]) & (0.75 > X[,1])) * (2.5*leaf_basis[,1]) + 
    ((0.75 <= X[,1]) & (1 > X[,1])) * (7.5*leaf_basis[,1])
)
noise_sd <- sd(f_XW) / snr
y <- f_XW + rnorm(n, 0, 1) * noise_sd

# Split data into test and train sets
test_set_pct <- 0.2
n_test <- round(test_set_pct * n)
n_train <- n - n_test
test_inds <- sort(sample(1:n, n_test, replace = FALSE))
train_inds <- (1:n)[!((1:n) %in% test_inds)]
X_test <- as.data.frame(X[test_inds,])
X_train <- as.data.frame(X[train_inds,])
leaf_basis_test <- leaf_basis[test_inds,]
leaf_basis_train <- leaf_basis[train_inds,]
y_test <- y[test_inds]
y_train <- y[train_inds]
```

## Sampling and Analysis

### Warmstart

We first sample from an ensemble model of $y \mid X, W$, where $W$ is the leaf basis, using "warm-start" initialization samples (@he2023stochastic). This is the default in `stochtree`.

```{r}
num_gfr <- 10
num_burnin <- 0
num_mcmc <- 100
num_samples <- num_gfr + num_burnin + num_mcmc
general_params <- list(sample_sigma2_global = TRUE)
mean_forest_params <- list(sample_sigma2_leaf = TRUE, num_trees = 100)
bart_model_warmstart <- stochtree::bart(
    X_train = X_train, leaf_basis_train = leaf_basis_train, y_train = y_train, 
    X_test = X_test, leaf_basis_test = leaf_basis_test, 
    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
    general_params = general_params, mean_forest_params = mean_forest_params
)
```

Inspect the MCMC samples:

```{r bart_warmstart_plot_plm}
plot(bart_model_warmstart$sigma2_global_samples, ylab = "sigma^2")
abline(h = noise_sd^2, col = "red", lty = 2, lwd = 2.5)
plot(rowMeans(bart_model_warmstart$y_hat_test), y_test, pch = 16, 
     cex = 0.75, xlab = "pred", ylab = "actual")
abline(0, 1, col = "red", lty = 2, lwd = 2.5)
```

### BART MCMC without Warmstart

Next, we sample from this ensemble model without any warm-start initialization.

```{r}
num_gfr <- 0
num_burnin <- 100
num_mcmc <- 100
num_samples <- num_gfr + num_burnin + num_mcmc
general_params <- list(sample_sigma2_global = TRUE)
mean_forest_params <- list(sample_sigma2_leaf = TRUE, num_trees = 100)
bart_model_root <- stochtree::bart(
    X_train = X_train, leaf_basis_train = leaf_basis_train, y_train = y_train, 
    X_test = X_test, leaf_basis_test = leaf_basis_test, 
    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
    general_params = general_params, mean_forest_params = mean_forest_params
)
```

Inspect the MCMC samples after burn-in:

```{r bart_root_plot_plm}
plot(bart_model_root$sigma2_global_samples, ylab = "sigma^2")
abline(h = noise_sd^2, col = "red", lty = 2, lwd = 2.5)
plot(rowMeans(bart_model_root$y_hat_test), y_test, pch = 16, 
     cex = 0.75, xlab = "pred", ylab = "actual")
abline(0, 1, col = "red", lty = 2, lwd = 2.5)
```
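Beyond point predictions, the retained draws give a full posterior for the mean function at each test point. As a quick sketch (assuming, as in the plots above, that `y_hat_test` stores one row per test observation and one column per draw), we can form pointwise 95% credible intervals and check how often they cover the held-out outcomes. Note that these are intervals for the conditional mean $f(X, W)$, not for noisy outcomes, so empirical coverage of `y_test` will typically fall below the nominal level.

```{r bart_warmstart_interval_plm}
# Pointwise 95% credible intervals for the conditional mean at each test point
ci_bounds <- apply(bart_model_warmstart$y_hat_test, 1, quantile, 
                   probs = c(0.025, 0.975))
# Fraction of held-out outcomes falling inside their interval
mean((ci_bounds[1,] <= y_test) & (y_test <= ci_bounds[2,]))
```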
# Demo 3: Partitioned Linear Model with Random Effects

## Simulation

Here, we generate data from a simple partitioned linear model with an additive random effect structure.

```{r data_plm_rfx}
# Generate the data
n <- 500
p_x <- 10
p_w <- 1
snr <- 3
X <- matrix(runif(n*p_x), ncol = p_x)
leaf_basis <- matrix(runif(n*p_w), ncol = p_w)
rfx_group_ids <- rep(c(1,2), n %/% 2)
rfx_coefs <- matrix(c(-5, -3, 5, 3), nrow = 2, byrow = TRUE)
rfx_basis <- cbind(1, runif(n, -1, 1))
f_XW <- (
    ((0 <= X[,1]) & (0.25 > X[,1])) * (-7.5*leaf_basis[,1]) + 
    ((0.25 <= X[,1]) & (0.5 > X[,1])) * (-2.5*leaf_basis[,1]) + 
    ((0.5 <= X[,1]) & (0.75 > X[,1])) * (2.5*leaf_basis[,1]) + 
    ((0.75 <= X[,1]) & (1 > X[,1])) * (7.5*leaf_basis[,1])
)
rfx_term <- rowSums(rfx_coefs[rfx_group_ids,] * rfx_basis)
noise_sd <- sd(f_XW) / snr
y <- f_XW + rfx_term + rnorm(n, 0, 1) * noise_sd

# Split data into test and train sets
test_set_pct <- 0.2
n_test <- round(test_set_pct * n)
n_train <- n - n_test
test_inds <- sort(sample(1:n, n_test, replace = FALSE))
train_inds <- (1:n)[!((1:n) %in% test_inds)]
X_test <- as.data.frame(X[test_inds,])
X_train <- as.data.frame(X[train_inds,])
leaf_basis_test <- leaf_basis[test_inds,]
leaf_basis_train <- leaf_basis[train_inds,]
y_test <- y[test_inds]
y_train <- y[train_inds]
rfx_group_ids_test <- rfx_group_ids[test_inds]
rfx_group_ids_train <- rfx_group_ids[train_inds]
rfx_basis_test <- rfx_basis[test_inds,]
rfx_basis_train <- rfx_basis[train_inds,]
```

## Sampling and Analysis

### Warmstart

We first sample from an ensemble model of $y \mid X, W$ with additive group-level random effects, using "warm-start" initialization samples (@he2023stochastic). This is the default in `stochtree`.

```{r}
num_gfr <- 10
num_burnin <- 0
num_mcmc <- 100
num_samples <- num_gfr + num_burnin + num_mcmc
general_params <- list(sample_sigma2_global = TRUE)
mean_forest_params <- list(sample_sigma2_leaf = TRUE, num_trees = 100)
bart_model_warmstart <- stochtree::bart(
    X_train = X_train, leaf_basis_train = leaf_basis_train, y_train = y_train, 
    rfx_group_ids_train = rfx_group_ids_train, rfx_basis_train = rfx_basis_train, 
    X_test = X_test, leaf_basis_test = leaf_basis_test, 
    rfx_group_ids_test = rfx_group_ids_test, rfx_basis_test = rfx_basis_test, 
    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
    general_params = general_params, mean_forest_params = mean_forest_params
)
```

Inspect the MCMC samples:

```{r bart_warmstart_plot_plm_rfx}
plot(bart_model_warmstart$sigma2_global_samples, ylab = "sigma^2")
abline(h = noise_sd^2, col = "red", lty = 2, lwd = 2.5)
plot(rowMeans(bart_model_warmstart$y_hat_test), y_test, pch = 16, 
     cex = 0.75, xlab = "pred", ylab = "actual")
abline(0, 1, col = "red", lty = 2, lwd = 2.5)
```
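To see the additive random effect structure in the fit, we can color the test-set predictions by group. This is a quick visual check using only quantities created above; the two groups were simulated with opposite-signed intercepts and slopes, so they should separate cleanly along the 45-degree line.

```{r bart_warmstart_rfx_groups}
# Test-set predictions vs. actuals, colored by random effect group
preds <- rowMeans(bart_model_warmstart$y_hat_test)
plot(preds, y_test, pch = 16, cex = 0.75, 
     col = ifelse(rfx_group_ids_test == 1, "blue", "orange"), 
     xlab = "pred", ylab = "actual")
abline(0, 1, col = "red", lty = 2, lwd = 2.5)
legend("topleft", legend = c("group 1", "group 2"), 
       col = c("blue", "orange"), pch = 16)
```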
### BART MCMC without Warmstart

Next, we sample from this ensemble model without any warm-start initialization.

```{r}
num_gfr <- 0
num_burnin <- 100
num_mcmc <- 100
num_samples <- num_gfr + num_burnin + num_mcmc
general_params <- list(sample_sigma2_global = TRUE)
mean_forest_params <- list(sample_sigma2_leaf = TRUE, num_trees = 100)
bart_model_root <- stochtree::bart(
    X_train = X_train, leaf_basis_train = leaf_basis_train, y_train = y_train, 
    rfx_group_ids_train = rfx_group_ids_train, rfx_basis_train = rfx_basis_train, 
    X_test = X_test, leaf_basis_test = leaf_basis_test, 
    rfx_group_ids_test = rfx_group_ids_test, rfx_basis_test = rfx_basis_test, 
    num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, 
    general_params = general_params, mean_forest_params = mean_forest_params
)
```

Inspect the MCMC samples:

```{r bart_root_plot_plm_rfx}
plot(bart_model_root$sigma2_global_samples, ylab = "sigma^2")
abline(h = noise_sd^2, col = "red", lty = 2, lwd = 2.5)
plot(rowMeans(bart_model_root$y_hat_test), y_test, pch = 16, 
     cex = 0.75, xlab = "pred", ylab = "actual")
abline(0, 1, col = "red", lty = 2, lwd = 2.5)
```

# References