How to generate data for Gaussian distributions in these two scenarios in R?

In Hastie, Tibshirani, and Friedman's "The Elements of Statistical Learning", when least squares / linear models are compared with knn, these two scenarios come up:

Scenario 1: the training data in each class were created from two-dimensional Gaussian distributions with uncorrelated components and various means.

Scenario 2: the training data in each class were obtained from a mixture of 10 low-variance Gaussian distributions, with individual means themselves distributed as Gaussian.

The idea is that the former is best suited to least squares / linear models, while the latter suits knn-type models (those with higher variance, as I understand it, since knn takes into account only the nearest points rather than all points).

In R, how would I simulate data for both scenarios?

The ultimate goal is to be able to reproduce both scenarios in order to show that the first is indeed better explained by the linear model than the second.

Thanks!


This could be scenario 1:

library(mvtnorm)   # for rmvnorm()

N1 <- 50           # observations in class 1
N2 <- 50           # observations in class 2
K  <- 2            # number of classes

mu1 <- c(-1, 3)    # mean of class 1
mu2 <- c(2, 0)     # mean of class 2

# class 1: zero covariance (uncorrelated components), variance 2 per component
cov1 <- 0
v11  <- 2
v12  <- 2
Sigma1 <- matrix(c(v11, cov1, cov1, v12), nrow = 2)

# class 2: same structure
cov2 <- 0
v21  <- 2
v22  <- 2
Sigma2 <- matrix(c(v21, cov2, cov2, v22), nrow = 2)

x1 <- rmvnorm(N1, mu1, Sigma1)   # draws for class 1
x2 <- rmvnorm(N2, mu2, Sigma2)   # draws for class 2
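
To check linear separability on these draws, here is a minimal sketch, assuming the block above has been run; coding the classes as 0/1 and classifying at a 0.5 threshold is just one convenient choice:

# label the classes and stack the draws
dat <- data.frame(rbind(x1, x2))
colnames(dat) <- c("X1", "X2")
dat$y <- c(rep(0, N1), rep(1, N2))

plot(dat$X1, dat$X2, col = dat$y + 1)

# linear model as a classifier: regress the 0/1 label on the inputs
fit <- lm(y ~ X1 + X2, data = dat)
pred <- as.numeric(predict(fit) > 0.5)
mean(pred == dat$y)   # training accuracy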

This could be a candidate for simulation from a Gaussian mixture:

# "Bart Simpson" (claw) density: an equal-weight mixture of 10 Gaussians
# with sd 0.1 whose means are themselves drawn from a standard Gaussian
BartSimpson <- function(x, n = 100) {
  means <- as.matrix(sort(rnorm(10)))
  # mixture density evaluated at x: average of the 10 component densities
  dens <- 0.1 * rowSums(apply(means, 1, dnorm, x = x, sd = 0.1))
  # random draws: n/10 observations from each component
  rBartSimpson <- c(apply(means, 1, rnorm, n = n / 10, sd = 0.1))
  return(list("thedensity" = dens, "draws" = rBartSimpson))
}

x <- seq(-5,5,by=.01)

plot(x,BartSimpson(x)$thedensity,type="l",lwd=4,col="yellow2",xlim=c(-4,4),ylim=c(0,0.6))
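
Note this example is one-dimensional. For a two-dimensional version closer to the book, the mixture simulation described in ESL (section 2.3.3) draws 10 component means per class from a bivariate Gaussian and then samples each observation around a randomly chosen mean with covariance I/5. A sketch of that recipe:

library(mvtnorm)

# 10 component means per class, themselves bivariate Gaussian
mBlue   <- rmvnorm(10, c(1, 0), diag(2))
mOrange <- rmvnorm(10, c(0, 1), diag(2))

# for each observation, pick one of the 10 components at random,
# then add low-variance Gaussian noise around the chosen mean
drawClass <- function(m, n = 100) {
  k <- sample(nrow(m), n, replace = TRUE)
  m[k, ] + rmvnorm(n, sigma = diag(2) / 5)
}

blue   <- drawClass(mBlue)
orange <- drawClass(mOrange)
plot(rbind(blue, orange), col = rep(c("blue", "orange"), each = 100),
     xlab = "x1", ylab = "x2")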

In the code below, I first create 10 different class means, and then use those means to draw random values for each class. The code is identical for the two scenarios, but you will need to adjust the variance between and within classes to get the desired results.

Scenario 1:

Here 10 classes are generated (their means drawn from a bivariate Gaussian). The variance between classes is small and the variance within classes is large, so the classes overlap heavily.

library(MASS)        # for mvrnorm()

n <- 20              # subjects per class
classes <- 10        # number of classes
mu <- 100            # mean value for all classes
var.between <- 25    # variation between classes
var.within <- 225    # variation within classes

# covariance matrix for drawing the class means
covmatrix1 <- matrix(c(var.between, 0, 0, var.between), nrow = 2)
# creates the means of the two variables for each class,
# using the variance between classes
means <- mvrnorm(classes, c(mu, mu), Sigma = covmatrix1)

# covariance matrix for the subjects within a class
covmatrix2 <- matrix(c(var.within, 0, 0, var.within), nrow = 2)

class <- NULL
values <- NULL
# this loop generates the data for each class, based on the class
# means and the variance within classes
for (i in 1:classes) {
  temp <- mvrnorm(n, means[i, ], Sigma = covmatrix2)
  class <- c(class, rep(i, n))
  values <- rbind(values, temp)   # rbind keeps the two variables aligned
}
data <- data.frame(class, values)
colnames(data) <- c("class", "X1", "X2")
plot(data$X1, data$X2)

If, on the other hand, you want all the data to come from a single Gaussian distribution, you can simply do:

covmatrix <- matrix(c(225, 0, 0, 225), nrow = 2)
# variance 225 for both variables, no covariance
values <- mvrnorm(200, c(100, 100), Sigma = covmatrix)
# a matrix of 200 individuals with two values each
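
To turn that into an ESL-style scenario 1 with two classes, you could draw each class around its own mean and label the rows; the means c(90, 90) and c(110, 110) below are arbitrary illustrative choices:

classA <- mvrnorm(100, c(90, 90),   Sigma = covmatrix)
classB <- mvrnorm(100, c(110, 110), Sigma = covmatrix)
twoclass <- data.frame(rbind(classA, classB),
                       class = rep(c("A", "B"), each = 100))
colnames(twoclass) <- c("X1", "X2", "class")
plot(twoclass$X1, twoclass$X2, col = factor(twoclass$class))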

Scenario 2:

Here the classes should be clearly separated, so the variance between classes must be large and the variance within classes small. Setting var.between to 500 and var.within to 25 gives:

n <- 20              # subjects per class
classes <- 10        # number of classes
mu <- 100            # mean value for all classes
var.between <- 500   # variation between classes
var.within <- 25     # variation within classes

# covariance matrix for drawing the class means
covmatrix1 <- matrix(c(var.between, 0, 0, var.between), nrow = 2)
means <- mvrnorm(classes, c(mu, mu), Sigma = covmatrix1)

# covariance matrix for the subjects within a class
covmatrix2 <- matrix(c(var.within, 0, 0, var.within), nrow = 2)

class <- NULL
values <- NULL
for (i in 1:classes) {
  temp <- mvrnorm(n, means[i, ], Sigma = covmatrix2)
  class <- c(class, rep(i, n))
  values <- rbind(values, temp)
}
data <- data.frame(class, values)
colnames(data) <- c("class", "X1", "X2")
plot(data$X1, data$X2)

In the resulting plot you can now clearly see the ten separate clusters.
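
If you then want to compare a linear classifier with knn on these data, a rough sketch could look like this (lda() from MASS stands in for the linear model, knn() comes from the class package; the 70/30 split and k = 15 are arbitrary choices):

library(class)   # for knn()

# split into training and test sets
idx   <- sample(nrow(data), 0.7 * nrow(data))
train <- data[idx, ]
test  <- data[-idx, ]

# linear decision boundaries via LDA
ldafit  <- lda(factor(class) ~ X1 + X2, data = train)
lda.acc <- mean(predict(ldafit, test)$class == test$class)

# 15-nearest-neighbour classification
knn.pred <- knn(train[, c("X1", "X2")], test[, c("X1", "X2")],
                cl = factor(train$class), k = 15)
knn.acc  <- mean(knn.pred == test$class)

c(lda = lda.acc, knn = knn.acc)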

I hope this helps!


Here is another take on it:

# draws n values from a mixture of n_means Gaussians whose means are
# themselves Gaussian distributed; returns the values and their class labels
mixed_dists <- function(n, n_means, var = 0.2) {
    means <- rnorm(n_means, mean = 1, sd = 2)
    values <- NULL
    class <- NULL
    for (i in 1:n_means) {
        # sd = sqrt(var), so the var argument is actually used
        temp <- rnorm(n / n_means, mean = means[i], sd = sqrt(var))
        class <- c(class, rep(i, n / n_means))
        values <- c(values, temp)
    }
    return(list(values, class))
}

N = 100

#Scenario 1: The training data in each class were generated from bivariate Gaussian distributions 
#with uncorrelated components and different means.
scenario1 = function () { 
    var = 0.5
    n_groups = 2
    m = mixed_dists(N, n_groups, var=var)
    x = m[[1]]
    group = m[[2]]
    y = mixed_dists(N, n_groups, var=var)[[1]]
    data = matrix(c(x,y, group), nrow=N, ncol=3)
    colnames(data) = c("x", "y", "group")
    data = data.frame(data)
    plot(x=data$x,y=data$y, col=data$group)
    model = lm(y~x, data=data)
    summary(model) 
}



#Scenario 2: The training data in each class came from a mixture of 10 
#low-variance Gaussian distributions, with individual means themselves
#distributed as Gaussian.
scenario2 = function () {
    var = 0.2 # low variance
    n_groups = 10
    m = mixed_dists(N, n_groups, var=var)
    x = m[[1]]
    group = m[[2]]
    y = mixed_dists(N, n_groups, var=var)[[1]]
    data = matrix(c(x,y, group), nrow=N, ncol=3)
    colnames(data) = c("x", "y", "group")
    data = data.frame(data)
    plot(x=data$x,y=data$y, col=data$group)
    model = lm(y~x, data=data)
    summary(model)
}
# scenario1()
# scenario2()

So the data in scenario 1 fall into 2 clearly separated classes, while the data in scenario 2 form about 10 clusters that cannot be cleanly separated by a straight line. Indeed, when running the linear model in both scenarios, you can see that on average it fits scenario 1 better than scenario 2.
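
One quick way to see that, since both functions return the model summary, is to average the R^2 over repeated simulations (a rough check; any single draw is noisy, and each call also redraws the plot):

set.seed(1)
mean(replicate(50, scenario1()$r.squared))   # typically clearly higher than
mean(replicate(50, scenario2()$r.squared))   # the scenario 2 average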


Source: https://habr.com/ru/post/1605934/

