# Approximate I = integral of f(x)*pi(x) dx via k-means clustering (eq. 3):
# cluster the sample, evaluate fcn at the cluster centers, and weight each
# value by the proportion of sample points falling in that cluster.
#
# fcn -- function to be integrated; takes one sample point (length-n
#        numeric vector) and returns a scalar.
# X -- array (N x n) of sample points from a corresponding
# distribution.
# K -- number of clusters.
#
# Returns a single numeric value: the k-means estimate of the integral.
integrate_kmeans <- function(fcn, X, K) {
  km <- kmeans(X, K)           # we use R function kmeans
  Xc <- km$centers             # centers of obtained clusters
  w <- km$size / sum(km$size)  # proportions of sample points in clusters
  # Evaluate fcn at each cluster center. vapply (base R) replaces the
  # purrr::map_dbl dependency and guarantees a numeric(1) result per center.
  # f_vals is used instead of F, which shadows the FALSE shorthand.
  f_vals <- vapply(seq_len(K), function(j) fcn(Xc[j, ]), numeric(1))
  sum(w * f_vals)
}
k-means clustering approach to an integral evaluation
Problem formulation
Let us consider an integral
\[ I = \int\limits_{\Omega}{f(x)\pi(x)dx}, \tag{1}\]
where \(f(x): \Omega \rightarrow \mathbb{R}^n\), \(\pi(x): \Omega \rightarrow \mathbb{R}^n\) is a probability density function, \(x \in \Omega \subseteq \mathbb{R}^n\).
Using the Monte Carlo (MC) approach, the integral can be evaluated as
\[ I \approx \frac{1}{N}\sum\limits_{i = 1}^{N}{f(x_i)}, \tag{2}\]
where \(\{x_i\}_{i = 1}^{N}\) are sample points from the distribution \(\pi(x)\). The larger the sample size \(N\), the more accurate the approximation. However, if \(N\) is sufficiently large, then the integral evaluation procedure becomes time-consuming.
K-means clustering approach to integration
An alternative approach can be applied, which is based on k-means clustering. The algorithm is the following:
- Sample \(N\) points from \(\pi(x)\) distribution.
- Apply k-means clustering to the sample obtained. An output of the clustering procedure is:
- \(\{X_j^{(0)}\}_{j = 1}^K\) – clusters’ centers.
- \(\{w_j\}_{j = 1}^K\) – proportions of sample points in the \(j\)-th cluster.
- Calculate an approximated value of the integral as
\[ I \approx \sum\limits_{j = 1}^{K}{w_jf(X_j^{(0)})}. \tag{3}\]
R code implementing k-means-based algorithm
R code implementing Monte Carlo-based algorithm
# Plain Monte Carlo estimate of I = integral of f(x)*pi(x) dx (eq. 2):
# the average of fcn over all sample points.
#
# fcn -- function to be integrated; takes one sample point (length-n
#        numeric vector) and returns a scalar.
# X -- array (N x n) of sample points from a corresponding
# distribution.
#
# Returns a single numeric value: the MC estimate of the integral.
integrate_mc <- function(fcn, X) {
  # Iterate over all N rows of X. The original code used seq_len(K),
  # silently picking up a global K and evaluating only K of the N points;
  # the MC estimator (eq. 2) must average over the whole sample.
  f_vals <- vapply(seq_len(nrow(X)), function(i) fcn(X[i, ]), numeric(1))
  mean(f_vals)
}
An Example
Let us evaluate a two-dimensional integral
\[ I = \iint\limits_{x_1^2 + x_2^2 \leq 4}e^{-\frac{(x_1^2+x_2^2)}{4}}dx_1dx_2, \tag{4}\]
which can be rewritten as
\[ \begin{aligned} I &= \int\limits_{-\infty}^\infty\int\limits_{-\infty}^\infty 4\pi\chi(x_1, x_2)\frac{1}{4\pi}e^{-\frac{(x_1^2+x_2^2)}{4}}dx_1dx_2 = \\ &= \int\limits_{-\infty}^\infty\int\limits_{-\infty}^\infty f(x_1, x_2)\pi(x_1, x_2)dx_1dx_2, \end{aligned} \tag{5}\]
where
\[ f(x_1, x_2) = 4\pi\chi(x_1, x_2)=\left\{\begin{array}{rl} 4\pi, & x_1^2+x_2^2 \leq 4\\ 0, & \text{otherwise}\end{array}\right., \]
and
\[ \pi(x) = \frac{1}{4\pi}e^{-\frac{(x_1^2+x_2^2)}{4}} \]
is a probability density function of a normal distribution \(N\left(\mu = \left(\begin{array}{cc}0 \\ 0\end{array}\right), \Sigma = \left(\begin{array}{cc}2 & 0 \\ 0 & 2\end{array}\right)\right)\).
# Integrand f(x1, x2) = 4*pi * indicator(x1^2 + x2^2 <= 4) from eq. (5).
#
# x -- length-2 numeric vector (x1, x2).
#
# Returns 4*pi inside the disc of radius 2 (boundary included), 0 outside.
fcn <- function(x) {
  # The condition is scalar, so plain if/else is preferred over ifelse(),
  # which is meant for vectors and is slower on scalars.
  if (x[1]^2 + x[2]^2 <= 4) 4 * pi else 0
}
n <- 2      # number of dimensions
N <- 10000  # sample size from normal distribution
K <- 8      # number of clusters for k-means clustering

# sampling from N(mu, Sigma): the density pi(x) of eq. (5) is the
# bivariate normal with mean 0 and covariance diag(2, 2).
mu <- c(0, 0)
Sigma <- diag(c(2, 2))

set.seed(3141592)  # fixed seed so the reported table is reproducible
X <- MASS::mvrnorm(N, mu, Sigma)

# integration by using MC
mc_value <- integrate_mc(fcn, X)

# integration by using k-means
kmeans_value <- integrate_kmeans(fcn, X, K)

# true value: integrating eq. (4) in polar coordinates gives
# I = 4*pi*(1 - exp(-1)).
true_value <- 4 * pi * (1 - exp(-1))

tibble::tibble(
  true_value, mc_value, kmeans_value
)
# A tibble: 1 × 3
true_value mc_value kmeans_value
<dbl> <dbl> <dbl>
1 7.94 7.85 7.73