I am trying to learn mixture models. Here is the paper for the mixtools package. I tried to implement the EM algorithm for a univariate normal mixture model myself. Below is the code:
data(faithful)
EM_normalMixture= function(x,mu.init,sd.init,lambda.init,iter.max,tol=1e-5)
{
change <- Inf
k<-1
mu<-mu.init
sd<-sd.init
lambda<-lambda.init
loglik<-c()
loglik[1]<-0
while(change > tol && k < iter.max){
#E step
comp1.prod <- dnorm(x, mu[1], sd[1]) * lambda[1]
comp2.prod <- dnorm(x, mu[2], sd[2]) * lambda[2]
posterior.1 <- (1+comp2.prod/comp1.prod )^-1 #formula (3) in the paper mixtools: An R Package for Analyzing Finite Mixture Models
posterior.2 <- (1+comp1.prod/comp2.prod )^-1
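# i.e. formula (3): posterior_ij = lambda_j * dnorm(x_i, mu_j, sd_j) /
#   (lambda_1 * dnorm(x_i, mu_1, sd_1) + lambda_2 * dnorm(x_i, mu_2, sd_2)),
# rewritten as 1/(1 + other.component/this.component) so only one ratio is needed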
k<-k+1
loglik[k]<-sum(posterior.1*log(comp1.prod)+posterior.2*log(comp2.prod))
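# note: this accumulates sum_i sum_j posterior_ij * log(lambda_j * dnorm(x_i, mu_j, sd_j))
# at the current parameter values, not the observed-data log-likelihood
# sum_i log(comp1.prod + comp2.prod)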
change<-abs(loglik[k]-loglik[k-1])
# M step
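# closed-form maximizers: posterior-weighted mean, posterior-weighted variance,
# and average posterior membership for each component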
mu[1] <- sum(posterior.1 * x)/sum(posterior.1)
mu[2] <- sum(posterior.2 * x)/sum(posterior.2)
sd[1] <- sqrt(sum(posterior.1 * (x - mu[1])^2)/sum(posterior.1))
sd[2] <- sqrt(sum(posterior.2 * (x - mu[2])^2)/sum(posterior.2))
lambda[1] <- sum(posterior.1)/ length(x)
lambda[2] <- sum(posterior.2)/ length(x)
}
return(list('mu'=mu,'sd'=sd, 'lambda'=lambda,'posterior'=cbind(posterior.1,posterior.2),'loglik'=loglik))
}
x<-faithful$waiting
EM_normalMixture(x,mu.init=c(80,50),sd.init=c(5,5),lambda.init=c(0.5,0.5),iter.max=50)
The results are (posterior matrix not included):
## $mu
## [1] 80.09103 54.61479
##
## $sd
## [1] 5.867777 5.871161
##
## $lambda
## [1] 0.639116 0.360884
##
##
## $loglik
## [1] 0.000 -1097.229 -1045.628 -1045.528 -1045.350 -1045.250 -1045.204
## [8] -1045.184 -1045.176 -1045.173 -1045.172 -1045.172 -1045.172 -1045.172
## [15] -1045.173 -1045.173 -1045.173 -1045.173 -1045.173 -1045.173 -1045.173
## [22] -1045.173 -1045.173
Now we use the normalmixEM function from the mixtools package:
library(mixtools)
x<-faithful$waiting
normalmixEM(x, lambda = c(0.6,0.4), mu = c(80,50), sigma = c(5,5), k = 2)
And here are some of the results from mixtools:
##
## $lambda
## [1] 0.639115 0.360885
##
## $mu
## [1] 80.09104 54.61481
##
## $sigma
## [1] 5.867756 5.871190
##
## $loglik
## [1] -1034.002
##
##
## $all.loglik
## [1] -1078.434 -1034.528 -1034.034 -1034.008 -1034.004 -1034.002 -1034.002
## [8] -1034.002 -1034.002 -1034.002 -1034.002 -1034.002 -1034.002 -1034.002
## [15] -1034.002 -1034.002 -1034.002
##
## $restarts
## [1] 0
##
## $ft
## [1] "normalmixEM"
##
## attr(,"class")
## [1] "mixEM"
We can see that the two results are almost identical, except for the 'loglik' (the "Q" of the EM algorithm). To investigate what causes the difference, I looked into the source code of the mixtools normalmixEM function. Here it is, with a few modifications so that it runs without loading the mixtools library.
# note: we do not need the mixtools library here
dyn.load(paste0("normpost", .Platform$dynlib.ext)) # compile normpost.c first, e.g. R CMD SHLIB normpost.c
normalmix.init<-function (x, lambda = NULL, mu = NULL, s = NULL, k = 2, arbmean = TRUE,
arbvar = TRUE)
{
if (!is.null(s)) {
arbvar <- (length(s) > 1)
if (arbvar)
k <- length(s)
}
if (!is.null(mu)) {
arbmean <- (length(mu) > 1)
if (arbmean) {
k <- length(mu)
if (!is.null(s) && length(s) > 1 && k != length(s)) {
stop("mu and sigma are each of length >1 but not of the same length.")
}
}
}
if (!arbmean && !arbvar) {
stop("arbmean and arbvar cannot both be FALSE")
}
n = length(x)
x = sort(x)
x.bin = list()
for (j in 1:k) {
x.bin[[j]] <- x[max(1, floor((j - 1) * n/k)):ceiling(j *
n/k)]
}
if (is.null(s)) {
s.hyp = as.vector(sapply(x.bin, sd))
if (any(s.hyp == 0))
s.hyp[which(s.hyp == 0)] = runif(sum(s.hyp == 0),
0, sd(x))
if (arbvar) {
s = 1/rexp(k, rate = s.hyp)
}
else {
s = 1/rexp(1, rate = mean(s.hyp))
}
}
if (is.null(mu)) {
mu.hyp <- as.vector(sapply(x.bin, mean))
if (arbmean) {
mu = rnorm(k, mean = mu.hyp, sd = s)
}
else {
mu = rnorm(1, mean = mean(mu.hyp), sd = mean(s))
}
}
if (is.null(lambda)) {
cond = TRUE
while (cond) {
lambda = runif(k)
lambda = lambda/sum(lambda)
if (min(lambda) < 0.05)
cond = TRUE
else cond = FALSE
}
}
else {
lambda <- rep(lambda, length.out = k)
lambda <- lambda/sum(lambda)
}
list(lambda = lambda, mu = mu, s = s, k = k, arbvar = arbvar,
arbmean = arbmean)
}
normalmixEM2<-function (x, lambda = NULL, mu = NULL, sigma = NULL, k = 2, mean.constr = NULL,
sd.constr = NULL, epsilon = 1e-08, maxit = 1000, maxrestarts = 20,
verb = FALSE, fast = FALSE, ECM = FALSE, arbmean = TRUE, arbvar = TRUE)
{
warn <- options(warn = -1)
x <- as.vector(x)
tmp <- normalmix.init(x = x, lambda = lambda, mu = mu, s = sigma,
k = k, arbmean = arbmean, arbvar = arbvar)
lambda <- tmp$lambda
mu <- tmp$mu
sigma <- tmp$s
k <- tmp$k
arbvar <- tmp$arbvar
arbmean <- tmp$arbmean
if (fast == TRUE && k == 2 && arbmean == TRUE) {
a <- normalmixEM2comp(x, lambda = lambda[1], mu = mu,
sigsqrd = sigma^2, eps = epsilon, maxit = maxit,
verb = verb)
}
else {
z <- parse.constraints(mean.constr, k = k, allsame = !arbmean)
meancat <- z$category
meanalpha <- z$alpha
z <- parse.constraints(sd.constr, k = k, allsame = !arbvar)
sdcat <- z$category
sdalpha <- z$alpha
ECM <- ECM || any(meancat != 1:k) || any(sdcat != 1)
n <- length(x)
notdone <- TRUE
restarts <- 0
while (notdone) {
notdone <- FALSE
tmp <- normalmix.init(x = x, lambda = lambda, mu = mu,
s = sigma, k = k, arbmean = arbmean, arbvar = arbvar)
lambda <- tmp$lambda
mu <- tmp$mu
k <- tmp$k
sigma <- tmp$s
var <- sigma^2
diff <- epsilon + 1
iter <- 0
postprobs <- matrix(nrow = n, ncol = k)
mu <- rep(mu, k)[1:k]
sigma <- rep(sigma, k)[1:k]
z <- .C("normpost", as.integer(n), as.integer(k),
as.double(x), as.double(mu), as.double(sigma),
as.double(lambda), res2 = double(n * k), double(3*k), post = double(n * k), loglik = double(1))
postprobs <- matrix(z$post, nrow = n)
res <- matrix(z$res2, nrow = n)
ll <- obsloglik <- z$loglik
while (diff > epsilon && iter < maxit) {
lambda <- colMeans(postprobs)
mu[meancat == 0] <- meanalpha[meancat == 0]
if (max(meancat) > 0) {
for (i in 1:max(meancat)) {
w <- which(meancat == i)
if (length(w) == 1) {
mu[w] <- sum(postprobs[, w] * x)/(n * lambda[w])
}
else {
tmp <- t(postprobs[, w]) * (meanalpha[w]/sigma[w]^2)
mu[w] <- meanalpha[w] * sum(t(tmp) * x)/sum(tmp *
meanalpha[w])
}
}
}
if (ECM) {
z <- .C("normpost", as.integer(n), as.integer(k),
as.double(x), as.double(mu), as.double(sigma),
as.double(lambda), res2 = double(n * k),
double(3 * k), post = double(n * k), loglik = double(1))
postprobs <- matrix(z$post, nrow = n)
res <- matrix(z$res2, nrow = n)
lambda <- colMeans(postprobs)
}
sigma[sdcat == 0] <- sdalpha[sdcat == 0]
if (max(sdcat) > 0) {
for (i in 1:max(sdcat)) {
w <- which(sdcat == i)
if (length(w) == 1) {
sigma[w] <- sqrt(sum(postprobs[, w] * res[,
w])/(n * lambda[w]))
}
else {
tmp <- t(postprobs[, w])/sdalpha[w]
sigma[w] <- sdalpha[w] * sqrt(sum(t(tmp) *
res[, w])/(n * sum(lambda[w])))
}
}
if (any(sigma < 1e-08)) {
notdone <- TRUE
cat("One of the variances is going to zero; ",
"trying new starting values.\n")
restarts <- restarts + 1
lambda <- mu <- sigma <- NULL
if (restarts > maxrestarts) {
stop("Too many tries!")
}
break
}
}
z <- .C("normpost", as.integer(n), as.integer(k),
as.double(x), as.double(mu), as.double(sigma),
as.double(lambda), res2 = double(n * k), double(3 * k), post = double(n * k), loglik = double(1))
postprobs <- matrix(z$post, nrow = n)
res <- matrix(z$res2, nrow = n)
newobsloglik <- z$loglik
diff <- newobsloglik - obsloglik
obsloglik <- newobsloglik
ll <- c(ll, obsloglik)
iter <- iter + 1
if (verb) {
cat("iteration =", iter, " log-lik diff =",
diff, " log-lik =", obsloglik, "\n")
print(rbind(lambda, mu, sigma))
}
}
}
if (iter == maxit) {
cat("WARNING! NOT CONVERGENT!", "\n")
}
cat("number of iterations=", iter, "\n")
if (arbmean == FALSE) {
scale.order = order(sigma)
sigma.min = min(sigma)
postprobs = postprobs[, scale.order]
colnames(postprobs) <- c(paste("comp", ".",
1:k, sep = ""))
a = list(x = x, lambda = lambda[scale.order], mu = mu,
sigma = sigma.min, scale = sigma[scale.order]/sigma.min,
loglik = obsloglik, posterior = postprobs, all.loglik = ll,
restarts = restarts, ft = "normalmixEM")
}
else {
colnames(postprobs) <- c(paste("comp", ".",
1:k, sep = ""))
a = list(x = x, lambda = lambda, mu = mu, sigma = sigma,
loglik = obsloglik, posterior = postprobs, all.loglik = ll,
restarts = restarts, ft = "normalmixEM")
}
}
class(a) = "mixEM"
options(warn)
a
}
data(faithful)
x<-faithful$waiting
normalmixEM2(x, lambda = c(0.5,0.5), mu = c(80,50), sigma = c(5,5), k = 2) # note: the function is normalmixEM2, not normalmixEM
I got exactly the same results as normalmixEM from mixtools.
And the source code of normpost.c is:
#include <R.h>
#include <Rmath.h>
/* Compute the matrix of "posterior" probabilities in a finite mixture of
univariate normal densities. The algorithm used is fairly safe from
a numerical perspective; it avoids over- or under-flow as long as the
values of sigma are not too large or small. */
void normpost(
int *nn, /* sample size */
int *mm, /* number of components */
double *data, /* vector of observations */
double *mu, /* current vector of component means */
double *sigma, /* current vector of component stdevs */
double *lambda, /* current vector of mixing parameters */
double *res2, /* n by m matrix of squared residuals */
double *work, /* 3*m-vector of workspace, which will be broken into 3 parts */
double *post, /* n by m matrix of posterior probabilities */
double *loglik /* scalar loglikelihood value */
) {
int n=*nn, m=*mm, i, j, minj=0;
double x, r, rowsum, min=0.0;
double *LamSigRatio = work+m; /* Second 1/3 of workspace, for frequently used constants */
double *logLamSigRatio = work+2*m; /* Third 1/3 of workspace, for frequently used constants */
*loglik = -(double)n * 0.91893853320467274178; /* the constant is log(2*pi)/2, so this is -(n/2)*log(2*pi) */
for (j=0; j<m; j++){ /* store some often-used values to save time later */
LamSigRatio[j] = lambda[j] / sigma[j];
logLamSigRatio[j] = log(LamSigRatio[j]);
}
for (i=0; i<n; i++){
x=data[i];
for (j=0; j<m; j++) {
r = x-mu[j];
r = r*r;
res2[i + n*j] = r;
work[j] = r = r / (2.0 * sigma[j] * sigma[j]);
/* Keep track of the smallest standardized squared residual.
By dividing everything by the component density with the
smallest such residual, the denominator of the posterior
is guaranteed to be at least one and cannot be infinite unless
the values of lambda or sigma are very large or small. This helps
prevent numerical problems when calculating the posteriors.*/
if (j==0 || r < min) {
minj = j;
min = r;
}
}
/* At this stage, work contains the squared st'dized resids over 2 */
rowsum = 1.0;
for (j=0; j<m; j++) {
if (j==minj)
work[j] = 1.0;
else {
work[j] = (LamSigRatio[j] / LamSigRatio[minj]) * exp(min - work[j]);
rowsum += work[j];
}
}
/* At this stage, work contains each weighted normal density at data[i]
divided by the weighted density of the component with the SMALLEST
st'dized resid. Thus, dividing by rowsum gives the posteriors: */
for (j=0; j<m; j++) {
post[i + n*j] = work[j] / rowsum;
}
/* Finally, adjust the loglikelihood correctly */
*loglik += log(rowsum) - min + logLamSigRatio[minj];
}
}
void oldnormpost(
int *nn, /* sample size */
int *mm, /* number of components */
double *data, /* vector of observations */
double *mu, /* current vector of component means */
double *sigma, /* current vector of component stdevs */
double *lambda, /* current vector of mixing parameters */
double *res2, /* n by m matrix of squared residuals */
double *work, /* m-vector of workspace */
double *post, /* n by m matrix of posterior probabilities */
double *loglik /* scalar loglikelihood value */
) {
int n=*nn, m=*mm, i, j, minj=0;
double x, r, rowsum, min=0.0;
*loglik = -(double)n * 0.91893853320467274178; /* the constant is log(2*pi)/2, so this is -(n/2)*log(2*pi) */
for (i=0; i<n; i++){
x=data[i];
min = 1.0e+6; /* large number */
for (j=0; j<m; j++) {
r = x-mu[j];
r = r*r;
res2[i + n*j] = r;
work[j] = r = r / (2.0 * sigma[j] * sigma[j]);
/* Keep track of the smallest standardized squared residual.
By dividing everything by the component density with the
smallest such residual, the denominator of the posterior
is guaranteed to be at least one and cannot be infinite unless
the values of lambda or sigma are very large or small. This helps
prevent numerical problems when calculating the posteriors.*/
if (r < min) {
minj = j;
min = r;
}
}
/* At this stage, work contains the squared st'dized resids over 2 */
rowsum = 1.0;
for (j=0; j<m; j++) {
if (j==minj)
work[j] = 1.0;
else
rowsum+=(work[j]=lambda[j]/sigma[j]*sigma[minj]/lambda[minj]*exp(min-work[j]));
}
/* At this stage, work contains each weighted normal density at data[i]
divided by the weighted density of the component with the SMALLEST
st'dized resid. Thus, dividing by rowsum gives the posteriors: */
for (j=0; j<m; j++) {
post[i + n*j] = work[j] / rowsum;
}
/* Finally, adjust the loglikelihood correctly */
*loglik += log(rowsum) - min + log(lambda[minj]/sigma[minj]);
}
}
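To help myself follow the C code, I tried to re-express the core of normpost in plain R. This is only my own sketch for understanding (the name normpost_R is mine, and it skips the res2 output):
normpost_R <- function(x, mu, sigma, lambda) {
  n <- length(x); m <- length(mu)
  loglik <- -n * 0.91893853320467274178    # -n * log(2*pi)/2
  post <- matrix(0, n, m)
  for (i in 1:n) {
    r <- (x[i] - mu)^2 / (2 * sigma^2)     # squared standardized residuals over 2
    minj <- which.min(r)                   # component with the largest weighted density
    # scale every weighted density by the largest one, so exp() cannot overflow:
    # w[j] = lambda[j]*phi_j(x[i]) / (lambda[minj]*phi_minj(x[i])), with w[minj] = 1
    w <- (lambda / sigma) / (lambda[minj] / sigma[minj]) * exp(r[minj] - r)
    post[i, ] <- w / sum(w)                # posterior probabilities, formula (3)
    # reassemble the log mixture density from the scaled terms:
    # log(sum_j lambda_j*phi_j) = log(sum(w)) - r[minj] + log(lambda[minj]/sigma[minj]) - log(2*pi)/2
    loglik <- loglik + log(sum(w)) - r[minj] + log(lambda[minj] / sigma[minj])
  }
  list(post = post, loglik = loglik)
}
# e.g. normpost_R(faithful$waiting, mu = c(80, 50), sigma = c(5, 5), lambda = c(0.5, 0.5))
If I read it correctly, dividing every term by the largest weighted density keeps the denominator of the posterior at least 1 and the arguments of exp() non-positive, which is why the code avoids over- and under-flow.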
Here are my questions:
Why is the 'loglik' (or 'Q') in my implementation not always increasing (ignoring the initial 0)? For example, the step from -1045.172 to -1045.173 is in fact a decrease, yet I thought the 'Q' of the EM algorithm should always increase.
What is the mathematics behind normpost.c? I am not very familiar with C, so I cannot work out the math from the code alone.
Here is another question from this site, but I think it does not answer my questions.
Thank you very much.