1

I am trying to learn mixture models, here is the paper for the mixtools package. I tried to implement the univariate normal mixture model by myself. Below is the code:

data(faithful)
EM_normalMixture= function(x,mu.init,sd.init,lambda.init,iter.max,tol=1e-5)
{
  change <- Inf
  k<-1
  mu<-mu.init
  sd<-sd.init
  lambda<-lambda.init
  loglik<-c()
  loglik[1]<-0
  
  while(change>tol&k<iter.max){
    #E step
    comp1.prod <- dnorm(x, mu[1], sd[1]) * lambda[1]
    comp2.prod <- dnorm(x, mu[2], sd[2]) * lambda[2]
    
    posterior.1 <- (1+comp2.prod/comp1.prod )^-1  #formula (3) in the paper mixtools: An R Package for Analyzing Finite Mixture Models
    posterior.2 <- (1+comp1.prod/comp2.prod )^-1
    
    k<-k+1
    loglik[k]<-sum(posterior.1*log(comp1.prod)+posterior.2*log(comp2.prod))
    
    change<-abs(loglik[k]-loglik[k-1])
    
    # M step
    
   mu[1] <- sum(posterior.1 * x)/sum(posterior.1) 
   mu[2] <- sum(posterior.2 * x)/sum(posterior.2) 
   
   
   sd[1] <- sqrt(sum(posterior.1 * (x - mu[1])^2)/sum(posterior.1))
   sd[2] <- sqrt(sum(posterior.2 * (x - mu[2])^2)/sum(posterior.2))
   
      
   lambda[1] <- sum(posterior.1)/ length(x)
   lambda[2] <- sum(posterior.2)/ length(x)
    
  }
  
  return(list('mu'=mu,'sd'=sd, 'lambda'=lambda,'posterior'=cbind(posterior.1,posterior.2),'loglik'=loglik)) 
}

x<-faithful$waiting

EM_normalMixture(x,mu.init=c(80,50),sd.init=c(5,5),lambda.init=c(0.5,0.5),iter.max=50)

The results are(not include posterior):

## $mu
## [1] 80.09103 54.61479
## 
## $sd
## [1] 5.867777 5.871161
## 
## $lambda
## [1] 0.639116 0.360884
## 
## 
## $loglik
##  [1]     0.000 -1097.229 -1045.628 -1045.528 -1045.350 -1045.250 -1045.204
##  [8] -1045.184 -1045.176 -1045.173 -1045.172 -1045.172 -1045.172 -1045.172
## [15] -1045.173 -1045.173 -1045.173 -1045.173 -1045.173 -1045.173 -1045.173
## [22] -1045.173 -1045.173

Now we use normalmixEM function in the mixtools packate

library(mixtools)
x<-faithful$waiting
normalmixEM(x, lambda = c(0.6,0.4), mu = c(80,50), sigma = c(5,5), k = 2)

And here are some results from the mixtools:

## 
## $lambda
## [1] 0.639115 0.360885
## 
## $mu
## [1] 80.09104 54.61481
## 
## $sigma
## [1] 5.867756 5.871190
## 
## $loglik
## [1] -1034.002
## 
## 
## $all.loglik
##  [1] -1078.434 -1034.528 -1034.034 -1034.008 -1034.004 -1034.002 -1034.002
##  [8] -1034.002 -1034.002 -1034.002 -1034.002 -1034.002 -1034.002 -1034.002
## [15] -1034.002 -1034.002 -1034.002
## 
## $restarts
## [1] 0
## 
## $ft
## [1] "normalmixEM"
## 
## attr(,"class")
## [1] "mixEM"

We can see the two results are almost the same, except the 'loglik' or "Q" for the EM algorithm. To investigate what causes the differences I look into the source codes of the mixtools normalmixEM function, here are the source codes of the normalmixEM function, I did some modifications in order to run the function without the library mixtools.

#note we do not need mixtools library here
dyn.load("normpost") #need to compile normpost.c first

normalmix.init<-function (x, lambda = NULL, mu = NULL, s = NULL, k = 2, arbmean = TRUE, 
                          arbvar = TRUE) 
{
  if (!is.null(s)) {
    arbvar <- (length(s) > 1)
    if (arbvar) 
      k <- length(s)
  }
  if (!is.null(mu)) {
    arbmean <- (length(mu) > 1)
    if (arbmean) {
      k <- length(mu)
      if (!is.null(s) && length(s) > 1 && k != length(s)) {
        stop("mu and sigma are each of length >1 but not of the same length.")
      }
    }
  }
  if (!arbmean && !arbvar) {
    stop("arbmean and arbvar cannot both be FALSE")
  }
  n = length(x)
  x = sort(x)
  x.bin = list()
  for (j in 1:k) {
    x.bin[[j]] <- x[max(1, floor((j - 1) * n/k)):ceiling(j * 
                                                           n/k)]
  }
  if (is.null(s)) {
    s.hyp = as.vector(sapply(x.bin, sd))
    if (any(s.hyp == 0)) 
      s.hyp[which(s.hyp == 0)] = runif(sum(s.hyp == 0), 
                                       0, sd(x))
    if (arbvar) {
      s = 1/rexp(k, rate = s.hyp)
    }
    else {
      s = 1/rexp(1, rate = mean(s.hyp))
    }
  }
  if (is.null(mu)) {
    mu.hyp <- as.vector(sapply(x.bin, mean))
    if (arbmean) {
      mu = rnorm(k, mean = mu.hyp, sd = s)
    }
    else {
      mu = rnorm(1, mean = mean(mu.hyp), sd = mean(s))
    }
  }
  if (is.null(lambda)) {
    cond = TRUE
    while (cond) {
      lambda = runif(k)
      lambda = lambda/sum(lambda)
      if (min(lambda) < 0.05) 
        cond = TRUE
      else cond = FALSE
    }
  }
  else {
    lambda <- rep(lambda, length.out = k)
    lambda <- lambda/sum(lambda)
  }
  list(lambda = lambda, mu = mu, s = s, k = k, arbvar = arbvar, 
       arbmean = arbmean)
}

normalmixEM2<-function (x, lambda = NULL, mu = NULL, sigma = NULL, k = 2, mean.constr = NULL, 
                        sd.constr = NULL, epsilon = 1e-08, maxit = 1000, maxrestarts = 20, 
                        verb = FALSE, fast = FALSE, ECM = FALSE, arbmean = TRUE,  arbvar = TRUE) 
{
  warn <- options(warn = -1)
  x <- as.vector(x)
  tmp <- normalmix.init(x = x, lambda = lambda, mu = mu, s = sigma, 
                        k = k, arbmean = arbmean, arbvar = arbvar)
  lambda <- tmp$lambda
  mu <- tmp$mu
  sigma <- tmp$s
  k <- tmp$k
  arbvar <- tmp$arbvar
  arbmean <- tmp$arbmean
  if (fast == TRUE && k == 2 && arbmean == TRUE) {
    a <- normalmixEM2comp(x, lambda = lambda[1], mu = mu, 
                          sigsqrd = sigma^2, eps = epsilon, maxit = maxit, 
                          verb = verb)
  }
  else {
    z <- parse.constraints(mean.constr, k = k, allsame = !arbmean)
    meancat <- z$category
meanalpha <- z$alpha
    z <- parse.constraints(sd.constr, k = k, allsame = !arbvar)
    sdcat <- z$category
sdalpha <- z$alpha
    
    ECM <- ECM || any(meancat != 1:k) || any(sdcat != 1)
    n <- length(x)
    
    notdone <- TRUE
    restarts <- 0
    while (notdone) {
      notdone <- FALSE
      tmp <- normalmix.init(x = x, lambda = lambda, mu = mu, 
                            s = sigma, k = k, arbmean = arbmean, arbvar = arbvar)
      lambda <- tmp$lambda
  mu <- tmp$mu
      k <- tmp$k
  sigma <- tmp$s
      var <- sigma^2
      diff <- epsilon + 1
      iter <- 0
      postprobs <- matrix(nrow = n, ncol = k)
      mu <- rep(mu, k)[1:k]
      sigma <- rep(sigma, k)[1:k]
      z <- .C("normpost", as.integer(n), as.integer(k), 
              as.double(x), as.double(mu), as.double(sigma), 
              as.double(lambda), res2 = double(n * k), double(3*k), post = double(n * k), loglik = double(1))
      
      postprobs <- matrix(z$post, nrow = n)
  res <- matrix(z$res2, nrow = n)
      ll <- obsloglik <- z$loglik
  while (diff > epsilon && iter < maxit) {
    lambda <- colMeans(postprobs)
    mu[meancat == 0] <- meanalpha[meancat == 0]
    if (max(meancat) > 0) {
      for (i in 1:max(meancat)) {
        w <- which(meancat == i)
        if (length(w) == 1) {
          mu[w] <- sum(postprobs[, w] * x)/(n * lambda[w])
        }
        else {
          tmp <- t(postprobs[, w]) * (meanalpha[w]/sigma[w]^2)
          mu[w] <- meanalpha[w] * sum(t(tmp) * x)/sum(tmp * 
                                                        meanalpha[w])
        }
      }
    }
    if (ECM) {
      z <- .C("normpost", as.integer(n), as.integer(k), 
              as.double(x), as.double(mu), as.double(sigma), 
              as.double(lambda), res2 = double(n * k), 
              double(3 * k), post = double(n * k), loglik = double(1))
      postprobs <- matrix(z$post, nrow = n)
      res <- matrix(z$res2, nrow = n)
      lambda <- colMeans(postprobs)
    }
    sigma[sdcat == 0] <- sdalpha[sdcat == 0]
    if (max(sdcat) > 0) {
      for (i in 1:max(sdcat)) {
        w <- which(sdcat == i)
        if (length(w) == 1) {
          sigma[w] <- sqrt(sum(postprobs[, w] * res[, 
                                                    w])/(n * lambda[w]))
        }
        else {
          tmp <- t(postprobs[, w])/sdalpha[w]
          sigma[w] <- sdalpha[w] * sqrt(sum(t(tmp) * 
                                              res[, w])/(n * sum(lambda[w])))
        }
      }
      if (any(sigma < 1e-08)) {
        notdone <- TRUE
        cat("One of the variances is going to zero; ", 
            "trying new starting values.\n")
        restarts <- restarts + 1
        lambda <- mu <- sigma <- NULL
        if (restarts > maxrestarts) {
          stop("Too many tries!")
        }
        break
      }
    }
    z <- .C("normpost", as.integer(n), as.integer(k), 
            as.double(x), as.double(mu), as.double(sigma), 
            as.double(lambda), res2 = double(n * k), double(3 * k), post = double(n * k), loglik = double(1))
    postprobs <- matrix(z$post, nrow = n)
    res <- matrix(z$res2, nrow = n)
    newobsloglik <- z$loglik
        diff <- newobsloglik - obsloglik
        obsloglik <- newobsloglik
        ll <- c(ll, obsloglik)
        iter <- iter + 1
        if (verb) {
          cat("iteration =", iter, " log-lik diff =", 
              diff, " log-lik =", obsloglik, "\n")
          print(rbind(lambda, mu, sigma))
        }
      }
    }
    if (iter == maxit) {
      cat("WARNING! NOT CONVERGENT!", "\n")
    }
    cat("number of iterations=", iter, "\n")
    if (arbmean == FALSE) {
      scale.order = order(sigma)
      sigma.min = min(sigma)
      postprobs = postprobs[, scale.order]
      colnames(postprobs) <- c(paste("comp", ".", 
                                     1:k, sep = ""))
      a = list(x = x, lambda = lambda[scale.order], mu = mu, 
               sigma = sigma.min, scale = sigma[scale.order]/sigma.min, 
               loglik = obsloglik, posterior = postprobs, all.loglik = ll, 
               restarts = restarts, ft = "normalmixEM")
    }
    else {
      colnames(postprobs) <- c(paste("comp", ".", 
                                     1:k, sep = ""))
      a = list(x = x, lambda = lambda, mu = mu, sigma = sigma, 
               loglik = obsloglik, posterior = postprobs, all.loglik = ll, 
               restarts = restarts, ft = "normalmixEM")
    }
  }
  class(a) = "mixEM"
  options(warn)
  a
}

data(faithful)
x<-faithful$waiting

normalmixEM2(x, lambda = c(0.5,0.5), mu = c(80,50), sigma = c(5,5), k = 2) #note, function is 
                                                                           #normalmixEM2

I got exactly the same results as the normalmixEM in the mixtools.

And the source codes for normpost.c are

#include <R.h>
#include <Rmath.h>

/* Compute the matrix of "posterior" probabilities in a finite mixture of
   univariate normal densities.  The algorithm used is fairly safe from
   a numerical perspective; it avoids over- or under-flow as long as the
   values of sigma are not too large or small.  */
void normpost(
    int *nn, /* sample size */
    int *mm, /* number of components */
    double *data,  /* vector of observations */
    double *mu, /* current vector of component means */
    double *sigma, /* current vector of component stdevs */
    double *lambda, /* current vector of mixing parameters */
    double *res2, /* n by m matrix of squared residuals */
    double *work, /* 3*m-vector of workspace, which will be broken into 3 parts */
    double *post, /* n by m matrix of posterior probabilities */      
    double *loglik /* scalar loglikelihood value */ 
    ) {
  int n=*nn, m=*mm, i, j, minj=0;
  double x, r, rowsum, min=0.0;
  double *LamSigRatio = work+m; /* Second 1/3 of workspace, for frequently used constants */
  double *logLamSigRatio = work+2*m; /* Third 1/3 of workspace, for frequently used constants */

  *loglik = -(double)n * 0.91893853320467274178; /* n/2 times log(2pi) */
  for (j=0; j<m; j++){ /* store some often-used values to save time later */
    LamSigRatio[j] = lambda[j] / sigma[j];
    logLamSigRatio[j] = log(LamSigRatio[j]);
  }
  for (i=0; i<n; i++){
    x=data[i];
    for (j=0; j<m; j++) {
      r = x-mu[j];
      r = r*r;
      res2[i + n*j] = r;
      work[j] = r = r / (2.0 * sigma[j] * sigma[j]);
      /* Keep track of the smallest standardized squared residual.  
         By dividing everything by the component density with the
         smallest such residual, the denominator of the posterior
         is guaranteed to be at least one and cannot be infinite unless
         the values of lambda or sigma are very large or small.  This helps
         prevent numerical problems when calculating the posteriors.*/
      if (j==0 || r < min) {        
        minj = j;
        min = r;
      }
    }
    /* At this stage, work contains the squared st'dized resids over 2 */
    rowsum = 1.0;
    for (j=0; j<m; j++) {
      if (j==minj) 
        work[j] = 1.0;
      else {
        work[j] = (LamSigRatio[j] / LamSigRatio[minj]) * exp(min - work[j]);
        rowsum += work[j];
      }
    }
    /* At this stage, work contains the normal density at data[i]
       divided by the normal density with the largest st'dized resid 
       Thus, dividing by rowsum gives the posteriors: */
    for (j=0; j<m; j++) {
      post[i + n*j] = work[j] / rowsum;
    }
    /* Finally, adjust the loglikelihood correctly */
    *loglik += log(rowsum) - min + logLamSigRatio[minj];
  }
}

void oldnormpost(
    int *nn, /* sample size */
    int *mm, /* number of components */
    double *data,  /* vector of observations */
    double *mu, /* current vector of component means */
    double *sigma, /* current vector of component stdevs */
    double *lambda, /* current vector of mixing parameters */
    double *res2, /* n by m matrix of squared residuals */
    double *work, /* m-vector of workspace */
    double *post, /* n by m matrix of posterior probabilities */      
    double *loglik /* scalar loglikelihood value */ 
    ) {
  int n=*nn, m=*mm, i, j, minj=0;
  double x, r, rowsum, min=0.0;

  *loglik = -(double)n * 0.91893853320467274178; /* n/2 times log(2pi) */
  for (i=0; i<n; i++){
    x=data[i];
    min = 1.0e+6; /* large number */
    for (j=0; j<m; j++) {
      r = x-mu[j];
      r = r*r;
      res2[i + n*j] = r;
      work[j] = r = r / (2.0 * sigma[j] * sigma[j]);
      /* Keep track of the smallest standardized squared residual.  
         By dividing everything by the component density with the
         smallest such residual, the denominator of the posterior
         is guaranteed to be at least one and cannot be infinite unless
         the values of lambda or sigma are very large or small.  This helps
         prevent numerical problems when calculating the posteriors.*/
      if (r < min) {
        minj = j;
        min = r;
      }
    }
    /* At this stage, work contains the squared st'dized resids over 2 */
    rowsum = 1.0;
    for (j=0; j<m; j++) {
      if (j==minj) 
        work[j] = 1.0;
      else 
        rowsum+=(work[j]=lambda[j]/sigma[j]*sigma[minj]/lambda[minj]*exp(min-work[j]));
    }
    /* At this stage, work contains the normal density at data[i]
       divided by the normal density with the largest st'dized resid 
       Thus, dividing by rowsum gives the posteriors: */
    for (j=0; j<m; j++) {
      post[i + n*j] = work[j] / rowsum;
    }
    /* Finally, adjust the loglikelihood correctly */
    *loglik += log(rowsum) - min + log(lambda[minj]/sigma[minj]);
  }
}

Here are my questions:

  1. Why the 'loglik' or 'Q' in is not always increasing in my implement(forget initial 0)? such as from -1045.172 to -1045.173 is in fact decreasing, I think for EM algorithm the "Q" should always increasing.

  2. What are the maths in the normpost.c? I am not very familiar with c code, I cannot figure out the mathematics in the normpost.c.

Here is another question from this site but I think it does not solve my questions.

Thank you very much.

Deep North
  • 4,527
  • 2
  • 18
  • 38

0 Answers0