pyMM.py

"""Probabilistic principal components analysis (PPCA)

A generative latent linear variable model.

PPCA assumes that the observed data is generated by linearly transforming a
number of latent variables and then adding spherical Gaussian noise. The
latent variables are drawn from a standard Gaussian distribution.

This implementation is based on David Barber's Matlab implementation:
http://web4.cs.ucl.ac.uk/staff/D.Barber/pmwiki/pmwiki.php?n=Main.Software

This implementation uses the EM algorithm to handle missing data.
"""

# Author: Charlie Nash <charlie.tc.nash@gmail.com>

import numpy as np
import scipy as sp
import numpy.random as rd
from random import seed
from scipy.stats import multivariate_normal
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import Imputer


class GMM():
    """Probabilistic principal components analysis (PPCA).

    A generative latent linear variable model.

    PPCA assumes that the observed data is generated by linearly transforming a
    number of latent variables and then adding spherical Gaussian noise. The
    latent variables are drawn from a standard Gaussian distribution.

    The parameters of the model are the transformation matrix (principal
    components) the mean, and the noise variance.

    PPCA performs maximum likelihood or MAP estimation of the model parameters
    using the expectation-maximisation algorithm (EM).

    Attributes
    ----------

    latentDim : int
        Dimensionality of latent space. The number of variables that are
        transformed by the principal components to the data space.

    components : array, [latentDim, nFeatures]
        Transformation matrix parameter.

    bias: array, [nFeatures]
        Bias parameter.

    noiseVariance : float
        Noise variance parameter. Variance of noise that is added to linearly
        transformed latent variables to generate data.

    standardize : bool, optional
        When True, the mean is subtracted from the data, and each feature is
        divided by it's standard deviation so that the mean and variance of
        the transformed features are 0 and 1 respectively.

    componentPrior : float >= 0
        Gaussian component matrix hyperparameter. If > 0 then a Gaussian prior
        is applied to each column of the component matrix with covariance
        componentPrior^-1 * noiseVariance. This has the effect
        of regularising the component matrix.

    tol : float
        Stopping tolerance for EM algorithm

    maxIter : int
        Maximum number of iterations for EM algorithm

    Notes
    -----

    TODO

    Examples
    --------

    TODO
    """
    def __init__(self, n_components, covariance_type='full', tol=1e-3,
                 max_iter=1000, random_state=0, verbose=True, robust=False):
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.max_iter = max_iter
        self.random_state = random_state
        self.verbose = verbose
        self.robust = robust
        self.isFitted = False

    def _e_step(self, X, params):
        if self.missing_data:
            return self._e_step_miss(X, params)
        else:
            return self._e_step_no_miss(X, params)

    def _e_step_no_miss(self, X, params):
        """ E-Step of the EM-algorithm.

        The E-step takes the existing parameters, for the components, bias
        and noise variance and computes sufficient statistics for the M-Step
        by taking the expectation of latent variables conditional on the
        visible variables. Also returns the likelihood for the data X and
        projections into latent space of the data.

        Args
        ----
        X : array, [nExamples, nFeatures]
            Matrix of training data, where nExamples is the number of
            examples and nFeatures is the number of features.
        W : array, [dataDim, latentDim]
            Component matrix data. Maps latent points to data space.
        b : array, [dataDim,]
            Data bias.
        sigmaSq : float
            Noise variance parameter.

        Returns
        -------
        ss : dict

        proj :

        ll :
        """
        # Get params
        mu_list = params['mu_list']
        components = params['components']
        n_examples, data_dim = X.shape

        # Compute responsibilities
        log_r = np.zeros([n_examples, self.n_components])

        # Get Sigma from params
        Sigma_list = self._params_to_Sigma(params)

        for k, mu, Sigma in zip(range(self.n_components), mu_list,
                                Sigma_list):
            try:
                log_r[:, k] = multivariate_normal.logpdf(X, mu, Sigma)
            except np.linalg.linalg.LinAlgError:
                if self.robust:
                    Sigma_robust = Sigma + 1e-8*np.eye(self.data_dim)
                    log_r[:, k] = multivariate_normal.logpdf(X, mu,
                                                             Sigma_robust)
                else:
                    error_msg = ('Covariance matrix ill-conditioned. Use ' +
                                 'robust=True to pre-condition covariance ' +
                                 'matrices or choose fewer mixture ' +
                                 'components')
                    raise np.linalg.linalg.LinAlgError(error_msg)
        log_r = log_r + np.log(components)
        log_r_sum = sp.misc.logsumexp(log_r, axis=1)
        responsibilities = np.exp(log_r - log_r_sum[:, np.newaxis])

        x_list = [X for k in range(self.n_components)]
        xx_list = [X[:, :, np.newaxis] * X[:, np.newaxis, :] for k in
                   range(self.n_components)]

        # Store sufficient statistics in dictionary
        ss = {'responsibilities': responsibilities,
              'x_list': x_list,
              'xx_list': xx_list}

        # Compute log-likelihood of each example
        sample_ll = log_r_sum

        return ss, sample_ll

    def _e_step_miss(self, X, params):
        """ E-Step of the EM-algorithm.

        The E-step takes the existing parameters, for the components, bias
        and noise variance and computes sufficient statistics for the M-Step
        by taking the expectation of latent variables conditional on the
        visible variables. Also returns the likelihood for the data X and
        projections into latent space of the data.

        Args
        ----
        X : array, [nExamples, nFeatures]
            Matrix of training data, where nExamples is the number of
            examples and nFeatures is the number of features.
        W : array, [dataDim, latentDim]
            Component matrix data. Maps latent points to data space.
        b : array, [dataDim,]
            Data bias.
        sigmaSq : float
            Noise variance parameter.

        Returns
        -------
        ss : dict

        proj :

        ll :
        """
        # Get current params
        mu_list = params['mu_list']
        components = params['components']

        # Get Sigma from params
        Sigma_list = self._params_to_Sigma(params)

        observed_list = [np.array(np.where(~np.isnan(row))).flatten() for
                         row in X]
        n_examples, data_dim = np.shape(X)

        # Loop over data points computing responsibilities
        r = np.zeros([n_examples, self.n_components])
        for n in range(n_examples):
            id_obs = observed_list[n]
            row = X[n, :]
            row_obs = row[id_obs]
            for k, mu, Sigma in zip(range(self.n_components), mu_list,
                                    Sigma_list):
                mu_obs = mu[id_obs]
                Sigma_obs = Sigma[np.ix_(id_obs, id_obs)]
                r[n, k] = multivariate_normal.pdf(row_obs[np.newaxis, :],
                                                  mu_obs, Sigma_obs)
        r = r * components
        r_sum = r.sum(axis=1)
        responsibilities = r / r_sum[:, np.newaxis]

        x_list = []
        xx_list = []
        for k, mu, Sigma in zip(range(self.n_components), mu_list, Sigma_list):

            x_tot = np.zeros([n_examples, data_dim])
            xx_tot = np.zeros([n_examples, data_dim, data_dim])

            for n in range(n_examples):
                id_obs = observed_list[n]
                id_miss = np.setdiff1d(np.arange(data_dim), id_obs)
                n_miss = len(id_miss)
                row = X[n, :]
                row_obs = row[id_obs]

                # Simplify for case with no missing data
                if n_miss == 0:
                    x_tot[n] = row_obs
                    xx_tot[n] = np.outer(row_obs, row_obs)

                # Get missing / present parameters
                mu_obs = mu[id_obs]
                mu_miss = mu[id_miss]
                Sigma_obs = Sigma[np.ix_(id_obs, id_obs)]
                Sigma_miss = Sigma[np.ix_(id_miss, id_miss)]
                Sigma_obs_miss = Sigma[np.ix_(id_obs, id_miss)]
                Sigma_miss_obs = Sigma[np.ix_(id_miss, id_obs)]

                # Get conditional distribution p(x_miss | x_vis, params_k)
                mean_cond = (mu_miss + Sigma_miss_obs.dot(
                             np.linalg.inv(Sigma_obs)).dot(row_obs - mu_obs))
                Sigma_cond = (Sigma_miss -
                              Sigma_miss_obs.dot(np.linalg.inv(Sigma_obs))
                              .dot(Sigma_obs_miss))

                # Get sufficient statistics E[x] and E[xx^t]
                x = np.empty(data_dim)
                x[id_obs] = row_obs
                x[id_miss] = mean_cond
                x_tot[n] = x

                xx = np.empty([data_dim, data_dim])
                xx[np.ix_(id_obs, id_obs)] = np.outer(row_obs, row_obs)
                xx[np.ix_(id_obs, id_miss)] = np.outer(row_obs, mean_cond)
                xx[np.ix_(id_miss, id_obs)] = np.outer(mean_cond, row_obs)
                xx[np.ix_(id_miss, id_miss)] = (np.outer(mean_cond, mean_cond)
                                                + Sigma_cond)
                xx_tot[n] = xx
            x_list.append(x_tot)
            xx_list.append(xx_tot)

        # Store sufficient statistics in dictionary
        ss = {'responsibilities': responsibilities,
              'x_list': x_list,
              'xx_list': xx_list}

        # Compute log-likelihood of each example
        sample_ll = np.log(r_sum)

        return ss, sample_ll

    def _m_step(self, ss, params):
        """ M-Step of the EM-algorithm.

        The M-step takes the sufficient statistics computed in the E-step, and
        maximizes the expected complete data log-likelihood with respect to the
        parameters.

        Args
        ----
        ss : dict

        Returns
        -------
        params : dict

        """
        resp = ss['responsibilities']
        x_list = ss['x_list']
        xx_list = ss['xx_list']

        # Update components param
        components = np.mean(resp, axis=0)

        # Update mean / Sigma params
        mu_list = []
        Sigma_list = []
        for r, x, xx in zip(resp.T, x_list, xx_list):
            mu = np.sum(x*r[:, np.newaxis], axis=0) / r.sum()
            mu_list.append(mu)
            Sigma = (np.sum((xx*r[:, np.newaxis, np.newaxis]), axis=0) /
                     r.sum() - np.outer(mu, mu))
            Sigma_list.append(Sigma)

        # Store params in dictionary
        params = {'Sigma_list': Sigma_list,
                  'mu_list': mu_list,
                  'components': components}

        return params

    def _params_to_Sigma(self, params):
        return params['Sigma_list']

    def _init_params(self, X, init_method='kmeans'):
        seed(self.random_state)
        n_examples = X.shape[0]
        if init_method == 'kmeans':
            kmeans = KMeans(self.n_components)
            if self.missing_data:
                imputer = Imputer()
                X = imputer.fit_transform(X)
            kmeans.fit(X)
            mu_list = [k for k in kmeans.cluster_centers_]
            Sigma_list = []
            for k in range(self.n_components):
                X_k = X[kmeans.labels_ == k, :]
                if X_k.shape[0] < 5*self.data_dim:
                    Sigma_list.append(np.eye(self.data_dim))
                else:
                    Sigma_list.append(np.cov(X_k.T))
            components = np.array([np.sum(kmeans.labels_ == k) / n_examples
                                  for k in range(self.n_components)])
            params_init = {'mu_list': mu_list,
                           'Sigma_list': Sigma_list,
                           'components': components}
            return params_init

    def fit(self, X, params_init=None, init_method='kmeans'):
        """ Fit the model using EM with data X.

        Args
        ----
        X : array, [nExamples, nFeatures]
            Matrix of training data, where nExamples is the number of
            examples and nFeatures is the number of features.
        """
        n_examples, data_dim = np.shape(X)
        self.data_dim = data_dim
        if np.isnan(X).any():
            self.missing_data = True
        else:
            self.missing_data = False

        if params_init is None:
            params = self._init_params(X, init_method)
        else:
            params = params_init

        # Check for missing values and remove if whole row is missing
        X = X[~np.isnan(X).all(axis=1), :]

        oldL = -np.inf
        for i in range(self.max_iter):

            # E-Step
            ss, sample_ll = self._e_step(X, params)

            # Evaluate likelihood
            ll = sample_ll.mean()
            if self.verbose:
                print("Iter {:d}   NLL: {:.3f}   Change: {:.3f}".format(i,
                      -ll, -(ll-oldL)), flush=True)

            # Break if change in likelihood is small
            if np.abs(ll - oldL) < self.tol:
                break
            oldL = ll

            # M-step
            params = self._m_step(ss, params)

        else:
            if self.verbose:
                print("PPCA did not converge within the specified" +
                      " tolerance. You might want to increase the number of" +
                      " iterations.")

        # Update Object attributes
        self.params = params
        self.trainNll = ll
        self.isFitted = True

    def sample(self, n_samples=1):
        """Sample from fitted model.

        Sample from fitted model by first sampling from latent space
        (spherical Gaussian) then transforming into data space using learned
        parameters. Noise can then be added optionally.

        Parameters
        ----------
        nSamples : int
            Number of samples to generate
        noisy : bool
            Option to add noise to samples (default = True)

        Returns
        -------
        dataSamples : array [nSamples, dataDim]
            Collection of samples in data space.
        """
        if not self.isFitted:
            print("Model is not yet fitted. First use fit to learn the " +
                  "model params.")
        else:
            components = self.params['components']
            mu_list = self.params['mu_list']
            Sigma_list = self._params_to_Sigma(self.params)
            components_cumsum = np.cumsum(components)
            samples = np.zeros([n_samples, self.data_dim])
            for n in range(n_samples):
                r = np.random.rand(1)
                z = np.argmin(r > components_cumsum)
                samples[n] = rd.multivariate_normal(mu_list[z], Sigma_list[z])
            return samples

    def score_samples(self, X):
        if not self.isFitted:
            print("Model is not yet fitted. First use fit to learn the " +
                  "model params.")
        else:
            # Apply one step of E-step to get the sample log-likelihoods
            return self._e_step(X, self.params)[1]

    def score(self, X):
        """Compute the average log-likelihood of data matrix X

        Parameters
        ----------
        X: array, shape (n_samples, n_features)
            The data

        Returns
        -------
        meanLl: array, shape (n_samples,)
            Log-likelihood of each sample under the current model
        """
        if not self.isFitted:
            print("Model is not yet fitted. First use fit to learn the " +
                  "model params.")
        else:
            # Apply one step of E-step to get the sample log-likelihoods
            sample_ll = self.score_samples(X)

            # Divide by number of examples to get average log likelihood
            return sample_ll.mean()


class SphericalGMM(GMM):

    @staticmethod
    def _convert_gmm_params(params):
        sigma_sq_list = [np.mean(np.diag(cov)) for cov in
                         params['Sigma_list']]
        params_conv = {i: params[i] for i in params if i != 'Sigma_list'}
        params_conv['sigma_sq_list'] = sigma_sq_list
        return params_conv

    def _init_params(self, X, init_method='kmeans'):
        params_init_gmm = super(SphericalGMM, self)._init_params(X,
                                                                 init_method)
        return self._convert_gmm_params(params_init_gmm)

    def _m_step(self, ss, params):
        params_gmm = super(SphericalGMM, self)._m_step(ss, params)
        return self._convert_gmm_params(params_gmm)

    def _params_to_Sigma(self, params):
            return [sigma_sq*np.eye(self.data_dim) for sigma_sq in
                    params['sigma_sq_list']]


class DiagonalGMM(SphericalGMM):

    @staticmethod
    def _convert_gmm_params(params):
        Psi_list = [np.diag(np.diag(cov)) for cov in params['Sigma_list']]
        params_conv = {i: params[i] for i in params if i != 'Sigma_list'}
        params_conv['Psi_list'] = Psi_list
        return params_conv

    def _params_to_Sigma(self, params):
            return params['Psi_list']


class MPPCA(GMM):
    """Mixtures of probabilistic principal components analysis (PPCA) models.

    A generative latent variable model.

    PPCA assumes that the observed data is generated by first generating latent
    variables z from a Gaussian distribution p(z), then linearly transforming
    these variables with a weights matrix W, and then finally adding spherical
    Gaussian noise. PPCA can be viewed as a Gaussian model with a low-rank
    approximation to the covariance matrix. It can be useful in the case
    where there are many dimensions, but not many examples. Here a
    full-covariance model needs to estimate many parameters, and will have a
    tendency to overfit, whereas a PPCA model can have considerably fewer
    parameters, and therefore is less likely to overfit.

    The parameters of the model are the transformation matrix W , the mean mu,
    and the noise variance sigma_sq.

    The mixture of PPCA models (MPPCA) additionally assumes that the data can
    come from a number of PPCA components, with each component being selected
    from a disrete probability distribution. Thus the parameters are W_k, mu_k
    and sigma_sq_k for each component k, and component probabilities alpha_k
    for each component.

    MPPCA performs maximum likelihood or MAP estimation of the model
    parameters using the expectation-maximisation algorithm (EM algorithm).

    Attributes
    ----------

    latent_dim : int
        Dimensionality of latent space. The number of variables that are
        transformed by the weight matrix to the data space.

    n_components : array, [latentDim, nFeatures]
        Transformation matrix parameter.

    bias: array, [nFeatures]
        Bias parameter.

    noiseVariance : float
        Noise variance parameter. Variance of noise that is added to linearly
        transformed latent variables to generate data.

    standardize : bool, optional
        When True, the mean is subtracted from the data, and each feature is
        divided by it's standard deviation so that the mean and variance of
        the transformed features are 0 and 1 respectively.

    componentPrior : float >= 0
        Gaussian component matrix hyperparameter. If > 0 then a Gaussian prior
        is applied to each column of the component matrix with covariance
        componentPrior^-1 * noiseVariance. This has the effect
        of regularising the component matrix.

    tol : float
        Stopping tolerance for EM algorithm

    maxIter : int
        Maximum number of iterations for EM algorithm
    """

    def __init__(self, n_components, latent_dim, tol=1e-3, max_iter=1000,
                 random_state=0, verbose=True, robust=False):

        super(MPPCA, self).__init__(n_components=n_components, tol=tol,
                                    max_iter=max_iter,
                                    random_state=random_state,
                                    verbose=verbose, robust=robust)
        self.latent_dim = latent_dim

    def _init_params(self, X, init_method='kmeans'):
        seed(self.random_state)
        n_examples = X.shape[0]
        if init_method == 'kmeans':
            kmeans = KMeans(self.n_components)
            if self.missing_data:
                imputer = Imputer()
                X = imputer.fit_transform(X)
            kmeans.fit(X)
            mu_list = [k for k in kmeans.cluster_centers_]
            W_list = []
            sigma_sq_list = []
            for k in range(self.n_components):
                data_k = X[kmeans.labels_ == k, :]
                pca = PCA(n_components=self.latent_dim)
                pca.fit(data_k)
                W_list.append(pca.components_.T)
                sigma_sq_list.append(0.1)
            components = np.array([np.sum(kmeans.labels_ == k) / n_examples
                                   for k in range(self.n_components)])
            params_init = {'mu_list': mu_list,
                           'W_list': W_list,
                           'sigma_sq_list': sigma_sq_list,
                           'components': components}
            return params_init

    def _e_step_no_miss(self, X, params):
        """ E-Step of the EM-algorithm.

        The E-step takes the existing parameters, for the components, bias
        and noise variance and computes sufficient statistics for the M-Step
        by taking the expectation of latent variables conditional on the
        visible variables. Also returns the likelihood for the data X and
        projections into latent space of the data.

        Args
        ----
        X : array, [nExamples, nFeatures]
            Matrix of training data, where nExamples is the number of
            examples and nFeatures is the number of features.
        W : array, [dataDim, latentDim]
            Component matrix data. Maps latent points to data space.
        b : array, [dataDim,]
            Data bias.
        sigmaSq : float
            Noise variance parameter.

        Returns
        -------
        ss : dict

        proj :

        ll :
        """
        # Get params
        mu_list = params['mu_list']
        components = params['components']
        W_list = params['W_list']
        sigma_sq_list = params['sigma_sq_list']
        n_examples, data_dim = X.shape

        # Compute responsibilities
        r = np.zeros([n_examples, self.n_components])

        # Get Sigma from params
        Sigma_list = self._params_to_Sigma(params)

        for k, mu, Sigma in zip(range(self.n_components), mu_list, Sigma_list):
            r[:, k] = multivariate_normal.pdf(X, mu, Sigma)
        r = r * components
        r_sum = r.sum(axis=1)
        responsibilities = r / r_sum[:, np.newaxis]

        # Get sufficient statistics E[z] and E[zz^t] for each component
        z_list = []
        zz_list = []
        xz_list = []
        xx_list = []
        for mu, W, sigma_sq in zip(mu_list, W_list, sigma_sq_list):
            dev = X - mu
            F_inv = (np.linalg.inv(W.T.dot(W) +
                     sigma_sq*np.eye(self.latent_dim)))
            z = dev.dot(W).dot(F_inv)
            z_list.append(z)
            zz = sigma_sq*F_inv + z[:, :, np.newaxis] * z[:, np.newaxis, :]
            zz_list.append(zz)
            xx = dev[:, :, np.newaxis] * dev[:, np.newaxis, :]
            xx_list.append(xx)
            xz = dev[:, :, np.newaxis] * z[:, np.newaxis, :]
            xz_list.append(xz)

        # Store sufficient statistics in dictionary
        ss = {'responsibilities': responsibilities,
              'x_list': [X for k in range(self.n_components)],
              'xx_list': xx_list,
              'xz_list': xz_list,
              'z_list': z_list,
              'zz_list': zz_list}

        # Compute log-likelihood
        sample_ll = np.log(r_sum)

        return ss, sample_ll

    def _e_step_miss(self, X, params):
        """ E-Step of the EM-algorithm.

        The E-step takes the existing parameters, for the components, bias
        and noise variance and computes sufficient statistics for the M-Step
        by taking the expectation of latent variables conditional on the
        visible variables. Also returns the likelihood for the data X and
        projections into latent space of the data.

        Args
        ----
        X : array, [nExamples, nFeatures]
            Matrix of training data, where nExamples is the number of
            examples and nFeatures is the number of features.
        W : array, [dataDim, latentDim]
            Component matrix data. Maps latent points to data space.
        b : array, [dataDim,]
            Data bias.
        sigmaSq : float
            Noise variance parameter.

        Returns
        -------
        ss : dict

        proj :

        ll :
        """
        # Get current params
        mu_list = params['mu_list']
        components = params['components']
        sigma_sq_list = params['sigma_sq_list']
        W_list = params['W_list']

        # Get Sigma from params
        Sigma_list = self._params_to_Sigma(params)

        observed_list = [np.array(np.where(~np.isnan(row))).flatten() for
                         row in X]
        n_examples, data_dim = np.shape(X)

        # Loop over data points computing responsibilities
        r = np.zeros([n_examples, self.n_components])
        for n in range(n_examples):
            id_obs = observed_list[n]
            row = X[n, :]
            row_obs = row[id_obs]
            for k, mu, Sigma in zip(range(self.n_components), mu_list,
                                    Sigma_list):
                mu_obs = mu[id_obs]
                Sigma_obs = Sigma[np.ix_(id_obs, id_obs)]
                r[n, k] = multivariate_normal.pdf(row_obs[np.newaxis, :],
                                                  mu_obs, Sigma_obs)
        r = r * components
        r_sum = r.sum(axis=1)
        responsibilities = r / r_sum[:, np.newaxis]

        x_list = []
        xx_list = []
        z_list = []
        zz_list = []
        xz_list = []

        for k, mu, W, sigma_sq in zip(range(self.n_components), mu_list,
                                      W_list, sigma_sq_list):

            x_tot = np.zeros([n_examples, data_dim])
            xx_tot = np.zeros([n_examples, data_dim, data_dim])
            z_tot = np.zeros([n_examples, self.latent_dim])
            zz_tot = np.zeros([n_examples, self.latent_dim, self.latent_dim])
            xz_tot = np.zeros([n_examples, self.data_dim, self.latent_dim])

            for n in range(n_examples):
                id_obs = observed_list[n]
                id_miss = np.setdiff1d(np.arange(data_dim), id_obs)
                n_miss = len(id_miss)
                row = X[n, :]
                row_obs = row[id_obs]

#                # Simplify for case with no missing data
#                if n_miss == 0:
#                    x_tot[n] = row_obs
#                    xx_tot[n] = np.outer(row_obs, row_obs)

                # Get missing and visible points
                W_obs = W[id_obs, :]
                W_miss = W[id_miss, :]
                mu_obs = mu[id_obs]
                mu_miss = mu[id_miss]
                row_min_mu = row_obs - mu_obs

                # Get conditional distribution of p(z | x_vis, params)
                F_inv = (np.linalg.inv(W_obs.T.dot(W_obs) +
                         sigma_sq*np.eye(self.latent_dim)))
                cov_z_cond = sigma_sq*F_inv
                mean_z_cond = F_inv.dot(W_obs.T).dot(row_obs - mu_obs)

                # Get conditional distribution of p(x_miss | z, params)
                mean_x_miss = W_miss.dot(mean_z_cond) + mu_miss

                # Append sufficient statistics
                z_tot[n] = mean_z_cond
                zz_tot[n] = cov_z_cond + np.outer(mean_z_cond, mean_z_cond)

                x_tot[n, id_obs] = row_obs
                x_tot[n, id_miss] = mean_x_miss

                xz_tot[n, id_miss, :] = W_miss.dot(zz_tot[n])
                xz_tot[n, id_obs, :] = np.outer(row_min_mu, mean_z_cond)

                xx = np.empty([data_dim, data_dim])
                xx[np.ix_(id_obs, id_obs)] = np.outer(row_min_mu, row_min_mu)
                xx[np.ix_(id_obs, id_miss)] = np.outer(row_min_mu,
                                                       mean_x_miss - mu_miss)
                xx[np.ix_(id_miss, id_obs)] = np.outer(mean_x_miss - mu_miss,
                                                       row_min_mu)
                xx[np.ix_(id_miss, id_miss)] = (W_miss.dot(zz_tot[n]).
                                                dot(W_miss.T) +
                                                sigma_sq*np.eye(n_miss))
                xx_tot[n] = xx

            x_list.append(x_tot)
            xx_list.append(xx_tot)
            z_list.append(z_tot)
            zz_list.append(zz_tot)
            xz_list.append(xz_tot)

        # Store sufficient statistics in dictionary
        ss = {'responsibilities': responsibilities,
              'x_list': x_list,
              'xx_list': xx_list,
              'xz_list': xz_list,
              'z_list': z_list,
              'zz_list': zz_list}

        # Compute log-likelihood
        sample_ll = np.log(r_sum)

        return ss, sample_ll

    def _m_step(self, ss, params):
        """ M-Step of the EM-algorithm.

        The M-step takes the sufficient statistics computed in the E-step, and
        maximizes the expected complete data log-likelihood with respect to the
        parameters.

        Args
        ----
        ss : dict

        Returns
        -------
        params : dict

        """
        resp = ss['responsibilities']
        x_list = ss['x_list']
        z_list = ss['z_list']
        zz_list = ss['zz_list']
        xz_list = ss['xz_list']
        xx_list = ss['xx_list']
        W_list_old = params['W_list']

        # Update components param
        components = np.mean(resp, axis=0)

        # Update mean / Sigma params
        mu_list = []
        W_list = []
        sigma_sq_list = []
        for r, W, x, z, zz, xz, xx in zip(resp.T, W_list_old, x_list, z_list,
                                          zz_list, xz_list, xx_list):
            resid = x - z.dot(W.T)
            mu = np.sum(resid*r[:, np.newaxis], axis=0) / r.sum()
            mu_list.append(mu)
            W1 = np.sum(xz*r[:, np.newaxis, np.newaxis], axis=0)
            W2 = np.linalg.inv(np.sum(zz*r[:, np.newaxis, np.newaxis],
                                      axis=0))
            W = W1.dot(W2)
            W_list.append(W)
            s1 = np.trace(xx, axis1=1, axis2=2)
            s2 = -2*np.trace(xz.dot(W.T), axis1=1, axis2=2)
            s3 = np.trace(zz*W.T.dot(W), axis1=1, axis2=2)
            sigma_sq = np.sum(r*(s1 + s2 + s3)) / (self.data_dim * r.sum())
            sigma_sq_list.append(sigma_sq)

        # Store params in dictionary
        params = {'W_list': W_list,
                  'sigma_sq_list': sigma_sq_list,
                  'mu_list': mu_list,
                  'components': components}
        return params

    def _params_to_Sigma(self, params):
        W_list = params['W_list']
        sigma_sq_list = params['sigma_sq_list']
        Sigma_list = [W.dot(W.T) + sigma_sq*np.eye(self.data_dim)
                      for W, sigma_sq in zip(W_list, sigma_sq_list)]
        return Sigma_list


class MFA(GMM):

    def __init__(self, n_components, latent_dim, tol=1e-3, max_iter=1000,
                 random_state=0, verbose=True, robust=False, small=1e-4):

        super(MFA, self).__init__(n_components=n_components, tol=tol,
                                    max_iter=max_iter,
                                    random_state=random_state,
                                    verbose=verbose, robust=robust)
        self.latent_dim = latent_dim
        self.SMALL = small

    def _init_params(self, X, init_method='kmeans'):
        seed(self.random_state)
        n_examples = X.shape[0]
        if init_method == 'kmeans':
            kmeans = KMeans(self.n_components)
            if self.missing_data:
                imputer = Imputer()
                X = imputer.fit_transform(X)
            kmeans.fit(X)
            mu_list = [k for k in kmeans.cluster_centers_]
            W_list = []
            Psi_list = []
            for k in range(self.n_components):
                X_k = X[kmeans.labels_ == k, :]
                if X_k.shape[0] < 500*self.data_dim:
                    W_list.append(np.random.randn(self.data_dim,
                                                  self.latent_dim))
                    Psi_list.append(10*np.eye(self.data_dim))
                else:
                    fa = FactorAnalysis(n_components=self.latent_dim)
                    fa.fit(X_k)
                    W_list.append(fa.components_.T)
                    Psi_list.append(np.diag(fa.noise_variance_))
            components = np.array([np.sum(kmeans.labels_ == k) / n_examples
                                   for k in range(self.n_components)])
            params_init = {'mu_list': mu_list,
                           'W_list': W_list,
                           'Psi_list': Psi_list,
                           'components': components}
            return params_init

    def _e_step_no_miss(self, X, params):
        """ E-Step of the EM-algorithm.

        The E-step takes the existing parameters, for the components, bias
        and noise variance and computes sufficient statistics for the M-Step
        by taking the expectation of latent variables conditional on the
        visible variables. Also returns the likelihood for the data X and
        projections into latent space of the data.

        Args
        ----
        X : array, [nExamples, nFeatures]
            Matrix of training data, where nExamples is the number of
            examples and nFeatures is the number of features.
        W : array, [dataDim, latentDim]
            Component matrix data. Maps latent points to data space.
        b : array, [dataDim,]
            Data bias.
        sigmaSq : float
            Noise variance parameter.

        Returns
        -------
        ss : dict

        proj :

        ll :
        """
        # Get params
        mu_list = params['mu_list']
        components = params['components']
        W_list = params['W_list']
        Psi_list = params['Psi_list']
        n_examples, data_dim = X.shape

        # Get Sigma from params
        Sigma_list = self._params_to_Sigma(params)

        # Compute responsibilities
        log_r = np.zeros([n_examples, self.n_components])
        for k, mu, Sigma, W, Psi in zip(range(self.n_components), mu_list,
                                        Sigma_list, W_list, Psi_list):

#            print('Sigma = {}'.format(Sigma))
#            print('Psi: {}'.format(Psi))
#            print('WW.T: {}'.format(W.dot(W.T)))
            try:
                log_r[:, k] = multivariate_normal.logpdf(X, mu, Sigma)
#            except ValueError:
#                print('ValueError: Sigma = {}'.format(Sigma))
#                print('Eigenvalues: {}'.format(np.linalg.eigvals(Sigma)))
#                print('Psi: {}'.format(Psi))
#                print('WW.T: {}'.format(W.dot(W.T)))
            except np.linalg.linalg.LinAlgError:
                if self.robust:
#                    print(Sigma)
                    Sigma_robust = Sigma + self.SMALL*np.eye(self.data_dim)
                    log_r[:, k] = multivariate_normal.logpdf(X, mu,
                                                             Sigma_robust)
                else:
                    error_msg = ('Covariance matrix ill-conditioned. Use ' +
                                 'robust=True to pre-condition covariance ' +
                                 'matrices or choose fewer mixture ' +
                                 'components')
                    raise np.linalg.linalg.LinAlgError(error_msg)
        log_r = log_r + np.log(components)
        log_r_sum = sp.misc.logsumexp(log_r, axis=1)
        responsibilities = np.exp(log_r - log_r_sum[:, np.newaxis])

        # Get sufficient statistics E[z] and E[zz^t] for each component
        z_list = []
        zz_list = []
        xz_list = []
        zx_list = []
        xx_list = []
        for mu, W, Psi in zip(mu_list, W_list, Psi_list):
            dev = X - mu
            F = W.dot(W.T) + Psi
#            print(F)
            F_inv = np.linalg.inv(W.dot(W.T) + Psi)
            z = dev.dot(F_inv.dot(W))
            try:
                F_inv_W = np.linalg.solve(F, W)
            except np.linalg.linalg.LinAlgError:
                if self.robust:
                    F_robust = F + self.SMALL*np.eye(self.data_dim)
                    F_inv_W = np.linalg.solve(F_robust, W)
                else:
                    error_msg = ('Matrix ill-conditioned. Use ' +
                                 'robust=True to pre-condition covariance ' +
                                 'matrices or choose fewer mixture ' +
                                 'components')
                    raise np.linalg.linalg.LinAlgError(error_msg)
            z = dev.dot(F_inv_W)
            z_list.append(z)
            zz = (np.eye(self.latent_dim) - W.T.dot(F_inv_W) +
                  z[:, :, np.newaxis] * z[:, np.newaxis, :])
            zz_list.append(zz)
            xx = dev[:, :, np.newaxis] * dev[:, np.newaxis, :]
            xx_list.append(xx)
            xz = dev[:, :, np.newaxis] * z[:, np.newaxis, :]
            xz_list.append(xz)
            zx = z[:, :, np.newaxis] * dev[:, np.newaxis, :]
            zx_list.append(zx)
        x_list = [X for k in range(self.n_components)]

        # Store sufficient statistics in dictionary
        ss = {'responsibilities': responsibilities,
              'x_list': x_list,
              'xx_list': xx_list,
              'xz_list': xz_list,
              'zx_list': zx_list,
              'z_list': z_list,
              'zz_list': zz_list}

        # Compute log-likelihood
        sample_ll = log_r_sum

        return ss, sample_ll

    def _e_step_miss(self, X, params):
        """ E-Step of the EM-algorithm.

        The E-step takes the existing parameters, for the components, bias
        and noise variance and computes sufficient statistics for the M-Step
        by taking the expectation of latent variables conditional on the
        visible variables. Also returns the likelihood for the data X and
        projections into latent space of the data.

        Args
        ----
        X : array, [nExamples, nFeatures]
            Matrix of training data, where nExamples is the number of
            examples and nFeatures is the number of features.
        W : array, [dataDim, latentDim]
            Component matrix data. Maps latent points to data space.
        b : array, [dataDim,]
            Data bias.
        sigmaSq : float
            Noise variance parameter.

        Returns
        -------
        ss : dict

        proj :

        ll :
        """
        # Get current params
        mu_list = params['mu_list']
        components = params['components']
        Psi_list = params['Psi_list']
        W_list = params['W_list']

        # Get Sigma from params
        Sigma_list = self._params_to_Sigma(params)

        observed_list = [np.array(np.where(~np.isnan(row))).flatten() for
                         row in X]
        n_examples, data_dim = np.shape(X)

        # Compute responsibilities
        log_r = np.zeros([n_examples, self.n_components])
        for n in range(n_examples):
            id_obs = observed_list[n]
            row = X[n, :]
            row_obs = row[id_obs]
            for k, mu, Sigma in zip(range(self.n_components), mu_list,
                                    Sigma_list):
                mu_obs = mu[id_obs]
                Sigma_obs = Sigma[np.ix_(id_obs, id_obs)]
                try:
                    log_r[:, k] = multivariate_normal.logpdf(row_obs, mu_obs, Sigma_obs)
                except np.linalg.linalg.LinAlgError:
                    if self.robust:
                        Sigma_robust = Sigma_obs + self.SMALL*np.eye(self.data_dim)
                        log_r[:, k] = multivariate_normal.logpdf(row_obs, mu_obs,
                                                                 Sigma_robust)
                    else:
                        error_msg = ('Covariance matrix ill-conditioned. Use ' +
                                     'robust=True to pre-condition covariance ' +
                                     'matrices or choose fewer mixture ' +
                                     'components')
                        raise np.linalg.linalg.LinAlgError(error_msg)
        log_r = log_r + np.log(components)
        log_r_sum = sp.misc.logsumexp(log_r, axis=1)
        responsibilities = np.exp(log_r - log_r_sum[:, np.newaxis])

        x_list = []
        xx_list = []
        z_list = []
        zz_list = []
        xz_list = []
        zx_list = []
        for k, mu, W, Psi in zip(range(self.n_components), mu_list, W_list,
                                 Psi_list):
            Psi_inv = np.linalg.inv(Psi)
            x_tot = np.zeros([n_examples, data_dim])
            xx_tot = np.zeros([n_examples, data_dim, data_dim])
            z_tot = np.zeros([n_examples, self.latent_dim])
            zz_tot = np.zeros([n_examples, self.latent_dim, self.latent_dim])
            xz_tot = np.zeros([n_examples, self.data_dim, self.latent_dim])
            zx_tot = np.zeros([n_examples, self.latent_dim, self.data_dim])

            for n in range(n_examples):
                id_obs = observed_list[n]
                id_miss = np.setdiff1d(np.arange(data_dim), id_obs)
#                n_miss = len(id_miss)
                row = X[n, :]
                row_obs = row[id_obs]

#                # Simplify for case with no missing data
#                if n_miss == 0:
#                    x_tot[n] = row_obs
#                    xx_tot[n] = np.outer(row_obs, row_obs)

                # Get missing and visible points
#                Psi_obs = Psi[np.ix_(id_obs,id_obs)]
                Psi_miss = Psi[np.ix_(id_miss, id_miss)]
                Psi_inv_obs = Psi_inv[np.ix_(id_obs, id_obs)]
                W_obs = W[id_obs, :]
                W_miss = W[id_miss, :]
                mu_obs = mu[id_obs]
                mu_miss = mu[id_miss]
                row_min_mu = row_obs - mu_obs

                # Get conditional distribution of p(z | x_vis, params)
                Beta = (W_obs.T.dot(Psi_inv_obs - Psi_inv_obs.dot(W_obs).dot(
                        np.linalg.inv(np.eye(self.latent_dim) +
                        W_obs.T.dot(Psi_inv_obs).dot(W_obs)).dot(
                        W_obs.T.dot(Psi_inv_obs)))))
                mean_z_cond = Beta.dot(row_min_mu)
                cov_z_cond = np.eye(self.latent_dim) - Beta.dot(W_obs)

                # Get conditional distribution of p(x_miss | z, params)
                mean_x_miss = W_miss.dot(mean_z_cond) + mu_miss

                # Append sufficient statistics
                z_tot[n] = mean_z_cond
                zz_tot[n] = cov_z_cond + np.outer(mean_z_cond, mean_z_cond)

                x_tot[n, id_obs] = row_obs
                x_tot[n, id_miss] = mean_x_miss

                xz_tot[n, id_miss, :] = W_miss.dot(zz_tot[n])
                xz_tot[n, id_obs, :] = np.outer(row_min_mu, mean_z_cond)

                zx_tot[n, :, id_miss] = zz_tot[n].dot(W_miss.T).T
                zx_tot[n, :, id_obs] = np.outer(mean_z_cond, row_min_mu).T

                xx = np.empty([data_dim, data_dim])
                xx[np.ix_(id_obs, id_obs)] = np.outer(row_min_mu, row_min_mu)
                xx[np.ix_(id_obs, id_miss)] = np.outer(row_min_mu,
                                                       mean_x_miss - mu_miss)
                xx[np.ix_(id_miss, id_obs)] = np.outer(mean_x_miss - mu_miss,
                                                       row_min_mu)
                xx[np.ix_(id_miss, id_miss)] = (W_miss.dot(zz_tot[n]).dot(
                                                W_miss.T) + Psi_miss)
                xx_tot[n] = xx
            x_list.append(x_tot)
            xx_list.append(xx_tot)
            z_list.append(z_tot)
            zz_list.append(zz_tot)
            xz_list.append(xz_tot)
            zx_list.append(zx_tot)

        # Store sufficient statistics in dictionary
        ss = {'responsibilities': responsibilities,
              'x_list': x_list,
              'xx_list': xx_list,
              'xz_list': xz_list,
              'zx_list': zx_list,
              'z_list': z_list,
              'zz_list': zz_list}

        # Compute log-likelihood
        sample_ll = log_r_sum

        return ss, sample_ll

    def _m_step(self, ss, params):
        """ M-Step of the EM-algorithm.

        The M-step takes the sufficient statistics computed in the E-step, and
        maximizes the expected complete data log-likelihood with respect to the
        parameters.

        Args
        ----
        ss : dict

        Returns
        -------
        params : dict

        """
        resp = ss['responsibilities']
        x_list = ss['x_list']
        xx_list = ss['xx_list']
        xz_list = ss['xz_list']
        zx_list = ss['zx_list']
        z_list = ss['z_list']
        zz_list = ss['zz_list']
        W_list_old = params['W_list']

        # Update components param
        components = np.mean(resp, axis=0)

        # Update mean / Sigma params
        mu_list = []
        W_list = []
        Psi_list = []
#        n_examples = resp.shape[0]
        for r, W, x, xx, xz, zx, z, zz in zip(resp.T, W_list_old, x_list,
                                              xx_list, xz_list, zx_list,
                                              z_list, zz_list):


            # mu
            resid = x - z.dot(W.T)
            mu = np.sum(resid*r[:, np.newaxis], axis=0) / r.sum()
            mu_list.append(mu)

            # W
            W1 = np.sum(xz*r[:, np.newaxis, np.newaxis], axis=0)
            W2 = np.sum(zz*r[:, np.newaxis, np.newaxis], axis=0)
            try:
                W = np.linalg.solve(W2, W1.T).T
            except np.linalg.linalg.LinAlgError:
                if self.robust:
                    W2_robust = W2 + self.SMALL*np.eye(self.latent_dim)
                    W = np.linalg.solve(W2_robust, W1.T).T
                else:
                    error_msg = ('Matrix ill-conditioned. Use ' +
                                 'robust=True to pre-condition covariance ' +
                                 'matrices or choose fewer mixture ' +
                                 'components')
                    raise np.linalg.linalg.LinAlgError(error_msg)

            W = W1.dot(W2)
            W_list.append(W)

            # Psi
            s1 = xx*r[:, np.newaxis, np.newaxis]
            s2 = (W[np.newaxis, :, :].dot(zx*r[:, np.newaxis, np.newaxis])
                  [0, :, :, :].transpose((1, 0, 2)))
            Psi = np.diag(np.diag(np.sum(s1-s2, axis=0))) / r.sum()
            Psi_list.append(Psi)


        # Store params in dictionary
        params = {'W_list': W_list,
                  'Psi_list': Psi_list,
                  'mu_list': mu_list,
                  'components': components}
        return params

    def _params_to_Sigma(self, params):
        W_list = params['W_list']
        Psi_list = params['Psi_list']
        Sigma_list = [W.dot(W.T) + Psi for W, Psi in zip(W_list, Psi_list)]
        return Sigma_list