Implemented stepwise EM
charlienash committed Jun 14, 2016
1 parent 93bde9e commit 52fc9af
Showing 5 changed files with 73 additions and 64 deletions.
Binary file modified __pycache__/pyMM.cpython-34.pyc
Binary file modified __pycache__/util.cpython-34.pyc
23 changes: 13 additions & 10 deletions demo.py
@@ -1,30 +1,33 @@
 import numpy as np
 from pyMM import GMM, SphericalGMM, DiagonalGMM, MPPCA, MFA
-from util import _generate_mixture_data, plot_density
+from util import _generate_mixture_data, plot_density, _gen_low_rank_data


 def main():
-    n_examples = 2500
-    data_dim = 500
+    n_examples = 2000
+    data_dim = 1000
     n_components = 1
-    X = _generate_mixture_data(data_dim, n_components, n_examples)
+    rank = 50
+    # X = _generate_mixture_data(data_dim, n_components, n_examples)
+    X = _gen_low_rank_data(data_dim, rank, n_examples)

     # Obscure data
     r = np.random.rand(n_examples, data_dim)
     X_miss = X.copy()
-    X_miss[r > 0.7] = np.nan
+    X_miss[r > 0.6] = np.nan

     # Initialize model
-    gmm = GMM(n_components=n_components, robust=True, SMALL=1e-5)
+    # gmm = GMM(n_components=n_components)
     # gmm = SphericalGMM(n_components=n_components, robust=True)
     # gmm = DiagonalGMM(n_components=n_components)
     # gmm = MPPCA(n_components=n_components, latent_dim=1)
-    # gmm = MFA(n_components=n_components, latent_dim=50)
+    gmm = MFA(n_components=n_components, latent_dim=40, tol=1e-3)

     # Fit GMM
-    # gmm.stepwise_fit(X, init_method='kmeans')
-    gmm.fit(X, init_method='kmeans')
-    # gmm.fit(X_miss, init_method='kmeans')
+    gmm.stepwise_fit(X_miss, init_method='kmeans', batch_size=200,
+                     step_alpha=0.7)
+    # gmm.fit(X, init_method='kmeans')
+    gmm.fit(X_miss, init_method='kmeans')

     # Plot results
     # plot_density(gmm, X=X, n_grid=50)
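A side note on the new demo (not part of the diff): with the mask r > 0.6, each entry of X is hidden independently with probability 0.4, so roughly 40% of the matrix becomes NaN, up from 30% under the old r > 0.7 threshold. A quick self-contained check of that masking pattern:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(2000, 1000)

# Hide each entry independently with probability 1 - 0.6 = 0.4.
r = rng.rand(*X.shape)
X_miss = X.copy()
X_miss[r > 0.6] = np.nan
print(np.isnan(X_miss).mean())  # ~0.4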
49 changes: 22 additions & 27 deletions pyMM.py
@@ -115,7 +115,7 @@ def _e_step_no_miss(self, X, params):
         # Get params
         mu_list = params['mu_list']
         components = params['components']
-        n_examples, data_dim = X.shape
+        n_examples = X.shape[0]

         # Get Sigma from params
         Sigma_list = self._params_to_Sigma(params)
@@ -124,7 +124,6 @@ def _e_step_no_miss(self, X, params):
         log_r = np.zeros([n_examples, self.n_components])
         for k, mu, Sigma in zip(range(self.n_components), mu_list,
                                 Sigma_list):
-            # print(np.linalg.eig(Sigma))
             try:
                 log_r[:, k] = multivariate_normal.logpdf(X, mu, Sigma)
             except (np.linalg.linalg.LinAlgError, ValueError):
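The except branch is collapsed in this view, so the following is only a hedged sketch of a common fallback when Sigma is not positive definite (plausibly what the robust/SMALL options seen in demo.py control, though the commit does not show this): retry with a small ridge on the diagonal.

import numpy as np
from scipy.stats import multivariate_normal

def logpdf_with_ridge(X, mu, Sigma, small=1e-5):
    # Hypothetical helper, not pyMM's actual fallback code.
    try:
        return multivariate_normal.logpdf(X, mu, Sigma)
    except (np.linalg.LinAlgError, ValueError):
        dim = Sigma.shape[0]
        return multivariate_normal.logpdf(X, mu, Sigma + small * np.eye(dim))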
@@ -373,22 +372,22 @@ def fit(self, X, params_init=None, init_method='kmeans'):
             Matrix of training data, where nExamples is the number of
             examples and nFeatures is the number of features.
         """
-        n_examples, data_dim = np.shape(X)
-        self.data_dim = data_dim
-        self.n_examples = n_examples
         if np.isnan(X).any():
             self.missing_data = True
         else:
             self.missing_data = False

+        # Check for missing values and remove if whole row is missing
+        X = X[~np.isnan(X).all(axis=1), :]
+        n_examples, data_dim = np.shape(X)
+        self.data_dim = data_dim
+        self.n_examples = n_examples
+
         if params_init is None:
             params = self._init_params(X, init_method)
         else:
             params = params_init

-        # Check for missing values and remove if whole row is missing
-        X = X[~np.isnan(X).all(axis=1), :]
-
         oldL = -np.inf
         for i in range(self.max_iter):
@@ -398,7 +397,7 @@ def fit(self, X, params_init=None, init_method='kmeans'):
             # Evaluate likelihood
             ll = sample_ll.mean() / self.data_dim
             if self.verbose:
-                print("Iter {:d} NLL: {:.3f} Change: {:.3f}".format(i,
+                print("Iter {:d} NLL: {:.4f} Change: {:.4f}".format(i,
                       -ll, -(ll-oldL)), flush=True)

             # Break if change in likelihood is small
@@ -421,7 +420,7 @@ def fit(self, X, params_init=None, init_method='kmeans'):
         self.isFitted = True

     def stepwise_fit(self, X, params_init=None, init_method='kmeans',
-                     batch_size=500, step_alpha=0.75):
+                     batch_size=250, step_alpha=0.7):
         """ Fit the model using EM with data X.
         Args
@@ -430,22 +429,22 @@ def stepwise_fit(self, X, params_init=None, init_method='kmeans',
             Matrix of training data, where nExamples is the number of
             examples and nFeatures is the number of features.
         """
-        n_examples, data_dim = np.shape(X)
-        self.data_dim = data_dim
-        self.n_examples = n_examples
         if np.isnan(X).any():
             self.missing_data = True
         else:
             self.missing_data = False

+        # Check for missing values and remove if whole row is missing
+        X = X[~np.isnan(X).all(axis=1), :]
+        n_examples, data_dim = np.shape(X)
+        self.data_dim = data_dim
+        self.n_examples = n_examples
+
         if params_init is None:
             params = self._init_params(X, init_method)
         else:
             params = params_init

-        # Check for missing values and remove if whole row is missing
-        X = X[~np.isnan(X).all(axis=1), :]
-
         # Get batch indices
         batch_id = np.hstack([np.arange(0, n_examples, batch_size),
                               n_examples])
@@ -465,7 +464,7 @@ def stepwise_fit(self, X, params_init=None, init_method='kmeans',
             # Evaluate likelihood
             ll = sample_ll.mean() / self.data_dim
             if self.verbose:
-                print("Iter {:d} NLL: {:.3f} Change: {:.3f}".format(i,
+                print("Iter {:d} NLL: {:.4f} Change: {:.4f}".format(i,
                       -ll, -(ll-oldL)), flush=True)

             # Break if change in likelihood is small
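For context, stepwise EM (Cappé & Moulines, 2009; Liang & Klein, 2009) runs the E-step on minibatches and blends each batch's expected sufficient statistics into a running average before every M-step. The loop body of stepwise_fit is collapsed in this view, so the following is only a minimal sketch of the idea, assuming a step size that decays as eta_t = (t + 2)**(-step_alpha); the e_step and m_step callables and the dict-of-arrays statistics are stand-ins, not pyMM's API.

import numpy as np

def stepwise_em_sketch(X, e_step, m_step, params, batch_size=250,
                       step_alpha=0.7, n_passes=10):
    # Batch boundaries, mirroring the batch_id construction above.
    n_examples = X.shape[0]
    batch_id = np.hstack([np.arange(0, n_examples, batch_size), n_examples])

    ss_avg = None  # running average of expected sufficient statistics
    t = 0          # global update counter across passes
    for _ in range(n_passes):
        for start, stop in zip(batch_id[:-1], batch_id[1:]):
            ss_batch = e_step(X[start:stop], params)  # dict of arrays
            eta = (t + 2) ** (-step_alpha)            # decaying step size
            if ss_avg is None:
                ss_avg = ss_batch
            else:
                ss_avg = {key: (1 - eta) * ss_avg[key] + eta * ss_batch[key]
                          for key in ss_avg}
            params = m_step(ss_avg, params)  # M-step on the blended statistics
            t += 1
    return params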
@@ -812,8 +811,7 @@ def _e_step_no_miss(self, X, params):
             ss_list.append(np.sum(r*(s1 + s2 + s3)))

         # Store sufficient statistics in dictionary
-        ss = {'n_examples': n_examples,
-              'r_list': r_list,
+        ss = {'r_list': r_list,
               'x_list': x_list,
               'xz_list': xz_list,
               'z_list': z_list,
@@ -985,8 +983,7 @@ def _e_step_miss(self, X, params):
             ss_list.append(ss_tot)

         # Store sufficient statistics in dictionary
-        ss = {'n_examples': n_examples,
-              'r_list': r_list,
+        ss = {'r_list': r_list,
               'x_list': x_list,
               'xz_list': xz_list,
               'z_list': z_list,
@@ -1014,7 +1011,7 @@ def _m_step(self, ss, params):
         params : dict
         """
-        n_examples = ss['n_examples']
+        n_examples = self.n_examples
         r_list = ss['r_list']
         x_list = ss['x_list']
         z_list = ss['z_list']
@@ -1204,8 +1201,7 @@ def _e_step_no_miss(self, X, params):
             zx_list.append(np.sum(zx*r[:, np.newaxis, np.newaxis], axis=0))

         # Store sufficient statistics in dictionary
-        ss = {'n_examples': n_examples,
-              'r_list': r_list,
+        ss = {'r_list': r_list,
               'x_list': x_list,
               'xx_list': xx_list,
               'xz_list': xz_list,
@@ -1383,8 +1379,7 @@ def _e_step_miss(self, X, params):
             zx_list.append(zx_tot)

         # Store sufficient statistics in dictionary
-        ss = {'n_examples': n_examples,
-              'r_list': r_list,
+        ss = {'r_list': r_list,
               'x_list': x_list,
               'xx_list': xx_list,
               'xz_list': xz_list,
@@ -1413,7 +1408,7 @@ def _m_step(self, ss, params):
         params : dict
         """
-        n_examples = ss['n_examples']
+        n_examples = self.n_examples
         r_list = ss['r_list']
         x_list = ss['x_list']
         xx_list = ss['xx_list']
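A plausible reading of the recurring ss changes above (my inference; the commit itself does not say): once stepwise_fit starts blending sufficient statistics across minibatches, a per-batch 'n_examples' entry would get blended along with them and stop counting anything meaningful, so the M-step now reads the example count from the model instead. A toy illustration of the pitfall:

import numpy as np

# Blending two batches' statistics also blends their example counts,
# yielding a normalizer that matches neither batch nor the full data set.
ss_a = {'n_examples': 200, 'r_sum': np.array([120.0, 80.0])}
ss_b = {'n_examples': 50, 'r_sum': np.array([30.0, 20.0])}

eta = 0.5
blended = {key: (1 - eta) * ss_a[key] + eta * ss_b[key] for key in ss_a}
print(blended['n_examples'])  # 125.0 -- not a real count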
65 changes: 38 additions & 27 deletions util.py
@@ -3,29 +3,29 @@
 from matplotlib.patches import Ellipse

 def _mv_gaussian_pdf(X, mu, Sigma):
-    """
+    """
     Get Gaussian probability density for given data points and parameters.
-    """
+    """
     return np.exp(_mv_gaussian_log_pdf(X, mu, Sigma))

 def _mv_gaussian_log_pdf(X, mu, Sigma):
-    """
-    Get Gaussian log probability density for given data points and
+    """
+    Get Gaussian log probability density for given data points and
     parameters.
-    """
+    """
     d = mu.size
     dev = X - mu
     Sigma_inv = np.linalg.inv(Sigma)
     log_det = np.log(np.linalg.det(Sigma))
     maha = np.diag(dev.dot(Sigma_inv).dot(dev.T))
     return -0.5 * (d*np.log(2*np.pi) + log_det + maha)

 def _get_rand_cov_mat(dim):
     Sigma = np.random.randn(dim, dim)
     Sigma = Sigma.dot(Sigma.T)
     Sigma = Sigma + dim*np.eye(dim)
     return Sigma

 def _generate_mixture_data(dim, n_components, n_samples):
     components = np.random.rand(n_components)
     components = components / np.sum(components)
@@ -35,14 +35,25 @@ def _generate_mixture_data(dim, n_components, n_samples):
     samples = np.zeros([n_samples, dim])
     for n in range(n_samples):
         r = np.random.rand(1)
-        z = np.argmin(r > components_cumsum)
-        samples[n] = np.random.multivariate_normal(mu_list[z], Sigma_list[z])
+        z = np.argmin(r > components_cumsum)
+        samples[n] = np.random.multivariate_normal(mu_list[z], Sigma_list[z])
     return samples


+def _gen_low_rank_data(dim, rank, n_samples):
+    sigma = 1.
+    rng = np.random.RandomState(42)
+    U, _, _ = np.linalg.svd(rng.randn(dim, dim))
+    X = np.dot(rng.randn(n_samples, rank), U[:, :rank].T)
+
+    # Adding heteroscedastic noise
+    sigmas = sigma * rng.rand(dim) + sigma / 2.
+    X_hetero = X + rng.randn(n_samples, dim) * sigmas
+    return X_hetero
+
 def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
     """
     Plots an `nstd` sigma error ellipse based on the specified covariance
-    matrix (`cov`). Additional keyword arguments are passed on to the
+    matrix (`cov`). Additional keyword arguments are passed on to the
     ellipse patch artist.

     Parameters
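The new _gen_low_rank_data helper draws rank-dimensional latent samples, embeds them along the first rank left singular vectors of a random matrix, and adds per-feature Gaussian noise with standard deviations in [sigma/2, 3*sigma/2]. A small usage sketch (assuming util.py is on the path; not part of the commit):

import numpy as np
from util import _gen_low_rank_data

X = _gen_low_rank_data(dim=100, rank=5, n_samples=2000)

# The sample covariance spectrum mixes the unit-variance low-rank signal
# with the heteroscedastic noise floor added above.
evals = np.linalg.eigvalsh(np.cov(X, rowvar=False))
print(np.sort(evals)[::-1][:10])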
@@ -52,7 +63,7 @@ def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
         sequence of [x0, y0].
     nstd : The radius of the ellipse in numbers of standard deviations.
         Defaults to 2 standard deviations.
-    ax : The axis that the ellipse will be plotted on. Defaults to the
+    ax : The axis that the ellipse will be plotted on. Defaults to the
         current axis.

     Additional keyword arguments are passed on to the ellipse patch.
@@ -77,37 +88,37 @@ def eigsorted(cov):

     ax.add_artist(ellip)
     return ellip
-def plot_density(model, x_range='auto', y_range='auto', n_grid=100,
-                 with_scatter=True, X=None, contour_options=None,
+
+def plot_density(model, x_range='auto', y_range='auto', n_grid=100,
+                 with_scatter=True, X=None, contour_options=None,
                  scatter_options=None, with_missing=False, X_miss=None):

     # Set default options
     if contour_options is None:
         contour_options = {'cmap' : plt.cm.plasma}
-    if scatter_options is None:
+    if scatter_options is None:
         scatter_options = {'color' : 'w', 'alpha' : 0.5, 'lw' : 0}
     scatter_options_miss = {'color' : 'r', 'alpha' : 0.5, 'lw' : 0}

     # Automatic x_range and y_range
     if x_range == 'auto' and X is not None:
         x_range = [X[:,0].min() - 2, X[:,0].max() + 2]
     if y_range == 'auto' and X is not None:
         y_range = [X[:,1].min() - 2, X[:,1].max() + 2]

     # Setup grid for contour plot
     x_vec = np.linspace(x_range[0], x_range[1], n_grid)
-    y_vec = np.linspace(y_range[0], y_range[1], n_grid)
-    x, y, = np.meshgrid(x_vec, y_vec)
-    X_grid = np.zeros([n_grid**2, 2])
+    y_vec = np.linspace(y_range[0], y_range[1], n_grid)
+    x, y, = np.meshgrid(x_vec, y_vec)
+    X_grid = np.zeros([n_grid**2, 2])
     X_grid[:,0] = x.reshape(n_grid**2)
     X_grid[:,1] = y.reshape(n_grid**2)

     # Get sample log-likelihood from model
     grid_ll = model.score_samples(X_grid)
     grid_prob = np.exp(grid_ll) # Convert to probability density
     grid_prob = grid_prob.reshape((n_grid, n_grid))

     # Plot contours
     plt.contourf(x, y, grid_prob, **contour_options)
@@ -118,13 +129,13 @@ def plot_density(model, x_range='auto', y_range='auto', n_grid=100,
         plt.scatter(X[~id_miss,0], X[~id_miss,1], **scatter_options)
         plt.scatter(X[id_miss,0], X[id_miss,1], **scatter_options_miss)
     else:
-        plt.scatter(X[:,0], X[:,1], **scatter_options) # data
+        plt.scatter(X[:,0], X[:,1], **scatter_options) # data

     # Plot options
     plt.xlim(x_range)
     plt.ylim(y_range)
     plt.xticks([])
     plt.yticks([])
-    plt.show()
+    plt.show()
     # fig = plt.gcf()
     # fig.set_size_inches(8, 8, forward=True)
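plot_density assumes two-dimensional data, which is why the plotting call stays commented out in the new high-dimensional demo. A minimal 2-D usage sketch (assuming pyMM and util are importable; not part of the commit):

from pyMM import GMM
from util import _generate_mixture_data, plot_density

X = _generate_mixture_data(2, 3, 500)  # 2-D data from a 3-component mixture
gmm = GMM(n_components=3)
gmm.fit(X, init_method='kmeans')
plot_density(gmm, X=X, n_grid=50)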
