Source code for impyute.imputation.cs.em

import numpy as np
from impyute.ops import matrix
from impyute.ops import wrapper

@wrapper.wrappers
@wrapper.checks
def em(data, loops=50):
    """Impute missing values with a stochastic, EM-style resampling loop.

    Every missing cell reported by ``matrix.nan_indices`` is handled
    independently, using only the column it belongs to:

    * E-step: estimate the mean and standard deviation of the column's
      non-NaN entries (this includes the current guess for the cell being
      imputed, but not other cells that are still NaN).
    * M-step: replace the cell with a fresh draw from a normal distribution
      with those parameters.

    The loop for a cell ends after ``loops`` iterations, or earlier once the
    loop index exceeds 5 and the new draw differs from the previous one by
    less than 10% of the previous draw's magnitude.

    Draws come from NumPy's global random state, so results are only
    reproducible if the caller seeds it (``np.random.seed``).

    Parameters
    ----------
    data: numpy.nd.array
        Data to impute. Indexed as a 2-D array (``data[:, column]``).
    loops: int
        Maximum number of EM iterations to run per missing cell.
    inplace: boolean
        If True, operate on the numpy array reference.
        NOTE(review): not part of this signature; presumably consumed by
        the ``wrapper`` decorators -- confirm against ``impyute.ops``.

    Returns
    -------
    numpy.nd.array
        Imputed data.

    """
    nan_xy = matrix.nan_indices(data)
    for x_i, y_i in nan_xy:
        # For a NumPy array this column slice aliases ``data``, so writes to
        # ``col`` already land in ``data``; the explicit write-back below is
        # kept in case ``data`` is a type for which that does not hold.
        col = data[:, int(y_i)]

        # Seed the cell with one draw based on the observed values only.
        observed = col[~np.isnan(col)]
        col[x_i] = np.random.normal(loc=observed.mean(), scale=observed.std())

        # Arbitrary starting reference; the first comparisons are ignored
        # anyway because of the ``i > 5`` gate below.
        previous = 1
        for i in range(loops):
            # Expectation: re-estimate parameters, now including the guess.
            observed = col[~np.isnan(col)]
            mu = observed.mean()
            std = observed.std()

            # Maximization: redraw the cell from the updated distribution.
            col[x_i] = np.random.normal(loc=mu, scale=std)
            current = col[x_i]
            data[x_i][y_i] = current

            # Stop once the value changes by less than 10% relative to the
            # previous draw, but never before the loop index exceeds 5.
            # Magnitudes are compared because a signed ratio would treat
            # every decrease as convergence; the multiplicative form also
            # avoids dividing by zero when the previous draw is exactly 0.
            if i > 5 and abs(current - previous) < 0.1 * abs(previous):
                break
            previous = current
    return data