Source code for impyute.imputation.cs.central_tendency
import numpy as np
from impyute.ops import matrix
from impyute.ops import wrapper
@wrapper.wrappers
@wrapper.checks
def mean(data):
    """ Substitute missing values with the mean of that column.

    The mean of a column is computed once, from its observed (non-NaN)
    entries only, and reused for every missing cell in that column.

    Parameters
    ----------
    data: numpy.ndarray
        Two-dimensional data to impute. Missing values are NaN.

    Returns
    -------
    numpy.ndarray
        Imputed data. The array received by this function body is filled
        in and returned (whether the caller's own array is touched depends
        on the ``wrapper`` decorators, which are not visible here).

    Notes
    -----
    A column that has no observed values at all keeps NaN in its missing
    cells, because the mean of an empty selection is NaN (NumPy also emits
    a ``RuntimeWarning`` in that case).
    """
    nan_xy = matrix.nan_indices(data)
    # Cache of column index -> mean of that column's observed values.
    # The first time a column shows up none of its cells have been filled
    # yet, so the cached mean is based purely on the original data.
    column_means = {}
    for x_i, y_i in nan_xy:
        if y_i not in column_means:
            column = data[:, y_i]
            column_means[y_i] = np.mean(column[~np.isnan(column)])
        data[x_i, y_i] = column_means[y_i]
    return data
@wrapper.wrappers
@wrapper.checks
def mode(data):
    """ Substitute missing values with the mode of that column(most frequent).

    In the case that there is a tie (there are multiple, most frequent values)
    for a column randomly pick one of them.

    Parameters
    ----------
    data: numpy.ndarray
        Two-dimensional data to impute. Missing values are NaN.

    Returns
    -------
    numpy.ndarray
        Imputed data. The array received by this function body is filled
        in and returned (whether the caller's own array is touched depends
        on the ``wrapper`` decorators, which are not visible here).

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when a column containing missing
        values has no observed values to pick a mode from.
    """
    nan_xy = matrix.nan_indices(data)
    modes = []
    for y_i in range(np.shape(data)[1]):
        column = data[:, y_i]
        # Drop NaNs *before* counting. Otherwise the NaN entries take part
        # in the frequency count and, when they are the most frequent
        # "value", no real value matches the maximum count and the list of
        # candidate modes ends up empty.
        observed = column[~np.isnan(column)]
        values, counts = np.unique(observed, return_counts=True)
        if counts.size:
            # Keep every value tied for the highest frequency.
            modes.append(values[counts == counts.max()])
        else:
            # No observed values in this column: no candidate modes.
            modes.append(values)
    for x_i, y_i in nan_xy:
        # Ties are broken independently for each missing cell.
        data[x_i, y_i] = np.random.choice(modes[y_i])
    return data