Source code for impyute.util.compare

"""impyute.util.compare.py"""
import importlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# pylint: disable=too-many-locals
# pylint: disable=dangerous-default-value


[docs]def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None): """ Given an imputed dataset with labels and a list of supervised machine learning model, find accuracy score of all model/imputation pairs. Parameters ---------- imputed: [(str, np.ndarray), (str, np.ndarray)...] List of tuples containing (imputation_name, imputed_data) where `imputation_name` is a string and `imputed_data` is a tuple where `imputed_data`[0] is the data, X and `imputed_data`[1] is the label, y classifiers: [str, str...str] (optional) Provide a list of classifiers to run imputed data sets on. Right now, it ONLY works with sklearn, the format should be like so: `sklearn.SUBMODULE.FUNCTION`. More generally its 'MODULE.SUBMODULE.FUNCTION'. If providing a custom classifier, make sure to add the file location to sys.path first and the classifier should also be structured like sklearn (with a `fit` and `predict` method). log_path: str (optional) To write results to a file, provide a relative path Returns ------- results.txt Classification results on imputed data """ clfs = [] for clf_name in classifiers: mod_name, smod_name, fn_name = clf_name.split(".") try: mod = importlib.import_module("{}.{}".format(mod_name, smod_name)) fn = getattr(mod, fn_name) clfs.append([fn_name, fn]) except ModuleNotFoundError: print("Cannot import '{}' from '{}.{}'".format(fn_name, mod_name, smod_name)) results = {imputation_name: [] for imputation_name, _ in imputed} for imputation_name, data in imputed: X, y = data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0) print("Imputation {} =========".format(imputation_name)) for clf_name, clf in clfs: clf = clf() clf.fit(X_train, y_train) y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) results[imputation_name].append((clf_name, accuracy)) print("...{}".format(clf_name)) # If not None, write to path if log_path: with open(log_path, 'w') as f: f.write(str(results)) print("Results saved to {}".format(log_path)) return results