Source code for flyqma.annotation.classification.mixtures

from os.path import join, exists
from os import mkdir
import pickle
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt

from ...utilities import IO
from ..mixtures import UnivariateMixture, BivariateMixture

from .classifiers import Classifier, ClassifierIO
from .visualization import MixtureVisualization, BivariateMixtureVisualization


[docs]class MixtureModelIO(ClassifierIO): """ Methods for saving and loading classifier objects. """
[docs] def save(self, dirpath, data=False, image=True, extension=None, **kwargs): """ Save classifier to specified path. Args: dirpath (str) - directory in which classifier is to be saved data (bool) - if True, save training data image (bool) - if True, save labeled histogram image extension (str) - directory name extension kwargs: keyword arguments for image rendering """ # instantiate Classifier path = super().save(dirpath, data, image, extension, **kwargs) # save model (temporarily remove values) if self.model is not None: #self.model.values = None with open(join(path, 'model.pkl'), 'wb') as file: pickle.dump(self.model, file) return path
[docs] @classmethod def load(cls, path): """ Load classifier from file. Args: path (str) - path to classifier directory Returns: classifier (Classifier derivative) """ io = IO() values_path = join(path, 'values.npy') if exists(values_path): values = io.read_npy(values_path) else: values = None parameters = io.read_json(join(path, 'parameters.json')) # load model with open(join(path, 'model.pkl'), 'rb') as file: model = pickle.load(file) if values is not None: model.values = np.log(values) return cls(values, model=model, **parameters)
[docs]class UnivariateMixtureClassifier(MixtureModelIO, Classifier, MixtureVisualization): """ Univariate mixed log-normal model classifier. Attributes: model (mixtures.UnivariateMixture) - frozen univariate mixture model num_components (int) - number of mixture components classifier (vectorized func) - maps values to labels labels (np.ndarray[int]) - predicted labels Inherited attributes: values (np.ndarray[float]) - basis for clustering num_labels (int) - number of output labels log (bool) - indicates whether clustering performed on log values cmap (matplotlib.colors.ColorMap) - colormap for labels parameters (dict) - {param name: param value} pairs """ def __init__(self, values, num_components=3, num_labels=3, fit_kw={}, model=None, **kwargs): """ Fit a univariate mixture model classifier to an array of values. Args: values (np.ndarray[float]) - basis for clustering (not log-transformed) num_components (int) - number of mixture components num_labels (int) - number of class labels fit_kw (dict) - keyword arguments for fitting mixture model model (mixtures.UnivariateMixture) - pre-fitted model Keyword arguments: attribute (str or list) - attribute(s) on which to cluster cmap (matplotlib.colors.ColorMap) - colormap for class_id """ # instantiate classifier (remove redundant log parameter) if 'log' in kwargs.keys(): _ = kwargs.pop('log') super().__init__(values, num_labels=num_labels, log=True, **kwargs) self.parameters['num_components'] = num_components self.parameters['fit_kw'] = fit_kw # fit model if model is None: model = self.fit(self.values, num_components, **fit_kw) self.model = model # build classifier and posterior self.classifier = self.build_classifier() self.posterior = self.build_posterior() # assign labels if values is not None: self.labels = self.classifier(self.values) @property def num_components(self): """ Number of model components. """ return self.model.n_components @property def means(self): """ Mean of each component. """ return self.model.means
[docs] @staticmethod def fit(values, num_components=3, **kwargs): """ Fit univariate gaussian mixture model. Args: values (np.ndarray[float]) - 1D array of log-transformed values num_components (int) - number of model components kwargs: keyword arguments for fitting Returns: model (mixtures.UnivariateMixture) """ return UnivariateMixture.from_logsample(values, num_components, **kwargs)
[docs] def predict(self, values): """ Predict which component each of <values> belongs to. """ return self.model.predict(values)
[docs] def predict_proba(self, values): """ Predict the posterior probability with which each of <values> belongs to each component. """ return self.model.predict_proba(values)
[docs] def build_posterior(self): """ Build function that returns the posterior probability of each label given a series of values. """ def posterior(values): """ Returns probabilities of each label for <values>. """ # evaluate posterior probability of each label for each value p = self.model.predict_proba(values) _posterior = [p[:,i].sum(axis=1) for i in self.component_groups] _posterior = np.vstack(_posterior).T # fix label probabilities for points outside the support bounds below = values[:, 0] < self.model.lbound above = values[:, 0] > self.model.ubound for rows, col in zip((below.ravel(), above.ravel()), [0, -1]): if rows.sum() == 0: continue adjust = np.zeros((rows.sum(), self.num_labels), dtype=float) adjust[:, col] = 1. _posterior[rows] = adjust return _posterior return posterior
[docs] def evaluate_posterior(self, data): """ Returns posterior across components for <data>. """ x = data[self.attribute].values if self.log: x = np.log(x) return self.posterior(x)
[docs] def build_classifier(self): """ Build function that returns the most probable label for each of a series of values. """ def classifier(values): """ Returns <label> for <values> by maximizing posterior. """ return self.posterior(values).argmax(axis=1) return classifier
[docs]class BivariateMixtureClassifier(BivariateMixtureVisualization, UnivariateMixtureClassifier): """ Bivariate mixed log-normal model classifier. Attributes: model (mixtures.BivariateMixture) - frozen bivariate mixture model Inherited attributes: values (np.ndarray[float]) - basis for clustering attribute (list) - attributes on which to cluster num_labels (int) - number of labels num_components (int) - number of mixture components classifier (vectorized func) - maps values to labels labels (np.ndarray[int]) - predicted labels log (bool) - indicates whether clustering performed on log values cmap (matplotlib.colors.ColorMap) - colormap for labels parameters (dict) - {param name: param value} pairs """ def __getitem__(self, margin): """ Returns UnivariateMixtureClassifier for specified margin. """ return self.marginalize(margin)
[docs] def marginalize(self, margin): """ Returns UnivariateMixtureClassifier for specified margin. """ # assemble marginalized properties values = self._values[:, [margin]] model = self.model[margin] # duplicate parameters parameters = deepcopy(self.parameters) parameters['attribute'] = self.attribute[0] _ = parameters.pop('log') return UnivariateMixtureClassifier(values, model=model, **parameters)
[docs] @staticmethod def fit(values, num_components=3, **kwargs): """ Fit univariate gaussian mixture model. Args: values (np.ndarray[float]) - 1D array of log-transformed values num_components (int) - number of model components kwargs: keyword arguments for fitting Returns: model (mixtures.BivariateMixture) """ return BivariateMixture.from_logsample(values, num_components, **kwargs)