Source code for flyqma.annotation.classification.classifiers

from os.path import join, exists
from os import mkdir
import gc
import numpy as np
from sklearn.cluster import k_means
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from ...utilities import IO


[docs]class ClassifierIO: """ Methods for saving and loading classifier objects. """
[docs] def save(self, dirpath, data=False, image=True, extension=None, **kwargs): """ Save classifier to specified path. Args: dirpath (str) - directory in which classifier is to be saved data (bool) - if True, save training data image (bool) - if True, save labeled histogram image extension (str) - directory name extension kwargs: keyword arguments for image rendering """ # create directory for classifier dirname = 'classifier' if extension is not None: dirname += '_{:s}'.format(str(extension)) path = join(dirpath, dirname) if not exists(path): mkdir(path) # save values if data: np.save(join(path, 'values.npy'), self._values) # save parameters io = IO() io.write_json(join(path, 'parameters.json'), self.parameters) # save image if image: pass # # visualize classification # self.show() # # save image # self.fig.savefig(join(path, 'classifier.pdf'), **kwargs) # self.fig.clf() # plt.close(self.fig) # gc.collect() return path
[docs] @classmethod def load(cls, path): """ Load classifier from file. Args: path (str) - path to classifier directory Returns: classifier (Classifier derivative) """ io = IO() values_path = join(path, 'values.npy') if exists(values_path): values = io.read_npy(values_path) else: values = None parameters = io.read_json(join(path, 'parameters.json')) return cls(values, **parameters)
[docs]class ClassifierProperties: """ Properties for classifier objects. """ @property def num_samples(self): """ Number of samples. """ return len(self._values) @property def values(self): """ Values for classifier. """ if self.log: return np.log(self._values) else: return self._values @property def order(self): """ Ordered component indices (low to high). """ x = self.component_to_label return sorted(x, key=x.__getitem__) @property def component_groups(self): """ List of lists of components for each label. """ x = self.component_to_label labels = np.unique(list(x.values())) return [[k for k, v in x.items() if v == l] for l in labels] @property def centroids(self): """ Means of each component (not log transformed). """ centroids = self.model.means_ if self.log: centroids = np.exp(centroids) return centroids @property def component_to_label(self): """ Returns dictionary mapping components to labels. Mapping is achieved by k-means clustering the model centroids (linear scale). """ n = self.num_labels cluster_means, cluster_labels, _ = k_means(self.centroids, n, n_init=100) component_to_label = {} for label, c in enumerate(np.argsort(cluster_means.mean(axis=1))): for d in (cluster_labels==c).nonzero()[0]: component_to_label[d] = label return component_to_label
[docs]class Classifier(ClassifierProperties, ClassifierIO): """ Classifier base class. Children of this class must possess a means attribute, as well as a predict method. Attributes: values (array like) - basis for clustering attribute (str or list) - attribute(s) used to determine labels log (bool) - indicates whether clustering performed on log values num_labels (int) - number of output labels classifier (vectorized func) - maps value to label_id labels (np.ndarray[int]) - predicted labels cmap (matplotlib.colors.ColorMap) - colormap for label_id parameters (dict) - {param name: param value} pairs fig (matplotlib.figures.Figure) - histogram figure """ def __init__(self, values, attribute=None, num_labels=3, log=True, cmap=None): """ Instantiate classifier mapping <n> clusters to <num_labels>. Args: values (np.ndarray[float]) - basis for clustering attribute (str or list) - attribute(s) on which to cluster num_labels (int) - number of class labels log (bool) - if True, cluster log-transformed values cmap (matplotlib.colors.ColorMap) - colormap for cell labels """ # set values, whether to log transform them, and number of clusters self._values = values self.log = log self.num_labels = num_labels # set colormap self.set_cmap(cmap=cmap) # store parameters if type(attribute) == str: attribute = [attribute] self.attribute = attribute self.parameters = dict(num_labels=num_labels, log=log, attribute=attribute) self.fig = None def __call__(self, data): """ Assign class labels to <data>. Args: data (pd.DataFrame) - must contain necessary attributes Returns: labels (np.ndarray[int]) """ return self.evaluate_classifier(data)
[docs] def evaluate_classifier(self, data): """ Assign class labels to <data>. Args: data (pd.DataFrame) - must contain necessary attributes Returns: labels (np.ndarray[int]) """ x = data[self.attribute].values if self.log: x = np.log(x) return self.classifier(x)
[docs] @classmethod def from_measurements(cls, data, attribute, **kwargs): """ Fit classifier to data. Args: data (pd.DataFrame) - measurement data attribute (str or list) - attribute(s) on which to cluster kwargs: keyword arguments for classifier Returns: classifier (Classifier derivative) """ return cls(data[attribute].values, attribute, **kwargs)
[docs] @classmethod def from_grouped_measurements(cls, data, attribute, groupby=None, **kwargs): """ Fit classifier to data grouped by a specified feature. Args: data (pd.DataFrame) - measurement data groupby (str) - attribute used to group measurement data attribute (str or list) - attribute(s) on which to cluster kwargs: keyword arguments for classifier Returns: classifier (Classifier derivative) """ if groupby is None: groupby = ('disc_genotype', 'disc_id', 'layer', 'im_label') values = data.groupby(by=groupby)[attribute].mean().values return cls(values, attribute, **kwargs)
[docs] def show(self): """ Visualize classification. """ pass
[docs] def set_cmap(self, cmap=None, vmin=0, vmax=None): """ Set colormap for class labels. Args: cmap (matplotlib.colormap) vmin (int) - lower bound for color scale vmax (int) - upper bound for color scale """ # select colormap if cmap is None: cmap = plt.cm.viridis if vmax is None: vmax = self.num_labels - 1 # normalize norm = Normalize(vmin=vmin, vmax=vmax) self.cmap = lambda x: cmap(norm(x))
[docs] def build_colormap(self, cmap, vmin=-1): """ Build normalized colormap for class labels. Args: cmap (matplotlib.colormap) vmin (float) - lower bound for colorscale Returns: colormap (func) - function mapping class labels to colors """ norm = Normalize(vmin=vmin, vmax=self.num_labels-1) return lambda x: cmap(norm(x))
[docs] def build_classifier(self): """ Build function that returns the most probable label for each of a series of values. """ # build classifier that maps model components to labels. component_to_label = np.vectorize(self.component_to_label.get) def classifier(values): """ Returns <label> for <values>. """ return component_to_label(self.predict(values)) return classifier