# Source code for flyqma.annotation.classification.classifiers

from os.path import join, exists
from os import mkdir
import gc
import numpy as np
from sklearn.cluster import k_means
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from ...utilities import IO


class ClassifierIO:
    """
    Methods for saving and loading classifier objects.

    Written as a mixin: ``save`` reads ``self._values`` and ``self.parameters``, while ``load`` instantiates the class as ``cls(values, **parameters)``.
    """

    def save(self, dirpath, data=False, image=True, extension=None, **kwargs):
        """
        Save classifier to specified path.

        Args:

            dirpath (str) - directory in which classifier is to be saved

            data (bool) - if True, save training data (skipped when no training data are attached)

            image (bool) - retained for backward compatibility; image rendering is currently disabled, so no image is written

            extension (str) - directory name extension

            kwargs: keyword arguments for image rendering (currently unused)

        Returns:

            path (str) - path to the classifier directory

        """
        # local import keeps this block self-contained; the module only imports mkdir
        from os import makedirs

        # create directory for classifier
        dirname = 'classifier'
        if extension is not None:
            dirname += '_{:s}'.format(str(extension))
        path = join(dirpath, dirname)

        # exist_ok avoids the check-then-create race and also creates a missing dirpath
        makedirs(path, exist_ok=True)

        # save values (nothing to write if the classifier holds no training data)
        if data and self._values is not None:
            np.save(join(path, 'values.npy'), self._values)

        # save parameters
        io = IO()
        io.write_json(join(path, 'parameters.json'), self.parameters)

        # NOTE: saving the labeled histogram image is intentionally disabled;
        # the <image> flag and <kwargs> are accepted but have no effect.

        return path

    @classmethod
    def load(cls, path):
        """
        Load classifier from file.

        Args:

            path (str) - path to classifier directory

        Returns:

            classifier (Classifier derivative) - instantiated with the stored values (None if no values were saved) and the stored parameters

        """
        io = IO()

        # training data are optional; fall back to None when they were not saved
        values_path = join(path, 'values.npy')
        values = io.read_npy(values_path) if exists(values_path) else None

        parameters = io.read_json(join(path, 'parameters.json'))
        return cls(values, **parameters)


class ClassifierProperties:
    """
    Properties for classifier objects.

    Written as a mixin: the properties read ``self._values``, ``self.log``, ``self.num_labels`` and ``self.model.means_``, none of which are defined here.
    """

    @property
    def num_samples(self):
        """ Number of samples (zero if no values are attached). """
        if self._values is None:
            return 0
        return len(self._values)

    @property
    def values(self):
        """ Values for classifier (log-transformed if <log> is set, None if no values are attached). """
        # without this guard, missing values returned None when log=False but raised when log=True
        if self._values is None:
            return None
        if self.log:
            return np.log(self._values)
        return self._values

    @property
    def order(self):
        """ Component indices sorted by their assigned label (low to high). """
        mapping = self.component_to_label
        return sorted(mapping, key=mapping.__getitem__)

    @property
    def component_groups(self):
        """ List of lists of components for each label, in ascending label order. """
        mapping = self.component_to_label
        labels = sorted(set(mapping.values()))
        return [[k for k, v in mapping.items() if v == label] for label in labels]

    @property
    def centroids(self):
        """ Means of each component (not log transformed). """
        centroids = self.model.means_
        if self.log:
            # model means live on the log scale when <log> is set; undo the transform
            centroids = np.exp(centroids)
        return centroids

    @property
    def component_to_label(self):
        """
        Returns dictionary mapping components to labels.  Mapping is achieved by k-means clustering the model centroids (linear scale).

        Clusters are ranked by their mean centroid value, so label 0 is assigned to the lowest cluster. The clustering is re-run on every access; no random_state is passed, so presumably the result is not guaranteed to be identical between accesses.
        """
        n = self.num_labels
        cluster_means, cluster_labels, _ = k_means(self.centroids, n, n_init=100)

        # rank clusters from lowest to highest mean centroid
        ranked_clusters = np.argsort(cluster_means.mean(axis=1))

        component_to_label = {}
        for label, cluster in enumerate(ranked_clusters):
            for component in (cluster_labels == cluster).nonzero()[0]:
                # store plain int keys rather than numpy integers
                component_to_label[int(component)] = label
        return component_to_label


class Classifier(ClassifierProperties, ClassifierIO):
    """
    Classifier base class. The methods of this class and its mixins rely on a ``predict`` method, a ``model`` attribute exposing ``means_``, and a ``classifier`` attribute (e.g. the function returned by ``build_classifier``). None of these are defined here, so children of this class must provide them.


    Attributes:

        values (array like) - basis for clustering

        attribute (list) - attribute(s) used to determine labels

        log (bool) - indicates whether clustering performed on log values

        num_labels (int) - number of output labels

        classifier (vectorized func) - maps value to label_id (set by children)

        cmap (func) - maps label_id to a color via a normalized colormap

        parameters (dict) - {param name: param value} pairs

        fig (matplotlib.figure.Figure) - histogram figure (None until rendered)

    """

    def __init__(self, values,
                 attribute=None,
                 num_labels=3,
                 log=True,
                 cmap=None):
        """
        Instantiate classifier mapping <n> clusters to <num_labels>.

        Args:

            values (np.ndarray[float]) - basis for clustering

            attribute (str or list) - attribute(s) on which to cluster

            num_labels (int) - number of class labels

            log (bool) - if True, cluster log-transformed values

            cmap (matplotlib.colors.Colormap) - colormap for cell labels

        """

        # set values, whether to log transform them, and number of clusters
        self._values = values
        self.log = log

        self.num_labels = num_labels

        # set colormap (requires num_labels to be set first)
        self.set_cmap(cmap=cmap)

        # store parameters; a single attribute name is wrapped in a list
        if isinstance(attribute, str):
            attribute = [attribute]
        self.attribute = attribute
        self.parameters = dict(num_labels=num_labels,
                               log=log,
                               attribute=attribute)
        self.fig = None

    def __call__(self, data):
        """
        Assign class labels to <data>. Shorthand for evaluate_classifier.

        Args:

            data (pd.DataFrame) - must contain necessary attributes

        Returns:

            labels (np.ndarray[int])

        """
        return self.evaluate_classifier(data)

    def evaluate_classifier(self, data):
        """
        Assign class labels to <data>.

        Args:

            data (pd.DataFrame) - must contain necessary attributes

        Returns:

            labels (np.ndarray[int])

        """
        x = data[self.attribute].values
        if self.log:
            x = np.log(x)
        return self.classifier(x)

    @classmethod
    def from_measurements(cls, data, attribute, **kwargs):
        """
        Fit classifier to data.

        Args:

            data (pd.DataFrame) - measurement data

            attribute (str or list) - attribute(s) on which to cluster

            kwargs: keyword arguments for classifier

        Returns:

            classifier (Classifier derivative)

        """
        return cls(data[attribute].values, attribute, **kwargs)

    @classmethod
    def from_grouped_measurements(cls,
                            data,
                            attribute,
                            groupby=None,
                            **kwargs):
        """
        Fit classifier to data grouped by a specified feature. The classifier is fit to the mean of <attribute> within each group.

        Args:

            data (pd.DataFrame) - measurement data

            attribute (str or list) - attribute(s) on which to cluster

            groupby (str or list) - attribute(s) used to group measurement data, defaults to disc_genotype, disc_id, layer and im_label

            kwargs: keyword arguments for classifier

        Returns:

            classifier (Classifier derivative)

        """

        if groupby is None:
            groupby = ['disc_genotype', 'disc_id', 'layer', 'im_label']
        elif isinstance(groupby, tuple) and groupby not in data.columns:
            # pandas treats a tuple as ONE key; a tuple that is not itself a
            # column label is taken to mean several keys, so pass it as a list
            groupby = list(groupby)

        values = data.groupby(by=groupby)[attribute].mean().values
        return cls(values, attribute, **kwargs)

    def show(self):
        """ Visualize classification. No-op in the base class. """
        pass

    def set_cmap(self, cmap=None, vmin=0, vmax=None):
        """
        Set colormap for class labels.

        Args:

            cmap (matplotlib.colormap) - defaults to viridis

            vmin (int) - lower bound for color scale

            vmax (int) - upper bound for color scale, defaults to num_labels - 1

        """

        # select colormap
        if cmap is None:
            cmap = plt.cm.viridis

        if vmax is None:
            vmax = self.num_labels - 1

        # normalize
        norm = Normalize(vmin=vmin, vmax=vmax)
        self.cmap = lambda x: cmap(norm(x))

    def build_colormap(self, cmap, vmin=-1):
        """
        Build normalized colormap for class labels.

        Args:

            cmap (matplotlib.colormap)

            vmin (float) - lower bound for colorscale

        Returns:

            colormap (func) - function mapping class labels to colors

        """
        norm = Normalize(vmin=vmin, vmax=self.num_labels-1)
        return lambda x: cmap(norm(x))

    def build_classifier(self):
        """
        Build function that returns the most probable label for each of a series of values.

        Returns:

            classifier (func) - maps values to labels by applying the component-to-label mapping to the output of the predict method

        """

        # build classifier that maps model components to labels; the mapping
        # is evaluated once here and reused by the returned function
        component_to_label = np.vectorize(self.component_to_label.get)

        def classifier(values):
            """ Returns <label> for <values>.  """
            return component_to_label(self.predict(values))

        return classifier
# Source code for flyqma.annotation.classification.classifiers
#
# Navigation
#
# Related Topics