deconvolution.py

﻿#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Deconvolution
=============

Contains
  1. Tools to extract relevant components from a series of spectra
  2. Plotting facilities to show the results and assess the quality of the
  deconvolution
  3. A couple of utilities to simulate spectra.

Documentation
-------------
Code examples are indicated by three greater-than signs:

>>> x = 1916
>>> print(2014 - x)

Whenever a function is referred to in the documentation, its name is written
between backquotes. To view the documentation of a function `foo` in IPython,
just type
>>> foo?
To view the source code, in case you need to know more about how the function
`foo` works, or if the documentation is not enough, type
>>> foo??

Guidelines
----------
To work on a series of spectra, you can load it using the `acquire_data`
function. Then you may determine the number of relevant components using PCA
through the function `pca_step`, that also offers the additional benefit of
denoising the spectra. You may need to use `clean` before you call `nmf_step`
to perform the actual deconvolution.

To view one or more Raman spectra, be they experimental or artificial, you can
use `raman_plot`. To plot the mixing coefficients extracted from one or more
series of spectra, use `multi_mix_plot`. If you want to generate area graphs,
build a generator with `area_graph_generator`.

Dummy binary or ternary spectra can be generated with `generate_binary` or
`generate_ternary`. Since artificial spectra let you compare the deconvoluted
components and mixing coefficients with the true ones, you can use
`compare_components`, `compare_mixings` and `compare_ternary_mixing`.
"""

import glob
import contextlib
import numpy as np
from sklearn import decomposition
from scipy import optimize
from scipy import integrate
from scipy import interpolate
# Graphic imports
import matplotlib as mpl
from matplotlib import ticker
from matplotlib import gridspec
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Home module imports
import ternary


# rc settings
# Everything below is module-level configuration: the names defined here
# (`fig_width`, `fig_height`, `color_set`, ...) are read by the plotting
# functions further down, and `mpl.rcParams.update(params)` is executed as a
# side effect of importing this module.
# Font size applied to the label, tick and legend entries of `params`.
font_size = 10
# Figure width in points; converted to inches with `inches_per_pt` below.
fig_width_pt = 222.0 * 3.5
# Points-to-inches conversion factor (72.27 points per inch).
inches_per_pt = 1.0/72.27
# Height/width ratio used to derive the default figure height.
golden_mean = (np.sqrt(5) - 1.0) / 2.0
fig_width = fig_width_pt * inches_per_pt
fig_height = fig_width * golden_mean
fig_size = (fig_width, fig_height)
# color_set comes from brewer2mpl
# brewer2mpl.get_map('Set1', 'qualitative', 8).mpl_colors
# Eight RGB tuples; installed as the axes color cycle through `params`.
color_set = [(0.89411765, 0.10196078, 0.10980392),
             (0.21568627, 0.49411765, 0.72156863),
             (0.30196078, 0.68627451, 0.29019608),
             (0.59607843, 0.30588235, 0.63921569),
             (1.00000000, 0.49803922, 0.00000000),
             (1.00000000, 1.00000000, 0.20000000),
             (0.65098039, 0.33725490, 0.15686275),
             (0.96862745, 0.50588235, 0.74901961)]
# color_map comes from brewer2mpl
# brewer2mpl.get_map('RdBu', 'diverging', 6).mpl_colors
# Six RGB tuples going from red to blue.
color_map = [(0.69803922, 0.09411765, 0.16862745),
             (0.93725490, 0.54117647, 0.38431373),
             (0.99215686, 0.85882353, 0.78039216),
             (0.81960784, 0.89803922, 0.94117647),
             (0.40392157, 0.66274510, 0.81176471),
             (0.12941176, 0.40000000, 0.67450980)]
# blue_red originally comes from brewer2mpl
# brewer2mpl.get_map('RdBu', 'Diverging', 11,
#                    reverse=True).mpl_colormap
# It is now simply matplotlib's built-in `seismic` colormap.
blue_red = mpl.cm.seismic
# NOTE(review): 'text.fontsize' and 'axes.color_cycle' look like rcParams key
# names from older matplotlib releases; recent releases presumably reject
# them -- confirm against the installed matplotlib version.
params = {'backend': 'pdf', 'axes.labelsize': font_size,
          'text.fontsize': font_size, 'legend.fontsize': font_size,
          'xtick.labelsize': font_size, 'ytick.labelsize': font_size,
          'text.usetex': False, 'figure.figsize': fig_size,
          'font.family': 'serif', 'font.serif': 'times',
          'axes.color_cycle': color_set, 'legend.frameon': False,
          'legend.loc': 'best'}
# Applied globally at import time.
mpl.rcParams.update(params)


###############################################################################
#                      PART 1 : Raman spectra deconvolution                   #
###############################################################################


def acquire_data(path, delimiter=','):
    """
    Gets the data from csv files generated by the EMC2 Raman spectrometer,
    alphabetically sorted. Each spectrum becomes a line of the spectra array.

    The first line of every file is skipped (it is treated as a header), the
    first column is read as the wavenumber and the second column as the
    intensity. The wavenumbers are taken from the first file only.

    Parameters
    ----------
    path : str
        Unix-style glob pattern (as understood by `glob.glob`) matching the
        csv spectra files
    delimiter : char, default ','
        Character delimiting the columns of the CSV file.

    Returns
    -------
    sigma : numpy.ndarray, n_features
        1D array containing the wavenumbers at which the Raman intensities are
        recorded.
    spectra : numpy.ndarray, n_samples * n_features
        2D array, each line is the spectrum found in one file. The spectra are
        sorted alphabetically.

    Raises
    ------
    ValueError
        If no file matches `path`. Errors raised by `numpy.genfromtxt` while
        parsing a file (for instance because of a wrong delimiter) are
        propagated unchanged.

    Examples
    --------
    >>> ls
    my_csv_file_000.CSV my_csv_file_001.CSV my_csv_file_002.CSV
    my_csv_file_003.CSV my_csv_file_004.CSV my_csv_file_005.CSV
    my_csv_file_006.CSV my_csv_file_007.CSV my_csv_file_008.CSV
    my_csv_file_009.CSV my_csv_file_010.CSV my_csv_file_011.CSV
    >>> sigma, spectra = acquire_data('my_csv_file_*.CSV')

    Notes
    -----
    If you get an error 'got 1 columns instead of 1' then you picked the wrong
    delimiter. In IPython, you can have a peek inside your CSV file:
    >>> !head my_csv_file_000.CSV
    9.068555e+001;4.565897e+002
    9.116766e+001;4.636029e+002
    In this case, the right delimiter would be ';', so this call to
    `acquire_data` will work:
    >>> sigma, spectra = acquire_data('my_csv_file_*.CSV', delimiter=';')
    """
    data_files = sorted(glob.glob(path))
    # Detect the "no match" case explicitly, so that parsing errors are not
    # misreported as a missing file.
    if not data_files:
        message = 'No file matching pattern "{:s}" could be found'
        raise ValueError(message.format(path))
    intensities = [np.genfromtxt(f, delimiter=delimiter, skip_header=1,
                                 usecols=1)
                   for f in data_files]
    spectra = np.vstack(intensities)
    # All files are assumed to share the wavenumber axis of the first one.
    sigma = np.genfromtxt(data_files[0], delimiter=delimiter, skip_header=1,
                          usecols=0)
    return sigma, spectra


def _long_correction(sigma, lambda_laser):
    """
    Function computing the Long correction factor according to Long
    1977. This function can operate on numpy.ndarrays as well as on
    simple numbers.

    Parameters
    ----------
    sigma : numpy.ndarray
        Wavenumber in cm-1
    lambda_inc : float
        Laser wavelength in nm.

    Examples
    --------
    >>> sigma, i = deconvolution.acquire_data('my_raman_file.CSV')
    >>> corrected_i = i * long_correction(sigma)
    """
    c = 2.998e10                          # cm/s
    lambda_inc = lambda_laser * 1e-7      # cm
    sigma_inc = 1. / lambda_inc           # cm-1
    h = 6.63e-34                          # J.s
    T = 293.0                             # K
    kB = 1.38e-23                         # J/K
    return (sigma_inc**3 * sigma / (sigma_inc - sigma)**4
            * (1 - np.exp(-h*c*sigma/kB/T)))


def clean(sigma, raw_spectra, mode='area', delete=None, long_cor=532):
    """
    Cleans the spectra to remove abnormal ones, remove the baseline offset,
    correct temperature & frequency effects, and make them comparable
    by normalizing them according to their area or their maximum.

    The input array is never modified: the function works on a copy.

    Parameters
    ----------
    sigma : numpy.ndarray
        Wavenumber in cm-1
    raw_spectra : numpy.ndarray, n_spectra * n_features
        Input spectra. Non floating-point input (e.g. integer counts) is
        converted to float before processing.
    mode : {'area', 'max'}
        Controls how spectra are normalized. Any other value leaves the
        spectra un-normalized.
    delete : list of int, default None
        Spectra that should be removed, eg outliers
    long_cor : float, optional
        Laser wavelength in nm. If given, then temperature-frequency
        correction will be applied. If None or False, no correction is
        applied.

    Returns
    -------
    clean_spectra : numpy.ndarray, n_kept_spectra * n_features
        Cleaned and normalized spectra.

    Notes
    -----
    The 'area' normalization integrates each spectrum with a unit spacing
    between points, i.e. `sigma` is not used as the integration variable.
    """
    spectra = np.asarray(raw_spectra)
    # The in-place operations below need a floating-point array: integer
    # input would otherwise fail on the in-place multiplication/division.
    if not np.issubdtype(spectra.dtype, np.floating):
        spectra = spectra.astype(float)
    # Remove abnormal spectra (np.delete and np.copy both return a new array)
    if delete is not None:
        clean_spectra = np.delete(spectra, delete, axis=0)
    else:
        clean_spectra = np.copy(spectra)
    # Remove the offset
    clean_spectra -= clean_spectra.min(axis=1)[:, np.newaxis]
    # Apply Long correction
    if long_cor:
        clean_spectra *= _long_correction(sigma, long_cor)
    # Normalize the spectra
    if mode == 'max':
        clean_spectra /= clean_spectra.max(axis=1)[:, np.newaxis]
    elif mode == 'area':
        # `trapz` is absent from recent SciPy releases and `trapezoid` from
        # old ones: use whichever is available.
        trapezoid = getattr(integrate, 'trapezoid', None) or integrate.trapz
        clean_spectra /= trapezoid(clean_spectra)[:, np.newaxis]
    return clean_spectra


def pca_step(raw_spectra, graph=False):
    """
    Runs a PCA on the input spectra, estimates the number of significant
    components with the Malinowski indicator function, and rebuilds the
    spectra from these components only.

    Notes
    -----
    Malinowski's indicator function is explained at the following webpages
    http://www.vub.ac.be/fabi/multi/pcr/chaps/chap6.html
    http://pubs.acs.org/doi/pdf/10.1021/ac50012a027

    The number of significant components, and the number of components
    needed to exceed 95% and 99.5% of cumulated variance ratio, are printed.

    Parameters
    ----------
    raw_spectra : numpy.ndarray, n_spectra * n_features
        Input spectra
    graph : bool, default False
        If True, the explained variance ratio graph is shown

    Returns
    -------
    n_components : int
        Number of components from Malinowski's IND function
    denoised_spectra : numpy.ndarray, n_spectra * n_features
        Denoised spectra, reconstructed from PCA with n_components components.
    """
    n_samples, n_features = raw_spectra.shape
    model = decomposition.PCA()
    model.fit(raw_spectra)
    variance_ratio = model.explained_variance_ratio_.copy()
    eigenvalues = np.sqrt(n_samples * model.explained_variance_)
    cumulated_ratio = variance_ratio.cumsum()
    # Malinowski's indicator function (see the references in the docstring)
    remaining = n_samples - np.arange(n_samples)
    tail_sums = eigenvalues[::-1].cumsum()[::-1]
    real_error = np.sqrt(tail_sums / (n_features * remaining))
    indicator = real_error / remaining**2
    # The minimum is only searched in the first half of the indicator
    n_components = 1 + np.argmin(indicator[:int(n_samples / 2)])
    above_95 = 1 + np.argmax(cumulated_ratio > .95)
    above_995 = 1 + np.argmax(cumulated_ratio > .995)
    print('Significant components : {:d}'.format(n_components))
    print('Min number of spectra (> 95%) : {:d}'.format(above_95))
    print('Min number of spectra (> 99.5%) : {:d}'.format(above_995))
    # Denoising: refit with the retained components, then project back
    model.n_components = n_components
    scores = model.fit_transform(raw_spectra)
    denoised_spectra = model.inverse_transform(scores)
    if graph is True:
        plt.plot(np.arange(n_samples), variance_ratio, marker='d')
        plt.vlines(n_components - 1, 0, 1)
        plt.show()
    return n_components, denoised_spectra


def nmf_step(spectra, n_components, sparsity='components', beta=1e-5,
             **kwargs):
    """
    Performs the non-negative matrix factorization of the spectra into a
    partial spectra aka components matrix, and a mixing coefficients
    matrix.
    kwargs are passed to decomposition.NMF.

    Parameters
    ----------
    spectra : numpy.ndarray, n_spectra * n_features
        Clean input spectra
    n_components : int
        Number of significant components
    sparsity : {'data', 'components', None},  default 'components'
        Where to enforce sparsity in the model. Only used when the installed
        `decomposition.NMF` accepts a `sparseness` argument.
    beta : double, default 1e-5
        Degree of sparseness, if sparseness is not None. Larger values mean
        more sparseness. Only used when the installed `decomposition.NMF`
        accepts a `beta` argument.
    init : str, optional (keyword)
        Initialization method, default 'nndsvda'.
    max_iter : int, optional (keyword)
        Maximum number of iterations, default 1000.
    nls_max_iter : int, optional (keyword)
        Default 10000. Only forwarded when the installed `decomposition.NMF`
        accepts it.

    Returns
    -------
    components : numpy.ndarray, n_components * n_features
        Resulting components, aka H matrix
    mixing_matrix : numpy.ndarray, n_samples * n_components
        Resulting mixing coefficients, aka W matrix
    reconstruction_error : float
        Frobenius norm of (S - WH)

    Notes
    -----
    The reconstruction error is also printed.
    """
    init = kwargs.pop('init', 'nndsvda')
    max_iter = kwargs.pop('max_iter', 1000)
    nls_max_iter = kwargs.pop('nls_max_iter', 10000)

    try:
        # Old version of sklearn.decomposition.nmf
        nmf = decomposition.NMF(n_components=n_components, init=init,
                                sparseness=sparsity, beta=beta, tol=1e-5,
                                max_iter=max_iter, nls_max_iter=nls_max_iter,
                                **kwargs)
    except TypeError:
        # New version of sklearn.decomposition.nmf (sparseness, beta and
        # nls_max_iter are no longer arguments). Only the "unexpected keyword
        # argument" failure is caught, so that any other error, or a
        # KeyboardInterrupt, is not silently swallowed.
        nmf = decomposition.NMF(n_components=n_components, init=init,
                                tol=1e-5, max_iter=max_iter, **kwargs)

    mix = nmf.fit_transform(spectra)
    components = nmf.components_
    print('Reconstruction error: {:.3e}'.format(nmf.reconstruction_err_))
    return components, mix, nmf.reconstruction_err_


def all_in_one(pattern):
    """
    `all_in_one` takes care of the work in these lazy moments when typing is
    just too much of a hassle: it chains `acquire_data`, `pca_step`, `clean`
    and `nmf_step` with their default arguments.

    Parameters
    ----------
    pattern : str
        Unix-style glob pattern matching the csv spectra files

    Returns
    -------
    sigma : numpy.ndarray, n_features
        Wavenumbers, as returned by `acquire_data`.
    nmf_results : tuple
        The `(components, mixing_matrix, reconstruction_error)` tuple
        returned by `nmf_step`.
    """
    sigma, spectra = acquire_data(pattern)
    # `pca_step` returns exactly two values
    n_components, denoised_spectra = pca_step(spectra)
    cleaned_spectra = clean(sigma, denoised_spectra)
    return sigma, nmf_step(cleaned_spectra, n_components)


def project(spectra, components):
    """
    Decompose each spectrum on a fixed set of reference components, using the
    non-negative least squares solver of scipy.optimize.

    Parameters
    ----------
    spectra : numpy.ndarray, n_spectra * n_features
        Spectra to project.
    components : numpy.ndarray, n_components * n_features
        Components on which to project the spectra.

    Returns
    -------
    projections : numpy.ndarray, n_spectra * n_components
        Non-negative coefficients of each spectrum on the components.
    """
    basis = components.T
    coefficients = []
    for spectrum in spectra:
        # nnls returns (solution, residual norm): only the solution is kept
        solution, _ = optimize.nnls(basis, spectrum)
        coefficients.append(solution)
    return np.vstack(coefficients)


###############################################################################
#                          PART 2: PLOTTING                                   #
###############################################################################

@contextlib.contextmanager
def _printoptions(*args, **kwargs):
    """
    Set the precision of the priting of an array.

    Parameters
    ----------
    precision : int
    suppress : bool
    """
    original = np.get_printoptions()
    np.set_printoptions(*args, **kwargs)
    yield
    np.set_printoptions(**original)


def _make_ticklabels_invisible(ax, which=['y']):
    """
    Erase the ticks and their labels from the given directions of an Axes
    """
    for loc in which:
        if loc == 'y':
            for tl in ax.get_yticklabels():
                tl.set_visible(False)
        elif loc == 'x':
            for tl in ax.get_xticklabels():
                tl.set_visible(False)


def _parts(number, mixing_proportions, components):
    """
    Parameters
    ----------
    number : int
        Spectrum number
    mixing_proportions : np.ndarray, n_spectra * n_components
        Mixing proportions matrix of the components for each spectrum
    components : numpy.ndarray, n_components * n_features
        Partial Raman spectra
    """
    return mixing_proportions[number, np.newaxis].T * components

# NOTE(review): `tota` is not referenced anywhere in the visible part of this
# file -- presumably a leftover; confirm it is unused before removing it.
tota = ''
def area_graph_generator(sigma, ref_data, mix, components, legend=None,
                         **kwargs):
    """
    Generate an area graph plotting function for a given set of spectra and
    its components and mixing values.

    This avoids the hassle of constantly giving the mixing values and
    components along with the spectrum number.

    Parameters
    ----------
    sigma : numpy.ndarray, shape n_features
        The wavenumber array
    ref_data : numpy.ndarray, shape n_samples * n_features
        The reference spectra to which the reconstructed spectra are compared
        by subtraction.
    mix : numpy.ndarray, shape n_samples * n_components
        Mixing values, aka the weights of the components.
    components : numpy.ndarray, shape n_components * n_features
        Components, aka Partial Raman Spectra.
    legend : iterable of n_components strings, optional
        The legend to display on the graph. Must have exactly as many elements
        as there are components. If None, the legend will name them
        alphabetically, up to 15 components.
    alpha : float, optional (keyword)
        Opacity of the filled areas and of their legend patches, default .33.
        Other keyword arguments are currently ignored.

    Returns
    -------
    area_graph : function
        `area_graph(spectrum_number, spectral_step=100)` prints the area
        ratios of the components for this spectrum and returns the figure.

    Raises
    ------
    ValueError
        If `legend` is given but does not have exactly n_components items.

    Examples
    --------
    >>> extracted_mix.shape
    (170, 3)
    >>> extracted_components.shape
    (3, 715)
    >>> my_legend = ['First component', 'Extra component', 'Unknown component']
    >>> area_gen = area_graph_generator(sigma, cleaned_spectra, extracted_mix,
                                        extracted_components, legend=my_legend)
    >>> area_gen(50)
    Species           | Area ratio
    ------------------------------
    First component   | 37.4 %
    Extra component   | 5.8 %
    Unknown component | 56.8 %

    Notes
    -----
    The area graph plotting function that is generated does not accept
    kwargs later. They must be supplied here.

    The filled areas use the colors of `color_set`: components beyond the
    number of colors it contains are not drawn.
    """
    n_comp = components.shape[0]
    # kwargs processing
    alpha = kwargs.pop('alpha', .33)
    if legend and len(legend) == n_comp:
        legend = list(legend)
    elif legend is None:
        legend = list('abcdefghijklmno'[:n_comp])
    else:
        raise ValueError('Legend must have as many items as components.')
    # Environment variables for the closure
    colors = color_set[:n_comp]
    # The legend patches use the same opacity as the filled areas
    patches = [plt.Rectangle((0, 0), 1, 1, ec='none', fc=c, alpha=alpha)
               for c in colors]
    # `trapz` is absent from recent SciPy releases and `trapezoid` from old
    # ones: use whichever is available.
    trapezoid = getattr(integrate, 'trapezoid', None) or integrate.trapz

    def print_area_ratios(area_ratios):
        """Print a two-column table: component name, area ratio in %."""
        max_width = max(len('Species'),
                        max(len(elt) for elt in legend))
        headline_pattern = '{:{max_width}s} | Area ratio'
        content_line_pattern = '{:{max_width}s} | {:.1f} %'
        headline = headline_pattern.format('Species', max_width=max_width)
        print(headline)
        print('-' * len(headline))
        for elt, area in zip(legend, area_ratios):
            print(content_line_pattern.format(elt, area, max_width=max_width))
        print('\n' * 2)

    def area_graph(spectrum_number, spectral_step=100):
        """
        Plots a graph where the spectral parts of a given experimental
        spectrum are shown underneath it. Their sum is plotted as a continuous
        line, while the experimental spectrum is disconnected markers. The
        second part of the graph is the residual, ie the difference between the
        experimental spectrum and the reconstructed spectrum.

        Parameters
        ----------
        spectrum_number : int
            The index of the spectrum to plot.
        spectral_step : int, optional
            The spacing between major labeled ticks on the x axis.

        Returns
        -------
        fig : matplotlib Figure object

        Notes
        -----
        For clarity, the spectra are normalized to the max intensity.
        """
        # Data crunching
        spectra_parts = _parts(spectrum_number, mix, components)
        areas = trapezoid(spectra_parts)
        print_area_ratios(100 * areas / areas.sum())
        reconstructed = spectra_parts.sum(axis=0)
        reference = ref_data[spectrum_number]
        maximum = max(reference.max(), reconstructed.max())
        # Setup the plotting frame
        fig = plt.figure(figsize=(fig_width, fig_height * 1.2))
        layout = gridspec.GridSpec(2, 1, height_ratios=[4, 1], hspace=.2)
        ax = fig.add_subplot(layout[0])  # First plot: spectrum+spectral parts
        sigma_major_locator = ticker.MultipleLocator(spectral_step)
        sigma_minor_locator = ticker.MultipleLocator(spectral_step / 5)
        spectrum_formatter = ticker.FormatStrFormatter('% 3d')
        ax.set_title(r'Spectrum \#{:3d} deconvolution'.format(spectrum_number))
        ax.set_xlim(sigma.min(), sigma.max())
        ymin, ymax = 0, 100
        ax.set_ylim((ymin, ymax))
        new_yticks = np.linspace(ymin, ymax, 5, endpoint=True)[1:-1]
        ax.set_yticks(new_yticks)
        ax.xaxis.set_major_locator(sigma_major_locator)
        ax.xaxis.set_minor_locator(sigma_minor_locator)
        ax.set_xticklabels([])
        ax.yaxis.set_major_formatter(spectrum_formatter)
        ax.set_ylabel('Intensity / AU')
        ax.yaxis.set_label_coords(-.1, .5)
        ax.legend(patches, legend)
        ax2 = fig.add_subplot(layout[1])  # Second plot:residual
        ax2.axhline(color='k')
        ax2.set_xlim(ax.get_xlim())
        ax2.xaxis.set_major_locator(sigma_major_locator)
        ax2.xaxis.set_minor_locator(sigma_minor_locator)
        # The x label belongs to the bottom (residual) axes: the top axes has
        # its x tick labels blanked out.
        ax2.set_xlabel('Wavenumber / cm-1')
        ax2.yaxis.set_major_formatter(spectrum_formatter)
        ax2.set_ylabel('Residual / AU')
        ax2.yaxis.set_label_coords(-.1, .5)
        # Plot the spectra parts
        for l, leg, col in zip(spectra_parts, legend, colors):
            ax.fill_between(sigma, 0, 100 * l / maximum, alpha=alpha,
                            label=leg, color=col)
        # Plot the reconstructed versus clean spectrum
        ax.plot(sigma, 100 * reference / maximum, marker='o', mfc='none',
                linestyle='none', mec='k', ms=2, label='Exp. data', mew=.1,
                markevery=2)
        ax.plot(sigma, 100 * reconstructed / maximum, color='k', linewidth=.5,
                label='Reconstructed spectrum')
        # Plot the residual and adjust the axes
        ax2.plot(sigma, 100 * (reference - reconstructed) / maximum, mec='k',
                 linestyle='none', mfc='none', marker='o', markevery=2)
        ymin, ymax = ax2.get_ylim()
        new_yticks = np.linspace(ymin, ymax, 3, endpoint=True)
        # Drop the middle tick; integer division keeps the index an int
        new_yticks = np.delete(new_yticks, new_yticks.size // 2)
        ax2.set_yticks(new_yticks)
        # Final adjustments to align 1-digit labels with the 2-digit labels
        # from the 1st subplot, and adjust the spacing between curves
        for t in ax2.get_yticklabels():
            t.set_horizontalalignment('right')
        layout.tight_layout(fig, pad=.5)
        layout.update(hspace=.1)
        return fig
    return area_graph

def raman_plot(sigma, spectra, clear=True, legend=None, spectral_step=200,
               **kwargs):
    """
    Plots Raman spectra as a function of the wavenumber, with ticks and
    subticks.

    Parameters
    ----------
    sigma : numpy.ndarray, n_features
        List of the wavenumbers at which the intensities are recorded.
    spectra : numpy.ndarray, (n_spectra *) n_features
        Array containing one or more spectra to be plotted. The
        spectrum/spectra must have as many points as there are in `sigma`.
    clear : bool, default True
        Whether the plot frame should be cleared of numerical values on the
        y axis.
    legend : iterable of n_spectra strings, optional
        The legend to display on the graph. Must have exactly as many elements
        as there are spectra. If None, there will be no legend.
    spectral_step : int, default 200
        Major ticks on the x axis will be placed every `spectral_step`. Minor
        ticks will be placed every `spectral_step`/5.

    Returns
    -------
    fig : matplotlib Figure object
    ax : matplotlib Axes object

    Raises
    ------
    ValueError
        If `legend` is given but does not have one item per plotted line.

    Notes
    -----
    `kwargs` are passed to the `plot` call. The line width defaults to 1.25
    and can be set with either `lw` or `linewidth`.
    """
    # kwargs processing: accept both spellings so that `linewidth=...` does
    # not collide with the explicit keyword passed to `plot` below.
    linewidth = kwargs.pop('linewidth', kwargs.pop('lw', 1.25))
    # Figure setup
    fig, ax = plt.subplots()
    sigma_major_locator = ticker.MultipleLocator(spectral_step)
    sigma_minor_locator = ticker.MultipleLocator(spectral_step / 5)
    spectrum_formatter = ticker.FormatStrFormatter('%1.1e')
    ax.xaxis.set_major_locator(sigma_major_locator)
    ax.xaxis.set_minor_locator(sigma_minor_locator)
    ax.set_xlabel('Wavenumber / cm$^{-1}$')
    ax.yaxis.set_major_formatter(spectrum_formatter)
    ax.set_ylabel('Intensity / AU')
    # Actual plotting
    lines = ax.plot(sigma, spectra.T, linewidth=linewidth, **kwargs)
    if legend and len(legend) == len(lines):
        ax.legend(lines, legend)
    elif legend is None:
        pass
    else:
        raise ValueError('Legend must have as many items as components.')
    # Postprocessing
    ax.set_xlim((sigma.min(), sigma.max()))
    if clear is True:
        ax.yaxis.set_ticks([])
    return fig, ax


# Alias: components are plotted exactly like spectra.
component_plot = raman_plot

def multi_mix_plot(mix, list_of_positions, spatial_step=500, **kwargs):
    """
    Draws a figure with one subplot per array of positions. Each subplot
    shows the mixing values/proportions as a function of the spatial
    position; the lines of `mix` are consumed in order, one batch per
    subplot.

    Parameters
    ----------
    mix : numpy.ndarray, shape n_samples * n_components
        Array containing the mixing values or proportions
    list_of_positions : list of arrays
        List of the arrays containing the positions of the points where the
        mixing proportions are recorded.
    spatial_step : int, optional
        Spacing between major ticks. Default value is 500 µm

    Returns
    -------
    fig : matplotlib Figure object
    axes : list of matplotlib Axes objects
        Each axis is one of the subplots.

    Examples
    --------
    >>> extracted_mix.shape
    (170, 4)
    >>> sequence1_pos.shape
    (87,)
    >>> sequence2_pos.shape
    (35,)
    >>> sequence3_pos.shape
    (48,)
    >>> list_of_positions = [sequence1_pos, sequence2_pos, sequence3_pos]
    >>> fig, axes = multi_mix_plot(extracted_mix, list_of_positions)
    >>> fig.show()

    Notes
    -----
    kwargs are passed to the `matplotlib.pyplot.plot` calls. `alpha`,
    `marker`, `mec` and `linestyle` default to .5, 'o', 'none' and 'none'.
    """
    # Styling defaults, overridable through kwargs
    plot_style = {'alpha': kwargs.pop('alpha', .5),
                  'marker': kwargs.pop('marker', 'o'),
                  'mec': kwargs.pop('mec', 'none'),
                  'linestyle': kwargs.pop('linestyle', 'none')}
    plot_style.update(kwargs)
    # Each subplot is as wide as the distance covered by its positions
    widths = [positions[-1] - positions[0] for positions in list_of_positions]
    n_panels = len(widths)
    fig = plt.figure()
    grid = mpl.gridspec.GridSpec(1, n_panels, width_ratios=widths,
                                 wspace=.05)
    axes = []
    first_row = 0               # index of the first line of `mix` to plot
    for index, positions in enumerate(list_of_positions):
        last_row = first_row + len(positions)
        ax = fig.add_subplot(grid[index])
        axes.append(ax)
        ax.xaxis.set_major_locator(mpl.ticker.MultipleLocator(spatial_step))
        ax.xaxis.set_minor_locator(
            mpl.ticker.MultipleLocator(spatial_step / 5))
        ax.set_xlim(positions.min(), positions.max())
        ax.plot(positions, mix[first_row:last_row], **plot_style)
        if index == 0:
            # Only the leftmost subplot keeps a graduated, labeled y axis
            ax.set_ylabel('Mixing proportions / AU')
            hidden_directions = ['x']
        else:
            hidden_directions = ['x', 'y']
        _make_ticklabels_invisible(ax, hidden_directions)
        first_row = last_row
    grid.tight_layout(fig, pad=.5)
    grid.update(wspace=.05)
    return fig, axes


def _skip_first_color(ax):
    """
    Advance the line color cycle of `ax` by one color.

    NOTE(review): this relies on private matplotlib attributes whose names
    presumably differ between releases; the candidates are tried in turn --
    confirm against the installed matplotlib version. If none of them is
    found, an empty line is plotted, which also uses up one color.
    """
    line_maker = ax._get_lines
    if hasattr(line_maker, 'get_next_color'):
        line_maker.get_next_color()
        return
    for attribute in ('prop_cycler', 'color_cycle'):
        cycle = getattr(line_maker, attribute, None)
        if cycle is not None:
            # Builtin next() works on Python 2 and 3, unlike `.next()`
            next(cycle)
            return
    ax.plot([], [])


def empa_plot(empa_mix, list_of_positions, spatial_step=500, legend_pos=-1,
              legend=None, right_label=None, **kwargs):
    """
    Plot the composition profiles determined by electron microprobe analysis
    (EMPA) on as many subplots as there are arrays of positions. In each
    subplot, the major oxide is drawn against the left-hand y axis and the
    minor oxides against a secondary, right-hand y axis.

    Parameters
    ----------
    empa_mix : numpy.ndarray, shape n_oxides * n_samples
        The microprobe profiles, first line being the major oxide (silica).
    list_of_positions : list of arrays
        List of the arrays containing the positions of the points where the
        mixing proportions are recorded.
    spatial_step : int, optional
        Spacing between major ticks. Default value is 500 µm
    legend_pos : int
        Index of the subplot where the legend should be written.
    legend : list of strings, optional
        Labels for the minor oxides. The legend is only drawn if there is
        exactly one label per minor oxide.
    right_label : string, optional
        Label for the rightmost right-hand yaxis.

    Returns
    -------
    fig : matplotlib Figure object
    axes : list of (Axes, Axes) tuples
        One tuple per subplot: the main axes and its secondary (twin) axes.

    Raises
    ------
    ValueError
        If `legend_pos` is not a valid index of `list_of_positions`.

    Examples
    --------
    >>> empa_mix.shape
    (3, 170)
    >>> sequence1_pos.shape
    (87,)
    >>> sequence2_pos.shape
    (35,)
    >>> sequence3_pos.shape
    (48,)
    >>> list_of_positions = [sequence1_pos, sequence2_pos, sequence3_pos]
    >>> legend = ['CaO', 'MgO']
    >>> fig, axes = empa_plot(empa_mix, list_of_positions, legend=legend)
    >>> fig.show()


    Notes
    -----
    kwargs are passed to `matplotlib.pyplot.plot` calls.
    """
    if not - len(list_of_positions) <= legend_pos < len(list_of_positions):
        raise ValueError(('Specified legend position is outside the subplots'
                          ' range'))
    # kwarg processing
    alpha = kwargs.pop('alpha', .5)
    marker = kwargs.pop('marker', 'o')
    mec = kwargs.pop('mec', 'none')
    linestyle = kwargs.pop('linestyle', 'none')
    # Figure setup
    width_ratios = [pos[-1] - pos[0] for pos in list_of_positions]
    num_sections = len(width_ratios)
    fig = plt.figure()
    axes = []
    gs = mpl.gridspec.GridSpec(1, num_sections, width_ratios=width_ratios,
                               wspace=.05)
    # Actual plotting
    spatial_shift = 0
    empa_mix = empa_mix.T       # now n_samples * n_oxides
    # Common limits of the secondary axes, computed on all the minor oxides
    axxmin, axxmax = empa_mix[:, 1:].min(), empa_mix[:, 1:].max()
    for i, (grid_pos, pos) in enumerate(zip(gs, list_of_positions)):
        is_first = i == 0
        is_last = i == num_sections - 1
        if is_first:
            # Leftmost subplot: main axis for the main oxide, keep the
            # labels on the left-hand yaxis
            ax = fig.add_subplot(grid_pos)
            ax.set_ylabel(r'SiO$_2$ content / wt\%')
            _make_ticklabels_invisible(ax, ['x'])
        else:
            # Keep the alignment with the leftmost left yaxis; don't display
            # any label, there's no room
            ax = fig.add_subplot(grid_pos, sharey=axes[0][0])
            _make_ticklabels_invisible(ax, ['x', 'y'])
        axx = ax.twinx()    # Secondary axis for minor oxides
        # Only the rightmost subplot (which is also the leftmost one when
        # there is a single subplot) keeps the labels of the right-hand yaxis
        if is_last:
            _make_ticklabels_invisible(axx, ['x'])
        else:
            _make_ticklabels_invisible(axx, ['x', 'y'])
        # Manually skip the first color: it's used by ax
        _skip_first_color(axx)
        axes += [(ax, axx)]            # save the axes to return them
        ax_major_locator = mpl.ticker.MultipleLocator(spatial_step)
        ax_minor_locator = mpl.ticker.MultipleLocator(spatial_step / 5)
        ax.xaxis.set_major_locator(ax_major_locator)
        ax.xaxis.set_minor_locator(ax_minor_locator)
        ax.plot(pos, empa_mix[spatial_shift:spatial_shift + len(pos), 0],
                marker=marker, mec=mec, alpha=alpha,
                linestyle=linestyle, **kwargs)
        ax.set_xlim(pos.min(), pos.max())
        l2 = axx.plot(pos, empa_mix[spatial_shift:spatial_shift + len(pos), 1:],
                      marker=marker, mec=mec, alpha=alpha,
                      linestyle=linestyle, **kwargs)
        axx.set_xlim(pos.min(), pos.max())
        axx.set_ylim((axxmin, axxmax))  # Manual alignment of the secondary
        # axis
        spatial_shift += len(pos)
    # The legend handles are the minor oxide lines of the last subplot
    if legend and (len(legend) == empa_mix.shape[1] - 1):
        axes[legend_pos][-1].legend(l2, legend, loc='best', numpoints=1)
    if right_label:
        axes[-1][-1].set_ylabel(right_label)
    gs.tight_layout(fig, pad=.5)
    gs.update(wspace=.05)
    return fig, axes


# def mix_plot(mix, proportions=True, ax=None, separations=None, **kwargs):
#     if ax is None:
#         _, ax = ppl.subplots()
#     if proportions is False:
#         mix = mix.copy()/mix.sum(-1)[:, np.newaxis]
#     mec = kwargs.pop('mec', 'none')
#     ppl.plot(ax, mix, 'o', mec=mec, alpha=.5, **kwargs)
#     if separations is not None:
#         for s in separations[:-1]:
#             ax.axvline(s, color='#262626')
#         ax.set_xlim((0, separations[-1]))
#     ax.set_ylabel('Mixing proportion / AU')
#     ax.set_xlabel('Sprectrum number')
#     ax.grid('on')

def correlation_plot(correlation_matrix, delimitation, first_part_legend=None,
                     second_part_legend=None, printout=True):
    """
    Plot a figure of colored tiles representing the correlation coefficient
    between two sets of observations, such as EMPA profiles and Raman mixing
    profiles.

    Only the block ``correlation_matrix[:delimitation, delimitation:]`` is
    drawn: one row of tiles per observation of the first group and one column
    per observation of the second group. The color scale is fixed to [-1, 1].

    Parameters
    ----------
    correlation_matrix : numpy.ndarray
        Two-dimensional array of correlation coefficients between two sets of
        observations.
    delimitation : int
        Number of observations in the first group.
    first_part_legend : list of strings, optional
        Labels for the first set of observations, used as y tick labels.
    second_part_legend : list of strings, optional
        Labels for the second set of observations, used as x tick labels.
    printout : bool, default True
        If True, the plotted block of the correlation matrix is printed with
        its rows in reversed order.

    Returns
    -------
    fig : matplotlib figure
        The figure holding the tiles and the colorbar.
    ax : matplotlib axes
        The axes on which the tiles are drawn.

    Raises
    ------
    ValueError
        If a legend is given and its length does not match the size of the
        group it labels.

    Examples
    --------
    >>> empa_profiles.shape
    (3, 170)
    >>> extracted_mix.shape
    (170, 4)
    >>> observations = np.column_stack((empa_profiles.T, extracted_mix))
    >>> delimitation = 3
    >>> # Each column is a variable, hence `rowvar=False`
    >>> correlation_data = np.corrcoef(observations, rowvar=False)
    >>> first_legend = ['SiO2', 'CaO', 'MgO']
    >>> fig, ax = correlation_plot(correlation_data, delimitation,
    ...                            first_part_legend=first_legend,
    ...                            printout=False)
    >>> fig.show()
    """
    n_species, _ = correlation_matrix.shape
    n_second = n_species - delimitation
    # Validate the legends before creating any figure
    if first_part_legend and len(first_part_legend) != delimitation:
        error_message = ('There must be exactly `delimitation` items in '
                         '`first_part_legend`.')
        raise ValueError(error_message)
    if second_part_legend and len(second_part_legend) != n_second:
        error_message = 'Wrong number of  items in `second_part_legend`.'
        raise ValueError(error_message)
    # Cross-correlation block: first group along y, second group along x
    cross_block = correlation_matrix[:delimitation, delimitation:]
    fig, ax = plt.subplots()
    im = ax.pcolormesh(cross_block, cmap=blue_red, vmin=-1, vmax=1)
    # Ticks are placed at the center of each tile
    ax.set_yticks(np.arange(.5, .5 + delimitation))
    if first_part_legend:
        ax.set_yticklabels(first_part_legend)
    ax.set_xticks(np.arange(.5, .5 + n_second))
    if second_part_legend:
        ax.set_xticklabels(second_part_legend)
    # Colorbar in its own axes, to the right of the tiles
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", "5%", pad="3%")
    cb = plt.colorbar(im, cax=cax)
    cb.set_ticks(np.arange(-1, 1.4, .4))
    # Rasterizing the colorbar is needed only once, not once per tick label
    cb.solids.set_rasterized(True)
    for t in cb.ax.get_yticklabels():
        t.set_horizontalalignment('right')
        t.set_x(3.5)
    fig.tight_layout(pad=.5)
    # Finally, if desired, print the values in the same order as the graph
    if printout:
        with _printoptions(precision=3, suppress=False):
            print(cross_block[::-1])
    return fig, ax


def downsample(high_sigma, low_sigma, spectra):
    """
    Resample spectra from a finely sampled spectral range onto a coarser one.

    When spectra are recorded on different spectrometers, or on the same
    spectrometer but with a different resolution, or with different spectral
    ranges, downsampling is needed to realign data.
    Each spectrum is interpolated over the highly sampled spectral range
    `high_sigma` (with `scipy.interpolate.interp1d` and its default settings)
    and the interpolation is evaluated on the less sampled spectral range
    `low_sigma`.

    Parameters
    ----------
    high_sigma : numpy.ndarray, n_high
        Wavenumbers at which `spectra` are sampled.
    low_sigma : numpy.ndarray, n_low
        Wavenumbers at which the spectra must be evaluated.
    spectra : numpy.ndarray, (n_spectra *) n_high
        One spectrum per row. A single one-dimensional spectrum is also
        accepted and treated as a series of one spectrum.

    Returns
    -------
    numpy.ndarray, n_spectra * n_low
        The resampled spectra, one per row.

    Raises
    ------
    ValueError
        Raised by `interp1d` when `low_sigma` contains values outside the
        range covered by `high_sigma`.
    """
    # A lone 1-D spectrum would otherwise be iterated point by point
    if isinstance(spectra, np.ndarray) and spectra.ndim == 1:
        spectra = spectra[np.newaxis, :]
    return np.vstack([interpolate.interp1d(high_sigma, spectrum)(low_sigma)
                      for spectrum in spectra])


def view_pca_denoising(sigma, raw_spectra, denoised_spectra):
    """
    Overlay raw and denoised spectra on a single Raman plot so that the
    quality of the denoising can be assessed visually.

    Parameters
    ----------
    sigma : numpy.ndarray, n_features
        Wavenumbers at which the intensities are recorded.
    raw_spectra : numpy.ndarray, (n_spectra *) n_features
        Original spectra, drawn first with thin dark grey lines.
    denoised_spectra : numpy.ndarray, (n_spectra *) n_features
        Denoised spectra, drawn on top with thicker lines.

    Returns
    -------
    fig, ax
        The figure and axes returned by `raman_plot`, with the denoised
        spectra and a title added.

    Notes
    -----
    The spectra must have as many points as there are in `sigma`.
    """
    raw_style = {'color': '#333333', 'lw': .75}
    figure, axis = raman_plot(sigma, raw_spectra, **raw_style)
    # Transposed so that each spectrum is plotted as one line
    axis.plot(sigma, denoised_spectra.T, lw=1.5)
    axis.set_title('Original versus preprocessed spectra')
    return figure, axis


###############################################################################
#                    PART 3 : Simulated spectra                               #
###############################################################################

def generate_binary(n_samples, n_features=800, min_content=.1, nts=.01):
    """
    Build artificial spectra as linear mixtures of two components, with
    optional additive Gaussian noise.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate
    n_features : int, default 800
        Number of points in a spectrum
    min_content : float, default .1
        Minimum amount of one component. The first mixing coefficient runs
        linearly from `min_content` to `1 - min_content`.
    nts : float, default .01
        Noise to signal ratio, used as the standard deviation of the noise.
        Must be >= 0. No noise is added unless `nts` is strictly positive.

    Returns
    -------
    sigma : numpy.ndarray, n_features
        Wavenumbers, evenly spaced between 301 and 1306.
    spectra : numpy.ndarray, n_samples * n_features
        The mixed (and possibly noisy) spectra.
    original_components : numpy.ndarray, 2 * n_features
        The two components, each a sum of two Gaussian peaks.
    mix : numpy.ndarray, n_samples * 2
        Mixing coefficients; each row sums to one.

    Examples
    --------
    >>> sigma, spectra, components, mix = generate_binary(55)

    See also
    --------
    generate_ternary : generates artificial spectra by mixing three components
    according to a ternary mesh and adding Gaussian noise.
    """
    sigma = np.linspace(301., 1306., n_features)

    def _peak(center, width):
        # Gaussian peak of unit height evaluated on `sigma`
        return np.exp(-(sigma-center)**2 / (2*width**2))

    first = _peak(450, 50) + _peak(900, 90) / 3
    second = _peak(510, 50) / 2 + _peak(750, 70)
    original_components = np.vstack((first, second))
    # Mixing coefficients: a linear ramp and its complement to one
    rate = np.linspace(min_content, 1 - min_content, num=n_samples)
    mix = np.column_stack((rate, 1 - rate))
    shape = (n_samples, n_features)
    noise = np.random.normal(scale=nts, size=shape) if nts > 0 \
        else np.zeros(shape)
    spectra = mix.dot(original_components) + noise
    return sigma, spectra, original_components, mix


def generate_ternary(n_samples, n_features=800, min_content=.1, nts=.01):
    """
    Generates artificial spectra by mixing three components according
    to a ternary mesh and adding Gaussian noise.

    Parameters
    ----------
    n_samples : int
        Sampling parameter handed to `ternary.mesh`. The number of spectra
        actually generated is the number of rows of the resulting mixing
        matrix.
    n_features : int, default 800
        Number of points in a spectrum
    min_content : float, default .1
        Minimum amount of one component.
    nts : float, default .01
        Noise to signal ratio, used as the standard deviation of the noise.
        Must be >= 0. As in `generate_binary`, no noise is added unless `nts`
        is strictly positive.

    Returns
    -------
    sigma : numpy.ndarray, n_features
        Wavenumbers, evenly spaced between 301 and 1306.
    spectra : numpy.ndarray, n_spectra * n_features
        The mixed (and possibly noisy) spectra.
    original_components : numpy.ndarray, 3 * n_features
        The three components used to build the spectra.
    mix : numpy.ndarray
        Mixing coefficients in ternary coordinates, one row per spectrum.

    Examples
    --------
    >>> sigma, spectra, components, mix = generate_ternary(45)

    See also
    --------
    generate_binary : generates artificial spectra by linearly mixing two
    components and adding Gaussian noise
    """
    sigma = np.linspace(301., 1306., n_features)
    a = np.exp(-(sigma-450)**2/(2*50**2))
    b = np.exp(-(sigma-900)**2/(2*90**2))
    c = np.exp(-(sigma-510)**2/(2*50**2))
    d = np.exp(-(sigma-750)**2/(2*70**2))
    e = np.exp(-(sigma-1100)**2/(2*120**2))
    original_components = np.vstack((a + b / 3, c / 2 + d, e))
    # Mesh in cartesian coordinates, then converted to ternary coordinates
    mix = ternary.mesh(n_samples, min=min_content, max=1-min_content)
    mix = ternary.cartesian_to_ternary(*np.hsplit(mix, 2))
    noise_shape = (mix.shape[0], n_features)
    # Same convention as `generate_binary`: noise only for a positive ratio
    if nts > 0:
        noise = np.random.normal(scale=nts, size=noise_shape)
    else:
        noise = np.zeros(noise_shape)
    spectra = np.dot(mix, original_components) + noise
    return sigma, spectra, original_components, mix


def _aio(n_samples, n_features=2046, arity=2, clean_mode='area',
         pca=False, beta=1e-5):
    """
    All-in-one demonstration: simulate spectra, deconvolute them and plot
    the comparison between the true and the extracted quantities.

    Parameters
    ----------
    n_samples : int
        Passed to `generate_binary` or `generate_ternary`.
    n_features : int, default 2046
        Number of points in a simulated spectrum.
    arity : {2, 3}, default 2
        Number of components mixed to simulate the spectra.
    clean_mode : str, default 'area'
        Passed as `mode` to `clean`, whether or not PCA denoising is used.
    pca : bool, default False
        If true, the spectra denoised by `pca_step` are the ones that are
        cleaned and deconvoluted, and the denoising is plotted as well.
        Otherwise `pca_step` only provides the number of components.
    beta : float, default 1e-5
        Passed to `nmf_step`.

    Raises
    ------
    ValueError
        If `arity` is neither 2 nor 3.
    """
    # Generate the data by mixing 2 or 3 components
    if arity == 2:
        generate = generate_binary
    elif arity == 3:
        generate = generate_ternary
    else:
        raise ValueError('Binary and ternary sample are the only ones handled')
    x, raw_spectra, original_components, original_mix = generate(
        n_samples, n_features=n_features)
    if pca:
        n_components, denoised_spectra = pca_step(raw_spectra, graph=True)
        spectra_to_clean = denoised_spectra
    else:
        n_components, _ = pca_step(raw_spectra)
        spectra_to_clean = raw_spectra
    # `clean_mode` applies in both cases (it used to be ignored without PCA)
    clean_spectra = clean(x, spectra_to_clean, mode=clean_mode,
                          long_cor=False)
    components, mix, _ = nmf_step(clean_spectra, beta=beta,
                                  n_components=n_components)
    # Reordering the components: each extracted component is matched to the
    # original one it has the largest dot product with, then the extracted
    # components are sorted by the index of their match
    proximities = np.array([[np.dot(v, sp) for sp in original_components]
                            for v in components])
    order = np.argsort(np.argmax(proximities, axis=1))
    components = components[order, :]
    mix = mix[:, order]
    # Enough with the data crunching, let's plot!
    # First, a sample of the spectra
    _sample_spectra(x, raw_spectra)
    # Then, the components
    compare_components(x, original_components, components)
    # Then, the mixing proportions
    compare_mixings(original_mix, mix)
    # If PCA has been used as preprocessing step, let's view the result
    if pca:
        view_pca_denoising(x, raw_spectra, denoised_spectra)
    if arity == 3:
        compare_ternary_mixings(original_mix, mix)
    plt.show()


def compare_components(sigma, original_components, deconvoluted_components,
                       already_normalized=False):
    """
    Draw the true and the extracted components on one Raman plot so that the
    quality of the deconvolution can be judged visually.

    Parameters
    ----------
    sigma : numpy.ndarray, n_features
        Wavenumbers at which the intensities are recorded.
    original_components : numpy.ndarray, n_components * n_features
        True components used to generate the spectra.
    deconvoluted_components : numpy.ndarray, n_components * n_features
        Components extracted by deconvoluting the series of spectra.
    already_normalized : bool, default False
        If False, every original and deconvoluted component is divided by
        its own maximum before being plotted. If True, both arrays are
        plotted as given.

    Returns
    -------
    fig, ax
        The figure and axes returned by `raman_plot`, with the deconvoluted
        components, a title and a legend added.

    See also
    --------
    compare_mixings : compares the original mixing matrix to the deconvoluted
    one.
    """
    def _scaled_to_max(components):
        # Divide each row by its own maximum
        return components / components.max(1)[:, np.newaxis]

    if already_normalized:
        extracted = deconvoluted_components
        reference = original_components
    else:
        extracted = _scaled_to_max(deconvoluted_components)
        reference = _scaled_to_max(original_components)
    reference_style = {'color': '#333333', 'lw': .75}
    figure, axis = raman_plot(sigma, reference, **reference_style)
    axis.plot(sigma, extracted.T, label='Deconvoluted components')
    axis.set_title('Original versus deconvoluted components')
    axis.legend(loc='best')
    return figure, axis


def compare_mixings(original_mix, deconvoluted_mix, already_normalized=False):
    """
    Draw the true and the extracted mixing matrices on one plot so that the
    quality of the deconvolution can be judged visually.

    Parameters
    ----------
    original_mix : numpy.ndarray, n_samples * n_components
        True mixing values used to generate the spectra. Plotted as given.
    deconvoluted_mix : numpy.ndarray, n_samples * n_components
        Mixing values extracted by deconvoluting the series of spectra.
    already_normalized : bool, default False
        If False, each row of `deconvoluted_mix` is divided by its sum
        before being plotted.

    Returns
    -------
    fig, ax
        The newly created figure and its axes.

    See also
    --------
    compare_ternary_mixings : compares mixings in ternary coordinates
    compare_components : compares the original components to the deconvoluted
    ones.
    """
    extracted = deconvoluted_mix
    if not already_normalized:
        # Turn the mixing values of each sample into proportions
        row_totals = deconvoluted_mix.sum(-1)
        extracted = deconvoluted_mix / row_totals[:, np.newaxis]
    figure, axis = plt.subplots()
    axis.plot(original_mix, 'd', label='Original mixing', color='#333333')
    axis.plot(extracted, 'o', mec='none', label='Deconvoluted mixing')
    axis.set_xlabel('Spectrum number [ ]')
    axis.set_ylabel('Mixing proportion [ ]')
    axis.set_title('Original versus deconvoluted mixing proportions')
    axis.legend(loc='best')
    return figure, axis


def _sample_spectra(x, spectra, sampling_rate=8.1):
    """
    Plot a regularly spaced subset of `spectra` on a new figure.

    Parameters
    ----------
    x : numpy.ndarray, n_features
        Abscissa shared by all the spectra.
    spectra : numpy.ndarray, n_spectra * n_features
        Series of spectra to sample from.
    sampling_rate : float, default 8.1
        One spectrum out of ``round(n_spectra / sampling_rate)`` is plotted,
        and at least every spectrum when the series is short.

    Returns
    -------
    fig
        The newly created figure.
    """
    # Never let the step drop to 0: `spectra[::0]` raises a ValueError
    step = max(1, int(round(spectra.shape[0] / float(sampling_rate))))
    fig, ax = plt.subplots()
    for i, line in enumerate(spectra[::step]):
        # Each line is labelled with its index in the original series
        ax.plot(x, line, label='%3d' % (i * step))
    ax.set_xlim(x[0], x[-1])
    # `bottom` replaces the `ymin` keyword, removed from recent matplotlib
    ax.set_ylim(bottom=0)
    ax.set_xlabel('Wavenumber [cm-1]')
    ax.set_ylabel('Intensity [A.U.]')
    ax.set_title('Sampling of the constructed spectra')
    ax.legend(loc='best', ncol=2)
    return fig


def compare_ternary_mixings(original_mix, mix):
    """
    Scatter the true and the extracted mixing matrices on a ternary frame so
    that the quality of the deconvolution can be judged visually.

    Parameters
    ----------
    original_mix : numpy.ndarray, n_samples * 3
        True mixing values used to generate the spectra, drawn in dark grey.
    mix : numpy.ndarray, n_samples * 3
        Mixing values extracted by deconvoluting the series of spectra, drawn
        with the first color of `color_set`.

    Returns
    -------
    fig, ax
        The figure created by `ternary.ternary_frame` and its first axes.

    See also
    --------
    compare_mixings : compares mixings on a standard plot
    compare_components : compares the original components to the deconvoluted
    ones.
    """
    figure = ternary.ternary_frame()
    axis = figure.get_axes()[0]
    # One column per component, converted to cartesian coordinates
    true_points = ternary.ternary_to_cartesian(*np.hsplit(original_mix, 3))
    axis.scatter(*true_points, c='#333333')
    found_points = ternary.ternary_to_cartesian(*np.hsplit(mix, 3))
    axis.scatter(*found_points, c=color_set[0], edgecolor='none')
    return figure, axis


if __name__ == '__main__':
    # Demonstration: simulate a binary series, deconvolute it and compare
    # the extracted components and mixings with the true ones
    print(__doc__)
    n_samples = 150
    simulated = generate_binary(n_samples, n_features=1500, min_content=.2,
                                nts=.03)
    sigma, spectra, components, mix = simulated
    # Denoise, clean, then deconvolute
    n_components, denoised_spectra = pca_step(spectra)
    cleaned_spectra = clean(sigma, denoised_spectra, long_cor=False)
    nmf_result = nmf_step(cleaned_spectra, n_components, beta=1e-5)
    extracted_components, extracted_mix, err = nmf_result
    compare_mixings(mix, extracted_mix)
    compare_components(sigma, components, extracted_components)
    ag = area_graph_generator(sigma, cleaned_spectra, extracted_mix,
                              extracted_components)
    # Area graphs for five regularly spaced spectra of the series
    stride = int(n_samples / 5)
    for i in range(0, n_samples, stride):
        ag(i)