\documentclass{article} % For LaTeX2e
\usepackage{iclr2019,times}
\usepackage{amssymb}
\usepackage{float}
\usepackage{hyperref}
\usepackage{url}
\usepackage{bm}
\usepackage{graphicx,array} \graphicspath{{figures/}}
\usepackage{amsmath}
\usepackage{color}
\usepackage{caption}
\usepackage{subcaption}
% % Optional math commands from https://github.com/goodfeli/dlbook_notation.
% \input{math_commands.tex}
\newtheorem{theorem}{Theorem}
\newcommand{\figref}[1]{figure~\ref{fig:#1}}
\newcommand{\Figref}[1]{Figure~\ref{fig:#1}}
\newcommand{\tabref}[1]{Table~\ref{tab:#1}}
\newcommand{\Tabref}[1]{table~\ref{tab:#1}}
\newcommand{\secref}[1]{Section~\ref{sec:#1}}
\newcommand{\Secref}[1]{section~\ref{sec:#1}}
%\newcommand{\secref}[1]{\S\ref{sec:#1}}
\newcommand{\eqnref}[1]{(\ref{eqn:#1})}
\renewcommand{\b}[1]{{\bm{#1}}} % bold symbol
% MATH SYMBOLS
\newcommand{\1}{\b{1}} % all-ones vector
\newcommand{\0}{\b{0}} % all-zero vector
\newcommand{\g}[1]{\b{#1}}
\newcommand{\G}{\mathcal{G}}
\newcommand{\V}{\mathcal{V}}
\newcommand{\E}{\mathcal{E}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\B}{\mathcal{B}}
%\newcommand{\circle}{\mathcal{C}^1}
\newcommand{\torus}{\mathcal{C}^2}
\newcommand{\sphere}{\mathcal{S}^2}
\renewcommand{\L}{\b{L}}
\newcommand{\tL}{\tilde{\L}}
\newcommand{\W}{\b{W}}
\newcommand{\I}{\b{I}}
\newcommand{\D}{\b{D}}
\newcommand{\U}{\b{U}}
\newcommand{\x}{\b{x}}
\newcommand{\X}{\b{X}}
\newcommand{\y}{\b{y}}
\newcommand{\Y}{\b{Y}}
\newcommand{\bu}{\b{u}}
\newcommand{\f}{\b{f}}
\newcommand{\trans}{^\intercal}
\newcommand{\R}{\mathbb{R}}
\newcommand{\bLambda}{\b{\Lambda}}
\newcommand{\blambda}{\b{\lambda}}
\newcommand{\bO}{\mathcal{O}}
\newcommand{\T}{\mathcal{T}}
\DeclareMathOperator*{\esp}{E}
\DeclareMathOperator*{\var}{Var}
\DeclareMathOperator*{\vect}{vec}
\DeclareMathOperator*{\argmin}{arg \, min}
%\title{DeepSphere: a graph-based spherical CNN with approximate equivariance}
% Use towards to indicate it's a work-in-progress. Keep the above for the final paper.
\title{DeepSphere: towards an equivariant graph-based spherical CNN}
% Keywords: equivariance, spherical CNN, graph NN
%\title{DeepSphere: an equivariant spherical CNN based on a graph NN}
%\title{DeepSphere: a graph-based equivariant spherical CNN}
% Exploiting symmetries with Graph Neural Networks
% Another reason to use Graph Neural Networks: exploiting symmetries
% Another case for Graph Neural Networks: exploiting symmetries
% The case for using graphs to reason / compute / solve continuous problems.
\author{Michaël Defferrard \\
Institute of Electrical Engineering \\
EPFL, Lausanne, Switzerland \\
\texttt{[email protected]}
\And
Nathanaël Perraudin \\
Swiss Data Science Center (SDSC) \\
Zurich, Switzerland \\
\texttt{[email protected]}
\And
Tomasz Kacprzak \& Raphael Sgier \\
Institute for Particle Physics and Astrophysics, ETH Zurich, Switzerland \\
\texttt{\{tomaszk,rsgier\}@phys.ethz.ch} \\
}
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
\newcommand{\todo}[1]{{\color[rgb]{.6,.1,.6}{#1}}}
\newcommand{\nati}[1]{{\color[rgb]{.1,.6,.1}{#1}}}
\iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}
\maketitle
\begin{abstract}
Spherical data is found in many applications.
By modeling the discretized sphere as a graph, we can accommodate non-uniformly distributed, partial, and changing samplings.
Moreover, graph convolutions are computationally more efficient than spherical convolutions.
% As equivariance is desired to exploit rotational symmetries, we present preliminary results on the rotation equivariance of the graph neural network introduced in \citet{defferrard2016convolutional}.
As equivariance is desired to exploit rotational symmetries, we discuss how to approach rotation equivariance using the graph neural network introduced in \citet{defferrard2016convolutional}.
Experiments show good performance on rotation-invariant learning problems.
Code and examples are available at \url{https://github.com/SwissDataScienceCenter/DeepSphere}.
\end{abstract}
\begin{figure}[h]
\includegraphics[height=0.25\linewidth]{example_cosmo_cmb}
\hfill
\includegraphics[height=0.25\linewidth]{example_ghcn_daily_tmax}
\hfill
\includegraphics[height=0.25\linewidth]{example_brain_meg}
\hfill
\includegraphics[height=0.25\linewidth]{example_ghcn_graph}
\caption{
Examples of intrinsically spherical data: (left) the cosmic microwave background (CMB) temperature from \citet{planck2015overview},
%with galactic plane masked
(middle) daily maximum temperature from the Global Historical Climatology Network (GHCN),\protect\footnotemark (right) brain activity recorded through magnetoencephalography (MEG).\protect\footnotemark
For those examples, a rigid full-sphere pixelization is not ideal: the Milky Way's galactic plane masks observations, brain activity is measured on the scalp only, and the positions and density of weather stations are arbitrary and change over time.
Graphs can faithfully and efficiently represent sampled spherical data by placing vertices where data has been measured.
%\todo{something from climate, weather?}
%\todo{compare ideal full-spheres (like SHREC-17 projection from Esteves, 360 from Coors) to real-world measurements?}
%\todo{horizontal colorbar for brain}
}
\label{fig:examples}
\end{figure}
\footnotetext[1]{\scriptsize\url{https://www.ncdc.noaa.gov/ghcn-daily-description}}
\footnotetext[2]{\scriptsize\url{https://martinos.org/mne/stable/auto_tutorials/plot_visualize_evoked.html}}
\section{Introduction}
% \todo{sphere only, or start more general?}
% 1. general
% 2. sphere
% 3. equivariance to symmetries (we might not necessary want that)
% \todo{What is the problem we want to solve? Identify it. Our solution: flexible (any sampling) and computational efficient.}
% \todo{Related work: mention interest in spherical CNNs for various applications, importance of equivariance and invariance, previous exploratory work with graphs from Renata and ourselves.}
Graphs have long been used as models for discretized manifolds: for example to smooth meshes \citep{taubin1996meshsmoothing}, reduce dimensionality \citep{belkin2003laplacian}, and, more recently, to process signals \citep{shuman2013gsp}.
% In most works, it is assumed that the true data manifold is unknown to the processing or learning method, and is only observed through discrete samples.
% In this contribution, we want to make a case for using graphs as a support of computation on known manifolds.\footnote{For example, the plane where images live is known, while the true surface of a point cloud is unknown.}
%From now on, we focus on the sphere.
Alongside
%$d$-dimensional
Euclidean spaces, the sphere is one of the most commonly encountered manifolds: it is notably used to represent omnidirectional images, global planetary data (in meteorology, climatology, geology, geophysics, etc.), cosmological observations, and brain activity measured on the scalp (see \figref{examples}).
Spherical convolutional neural networks (CNNs) have been developed to work with some of these modalities \citep{cohen2018sphericalcnn, perraudin2018deepsphere, khasanova2017graphomni, su2017sphericalcnn, coors2018sphericalcnn,
% kondor2018sphericalcnn, => quite specific, same applications as Cohen
jiang2019sphericalcnn}.
% why graphs for the sphere
Spherical data can be seen as a continuous function
%$f: S^2 \rightarrow \R^d$
that is sampled at discrete locations.
%is sampled at convenient points, given by a chosen pixelization.
As it is impossible to construct a regular discretization of the sphere, there is no perfect spherical sampling.
Schemes have been engineered for particular applications and come with trade-offs \citep{gorski2005healpix,glesp}.
%emphasizing one or more features at the cost of others.
% For example, the equirectangular (a.k.a.\ equiangular or geographic) grid, used in most spherical CNNs, is most similar to a 2D grid.
% HEALPix features equal-area and hierarchically organized pixels, and has a fast spherical harmonic transform (SHT) \citep{healpix}.
% GLESP features a fast and exact
% %(to machine precision)
% SHT \citep{glesp}.
% sympix?, cubed-sphere? => no space
%\todo{maybe have a table, or a list of desired properties from a sampling} => no space
However, while sampling locations can be precisely controlled in some cases (like the CMOS sensors of an omni-directional camera), sensors might in general be non-uniformly distributed, cover only part of the sphere, and move (see \figref{examples}).
Modeling the sampled sphere as a discrete graph has the potential to represent the data faithfully and efficiently by placing vertices where data has been measured: there is no need to handle missing data or to interpolate to some predefined sampling, and no memory or precision is wasted on over- or under-sampling.
Graph-based spherical CNNs have been proposed in \citet{khasanova2017graphomni} and \citet{perraudin2018deepsphere}.
Moreover, graph convolutions have a low computational complexity of $\bO(N_{pix})$, where $N_{pix}$ is the number of pixels considered.
Methods based on proper spherical convolutions, like \citet{cohen2018sphericalcnn} and \citet{esteves2017sphericalcnn}, cost $\bO(N_{pix}^{3/2})$ operations.
%\todo{Most of our views are shaped by cosmology. What is going on in other fields (climate, weather)? Do they have their own samplings? Cosmo seems however the most advanced for SP on the sphere.}
%As the sphere $\mathcal{S}^2$ is a homogeneous space of the 3D rotation group $SO(3)$.
% Finally, to exploit the rotation symmetry of certain tasks involving spherical data, we would like a rotation equivariant formulation \citep{cohen2016equivariance}.
% speak about symmetries in the conclusion
Finally, just as classical 2D CNNs are equivariant to translations, we want spherical CNNs to be equivariant to 3D rotations \citep{cohen2016equivariance, kondor2018equivariance}.
% we'll nuance this view in the conclusion
A rotation-equivariant CNN detects patterns regardless of how they are rotated on the sphere: it exploits the rotational symmetries of the data through weight sharing.
Following this realization, spheres can be used to support data that does not intrinsically live on a sphere but has rotational symmetries \citep[for 3D objects and molecules]{cohen2018sphericalcnn, esteves2017sphericalcnn}.
In this contribution we present DeepSphere~\citep{perraudin2018deepsphere}, a spherical neural network leveraging graph convolution for its speed and flexibility.
Furthermore, we discuss the rotation equivariance of graph convolution on the sphere.
% \todo{From \citet{kondor2018equivariance}:
% A convolution implies the equivariance to the action of some group.
% Contrary as well: equivariance requires a convolution structure.
% }
% \begin{figure}
% % three figures: healpix, glesp, graph
% \label{fig:samplings}
% \end{figure}
\begin{figure}[t]
\centering
\includegraphics[width=0.9\linewidth]{figure_architecture_v3}
\caption{Example architecture.
Global tasks need a spatial summarization: the FCN variant is rotation invariant (and accepts inputs of varying sizes), while the CNN variant is not.
For dense tasks (when the output lives on the sphere, like segmentation), the network is rotation equivariant.
% % A convolutional layer is based on five operations: convolution, non-linearity, batch normalization, down-sampling, and pooling.
% % While most operations are agnostic to the data domain, the convolution and the down-sampling have to be adapted.
% In this paper, we propose first to model the sphere with a graph and to perform the convolution on the graph.
% Graphs are versatile data structures which can model any sampling, even irregular or partial.
% Second, we propose to exploit a hierarchical pixelization of the sphere for the down-sampling operation.
% It allows the NN to analyze the data at multiple scales while preserving the spatial localization of features.
% This figure shows a network that operates on the whole sphere.
% The process is the same when working with partial observations, except that the graph is only built for the region of interest.
}
\label{fig:architecture}
\end{figure}
\section{Method}
% \todo{Basics of GSP, how the graph is built, polynomial filters.}
Our method relies on the graph signal processing framework~\citep{shuman2013gsp}, which is grounded in the spectral properties of the graph Laplacian operator.
In particular, the Fourier transform is defined as the projection of the signal on the eigenvectors of the Laplacian, and the graph convolution as a multiplication in the Fourier domain.
It turns out that the graph convolution can be accelerated by being performed directly in the vertex domain
%, i.e., using the pixel values on the sphere
\citep{hammond2011wavelets}.
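For illustration, here is a minimal sketch of such vertex-domain filtering with the Chebyshev recurrence of \citet{hammond2011wavelets} (our own simplification with hypothetical names, not the reference implementation):
\begin{verbatim}
from scipy import sparse
from scipy.sparse.linalg import eigsh

def chebyshev_filter(L, x, theta):
    """Vertex-domain filtering y = sum_k theta[k] T_k(L_scaled) x.

    L: sparse graph Laplacian, x: graph signal, theta: K+1 >= 2 coefficients.
    """
    lmax = eigsh(L, k=1, return_eigenvectors=False)[0]
    # Rescale the spectrum from [0, lmax] to [-1, 1], the Chebyshev domain.
    L_scaled = (2 / lmax) * L - sparse.identity(L.shape[0])
    # Recurrence: T_0(y) = 1, T_1(y) = y, T_k(y) = 2y T_{k-1}(y) - T_{k-2}(y).
    x0, x1 = x, L_scaled @ x
    y = theta[0] * x0 + theta[1] * x1
    for k in range(2, len(theta)):
        x0, x1 = x1, 2 * (L_scaled @ x1) - x0
        y = y + theta[k] * x1
    return y  # O(K |E|) operations: no eigendecomposition is ever computed
\end{verbatim}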
DeepSphere leverages graph convolutions
% and hierarchical pooling
to achieve the following properties: (i) computational efficiency, (ii) adaptation to irregular sampling, and (iii) close to rotation equivariance.
An example architecture is shown in \figref{architecture}.
The main idea is to model the discretized sphere as a graph of connected pixels: the length of the shortest path between two pixels approximates the geodesic distance between them.
We use the graph CNN formulation introduced in \citet{defferrard2016convolutional}, and a pooling strategy that exploits a hierarchical pixelization of the sphere to analyze the data at multiple scales.
The current implementation of DeepSphere relies on the Hierarchical Equal Area isoLatitude Pixelization (HEALPix)~\citep{gorski2005healpix}, a popular sampling used in cosmology and astrophysics.
See \citet{perraudin2018deepsphere} for details.
DeepSphere is, however, easily adapted to other samplings, as only two elements depend on the sampling: (i) the choice of neighboring pixels when building the graph, and (ii) the choice of parent pixels when building the hierarchy (both are sketched below).
The flexibility of modeling the data domain with a graph allows one to easily model data that spans only a part of the sphere, or data that is not uniformly sampled.
Furthermore, using a $k$-nearest neighbors graph, the convolution costs $\bO(N_{pix})$ operations.
This is the lowest possible complexity for a convolution without approximations.
Nevertheless, while the graph framework offers great flexibility, its ability to faithfully represent the underlying sphere highly depends on the sampling locations and the graph construction.
This should not be neglected since the better the graph represents the sphere, the closer to rotation equivariant the graph convolution will be.
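For illustration, the two sampling-dependent elements can be sketched as follows (a simplification of ours assuming \texttt{healpy} and the NESTED ordering, in which the four children of a parent pixel are contiguous; the actual graph weighting follows \citet{perraudin2018deepsphere}):
\begin{verbatim}
import healpy as hp
import numpy as np
from scipy import sparse
from scipy.spatial import cKDTree

def healpix_graph_laplacian(nside, k=8):
    """Build a k-nearest-neighbor graph on the HEALPix pixel centers."""
    npix = hp.nside2npix(nside)  # N_pix = 12 * nside**2
    coords = np.stack(hp.pix2vec(nside, np.arange(npix), nest=True), axis=1)
    dist, idx = cKDTree(coords).query(coords, k=k + 1)
    dist, idx = dist[:, 1:], idx[:, 1:]  # drop the self-matches
    weights = np.exp(-dist**2 / np.mean(dist)**2)  # Gaussian kernel (a choice)
    rows = np.repeat(np.arange(npix), k)
    W = sparse.csr_matrix((weights.ravel(), (rows, idx.ravel())),
                          shape=(npix, npix))
    W = (W + W.T) / 2  # symmetrize
    degrees = np.asarray(W.sum(axis=1)).squeeze()
    return sparse.diags(degrees) - W  # combinatorial Laplacian

def healpix_pool(x):
    """Down-sample a NESTED map by one level: the 4 children of a parent
    pixel are contiguous, so pooling is a reshape (here, average pooling)."""
    return x.reshape(-1, 4).mean(axis=1)
\end{verbatim}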
% true for all manifolds: exploit their intrinsic symmetries (but how to define that?)
% DeepSphere is readily apt to solve four tasks: (i) global classification (i.e., predict a class from a map), (ii) global regression (i.e., predict a set of parameters from a map), (iii) dense classification (i.e., predict a class for each pixel of a map), and (iv) dense regression, (i.e., predict a set of maps from a map).
% Input data are spherical maps with a single value per pixel, such as the CMB temperature, or multiple values per pixel, such as surveys at multiple radio frequencies.
% While there exists many graph NNs (see for example xxx and xxx for recent reviews),
% \todo{Why this graph NN? We want an equivalence with the continuous world to prove equivariance to arbitrary rotations.}
% \todo{We can take here a different stance than: "we'll use whatever method gives the best performance". SHTs are the standard analysis tool on the sphere. People not only want to apply learned filters, but designed ones (e.g., Gaussian smoothing), people want to interpret and see the spectrum as spectral analysis is important (at least in cosmo, we should find some refs for that}
% That should be short: it's an ML audience. We can reference previous work. No general ML stuff, as in cosmo paper.
%\section{Spherical harmonics and equivariance}
\section{Harmonics and equivariance}
% \todo{why equivalence to continuous provides equivariance. Clean way of doing it. Alternative: Renata's mechanical approach.}
% often easier to describe in Fourier space \citep{kondor2018equivariance} => motivation for the spectral interpretation?
% Proper study of symmetries and equivariance requires a continuous treatment.
% @Michael: Not necessarily. It is more complicated than that. For example, if you work in the spectral domain, you can bypass sampling problem. This is what happen with the work of Cohen. So I think we should not speak about that.
% For example, only rotations by $90^\circ$ can be studied on a discrete sampling grid.
% \todo{there is a compromise about the sampling quality (how far from uniform) and how close discrete computations can be made to the true continuous ones}
\begin{figure}
\begin{minipage}[c]{.21\linewidth}
\centering
\includegraphics[width=\linewidth]{ring}
\caption{Circle $\mathcal{C}^1$ (green), regular samples and graph vertices (red), graph edges (blue).}
\label{fig:ring}
\end{minipage}
\hfill
\begin{minipage}[c]{.7\linewidth}
\centering
\includegraphics[width=\linewidth]{graph_eigenvalues}
\caption{The eigenvalues $\bLambda$ of the graph Laplacian $\L = \U \bLambda \U\trans$ are clearly organized in groups. Each group corresponds to a degree $\ell$ of the spherical harmonics. Each degree has $2\ell + 1$ orders.
% See also \figref{graph_harmonics}.
}
\label{fig:graph_eigenvalues}
\end{minipage}
\end{figure}
\begin{figure}
\centering
\includegraphics[trim={0 6.37cm 0 0},clip,height=0.15\linewidth]{eigenvectors}
\hfill
\includegraphics[height=0.15\linewidth]{subspace_harmonics_eigenvectors_v2}
\caption{Correspondence between the subspaces spanned by the graph Fourier basis and the spherical harmonics.
The eigenvectors $\U = [\b u_1, \dots, \b u_{N_{pix}}]$ of the graph Laplacian $\L = \U \bLambda \U\trans$, which form the Fourier basis, clearly resemble the spherical harmonics.
The first 8 are shown on the left.
%The correspondence is computed in two steps.
To quantify the correspondence, we first compute the power spectral density (PSD) of each eigenvector with the SHT.
Second, as there are $2\ell+1$ spherical harmonics for each degree $\ell$, we sum the contributions of the corresponding $2\ell+1$ eigenvectors.
% No real convergence with increasing Nside.
% Row $j$ is the sum of the (normalized) power spectral densities (PSDs), computed by the SHT, of the $2j+1$ graph Fourier modes $\b u_i$ identified with degree $j$.
% The PSD is computed for degrees $\ell=0$ to $\ell = 3 N_{side} -1$.
The matrices on the right show how the subspaces align: the Fourier basis spans the same subspaces as the spherical harmonics in the low frequencies, and the eigenvectors leak towards adjacent frequency bands at higher frequencies.
%While there is a systematic error,
The graph Fourier basis aligns at higher frequencies as the resolution ($N_{pix} = 12 N_{side}^2$) increases.}
\label{fig:subspace_harmonics_eigenvectors}
\end{figure}
%\paragraph{Why is the harmonic analysis of the graph important for equivariance?}
Both the graph and the spherical convolutions can be expressed as multiplications in a Fourier domain.
As the spectral bases align, the two operations become similar.
Hence, the equivariance property of the spherical convolution carries over to the graph convolution.
%the spectral analysis of the graph Laplacian gives insights on the equivariance property of the graph convolution.
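To make this concrete, filtering a graph signal $\x \in \R^{N_{pix}}$ with a spectral kernel $h$ reads
\begin{equation*}
h(\L) \, \x = \U h(\bLambda) \U\trans \x ,
\end{equation*}
i.e., the signal is projected on the Fourier basis, scaled by $h$ evaluated at the eigenvalues, and projected back.
A spherical convolution with a zonal (axisymmetric) kernel likewise multiplies all $2\ell+1$ harmonic coefficients of degree $\ell$ by a factor that depends on $\ell$ only.
Hence, if $\U$ spans the same subspaces as the spherical harmonics and $h$ is approximately constant over each group of $2\ell+1$ eigenvalues, the two operations coincide.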
\paragraph{Known case: the circle $\mathcal{C}^1$.}
Let $\theta\in[0,2\pi[$ be a parametrization of each point $(\cos\theta,\sin\theta)$ of $\mathcal{C}^1$.
The eigenfunctions of the Laplace-Beltrami operator of $\mathcal{C}^1$ are $u_{\ell,m}(\theta)=e^{i m \ell \theta}$, for $\ell \in \mathbb{N}$ and $m\in\{-1,1\}$.
Interestingly, for a \emph{regular sampling} of $\mathcal{C}^1$ (shown in \figref{ring}), the sampled eigenfunctions turn out to be the discrete Fourier basis.
That is, the harmonic decomposition of a discretized function on the circle can be done using the well-known discrete Fourier transform (DFT).
Moreover, the graph Laplacian of the sampled circle is diagonalized by the DFT basis, as all circulant matrices have the complex exponentials as an eigenbasis \citep{strang1999dct}.
Hence, for $\mathcal{C}^1$, it can be verified, under mild assumptions, that the graph convolution is equivariant to translation \citep[section 2.2 and equation 3]{perraudin2017stationary}.
More generally, higher-dimensional analogues such as the torus $\mathcal{C}^2$ also have a circulant Laplacian and an equivariant convolution.
% For an irregular sampling however, the graph Fourier basis will not be the sampled eigenfunctions.
The above does, however, not hold for irregular samplings: the more irregular the sampling, the further the graph Fourier basis will be from the sampled eigenfunctions.
% Extreme case: all the samples at the same position.
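The diagonalization claim is easy to verify numerically; a minimal check (ours, for illustration) on the ring graph of \figref{ring}:
\begin{verbatim}
import numpy as np

N = 64
I = np.eye(N)
# Circulant Laplacian of the ring: degree 2, weight -1 to each neighbor.
L = 2 * I - np.roll(I, 1, axis=0) - np.roll(I, -1, axis=0)
# DFT basis: U[n, k] = exp(2i pi k n / N) / sqrt(N).
n, k = np.meshgrid(np.arange(N), np.arange(N), indexing='ij')
U = np.exp(2j * np.pi * k * n / N) / np.sqrt(N)
D = U.conj().T @ L @ U  # should be diagonal
assert np.allclose(D, np.diag(np.diag(D)))
assert np.allclose(np.diag(D).real,
                   2 - 2 * np.cos(2 * np.pi * np.arange(N) / N))
\end{verbatim}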
% \paragraph{Necessary conditions for the rotational equivariance on the sphere}
% We note two important points for the equivariance property to be carried on the graph convolution: a) the sampling needs to be regular, and b), the constructed graph has to be constructed circulant,\footnote{A circulant graph is an undirected graph acted on by a cyclic group of symmetries which takes any vertex to any other vertex.} i.e. it has a circulant weight matrix.
% Unfortunately, there exist no regular sampling for a sphere $\mathcal{S}^2$ of more than $12$ points. Hence it is in general not possible to build a graph with a convolution that is equivariant to rotation\footnote{The exception is the dense graph. Unfortunately, in the extreme case the graph convolution is limited to Kronecker kernels, which makes it useless.}.
% \subsection{Boundary conditions}
% The graph setting used throughout this contribution corresponds to assumed reflective border conditions.
% While that is irrelevant when working on the complete sphere (as it has no border), it slightly affects the convolution operation when only a part of the sphere is considered.
% As depicted in \figref{border_effects}, a filter localized near a border (via $h(\L) \b \delta_i$) is no longer isotropic.
% These border effects can, however, be mitigated by padding with zeros a small area around the part of interest (in which case they become similar to border effects in classical CNNs).
% We however do not expect these effects to cause any problem as long as the data samples cover the same area.
% \begin{figure}[ht!]
% \centering
% \includegraphics[width=\linewidth]{border_effects}
% \caption{Convolution kernel (also called filter) localized in the center and left corner of a graph built from $1/12^\text{th}$ of the sphere at $N_{side} = 16$.
% A filter $h$ is localized on a pixel $i$ as $\T_i h = h(\L) \b \delta_i$ (see \secref{graph_convolution}, equation~\eqnref{graph_convolution_spatial}).
% The filter is not isotropic anymore when localized on the corner as the graph representation of a manifold assumes reflective border conditions.}
% \label{fig:border_effects}
% \end{figure}
% \subsection{Convergence proof for random uniform sampling}
% Let us use the angular parametrization of the sphere $\mathcal{S}^2=\{\bf{x} | \|\bf{x}\|_2=1 \}$, in term of angles of zenith, $\theta\in [0,\pi ]$, and azimuth $\phi\in [0,2\pi[$.
% \begin{equation}
% \Phi(\theta, \phi) = (\sin \theta \cos \phi, \sin \theta \sin \phi, \cos \theta )
% \end{equation}
% Let $\mathcal(C)^\infty(\mathcal{S}^2)$ denote the class of infinitely differentiable functions $\mathcal{S}^2\rightarrow \mathbb{R}$. Using the angular parametrization of the sphere, the Laplace-Beltrami for a function $f\in \mathcal(C)^\infty(\mathcal{S}^2)$ is given by:
% \begin{equation}
% \Delta_{\mathcal{S}^2} f = \frac{1}{\sin \theta } \frac{\partial}{\partial \theta} \left(\sin \theta \frac{\partial f}{\partial \theta} + \frac{1}{\sin^2 \theta} \frac{\partial^2f}{\partial \phi^2} \right)
% \end{equation}
% Let us extend the graph Laplacian
% \begin{equation}
% \L \b{f}[i] := \frac{1}{n} \left( \b{f}[i]\sum_{j=1}^n k_t(x_i,x_j) -\sum_{j=1}^n \b{f}[j] k_t(x_i,x_j)\right),
% \end{equation}
% where $k_t(x_i,x_j)=e^{-\frac{x_i-x_j}{4t}}$, \label{eq:full_graph_gaussian_weight}
% to each point on the Sphere.
% It is a generalization of the graph Laplacian operator that apply on a function $f:\mathcal{S}^2 \rightarrow \mathbb{R} $:
% \begin{equation}
% L_t f(y) := \frac{1}{n} \left( f(y)\sum_{j=1}^n k_t(y,x_j)-\sum_{j=1}^n f(x_j)k_t(y,x_j) \right)
% \end{equation}
% Given a function $f\in \mathcal{C}^\infty(\mathcal{S}^2)$ and for a fixed point $p \in \mathcal{C}^2$, after appropriate scaling the operator $L_t$, converge toward of the Laplace–Beltrami operator.
% \begin{theorem}[Adaptation of Theorem 3.1 of~\cite{belkin2005towards}, Laplacian convergence] \label{theo:laplacian_convergence_belkin}
% Let data points $\b{x}_1,\dots, \b{x}_N$ be sampled from a uniform distribution on a manifold $\mathcal{M} \subset \mathbb{R}^T$.
% Using the weight function \eqref{eq:full_graph_gaussian_weight}, define a sequence $t_N=N^{-\frac{1}{4+\alpha}}$, where $\alpha>0$ and let $f\in \mathcal{C}^\infty(\mathcal{C}^2)$, then the following holds:
% \begin{equation}
% \lim_{N \rightarrow \infty} \frac{1}{t_N(4\pi t_N)} (L \b{f})[i] = \frac{1}{\text{vol} (\mathcal{C}^2)} (\Delta_{\mathcal{C}^2} f)(\b{x}_i)
% \end{equation}
% where the limit is taken in probability and $\text{vol}(\mathcal{C}^2)=4 \pi$ is the surface of the sphere with respect to the canonical measure.
% \end{theorem}
% \subsection{Empirical analysis of the graph eigenvectors}
% The first 16 eigenvectors $[\b u_1, \ldots, \b u_{16}]$ of the graph Laplacian $\L$, forming the lower part of the graph Fourier basis $\U$, are shown in \figref{graph_harmonics}.
\paragraph{Analysis of the graph Laplacian used in DeepSphere.}
As there is no regular sampling of the sphere, we cannot apply the same reasoning as for the circle.
We can however perform a harmonic analysis to empirically assess how similar the graph and the spherical convolutions are.
% Let us further observe the spectral properties of our constructed spherical graph laplacian $\L$.
%Given the graph used for DeepSphere, let us analyze the spectral alignment of the graph Fourier basis and the spherical harmonics.
The graph Laplacian eigenvalues, shown in \figref{graph_eigenvalues}, are clearly organized in frequency groups of $2\ell + 1$ orders for each degree $\ell$.
These blocks correspond to the different eigenspaces of the spherical harmonics.
We also show the correspondence between the subspaces spanned by the graph Fourier basis and the spherical harmonics in \figref{subspace_harmonics_eigenvectors}.
For example, with $N_{side}=4$, we observe a good alignment for $\ell \leq 8$: the graph convolution will be equivariant to rotations for low-frequency filters.
The imperfections are likely due to the small irregularities of the HEALPix sampling (varying number of neighbors and varying distances between vertices).
%Furthermore, the graph is currently not optimally constructed.
%Further work is necessary
Furthermore, a follow-up study is underway to construct the graph optimally.
We also hope to obtain a proof of equivalence or convergence.
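The analysis of \figref{subspace_harmonics_eigenvectors} can be sketched as follows (our simplification, reusing the hypothetical \texttt{healpix\_graph\_laplacian} above and \texttt{healpy}'s SHT):
\begin{verbatim}
import healpy as hp
import numpy as np

nside = 4                             # N_pix = 12 * nside**2 = 192
L = healpix_graph_laplacian(nside)    # sketch from the method section
lam, U = np.linalg.eigh(L.toarray())  # full spectrum; fine at this size
lmax = 3 * nside - 1
# PSD of each eigenvector through the SHT (anafast expects RING ordering).
psd = np.array([hp.anafast(hp.reorder(U[:, i], n2r=True), lmax=lmax)
                for i in range(U.shape[1])])
start = 0
for ell in range(lmax + 1):
    n_l = 2 * ell + 1  # number of orders (and eigenvectors) of degree ell
    block = psd[start:start + n_l].sum(axis=0)
    start += n_l
    print(f"degree {ell}: {block[ell] / block.sum():.2f} of the energy")
\end{verbatim}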
% as the resulting Laplacian is probably far from circulant.
%One construction scheme that might be more adapted is a fully connected graph as this scheme is used in convergence proof in \cite{belkin2005towards,belkin2007convergence}.
% Those discrepancies result in a convolution operation that is not exactly equivariant to rotation.
% Nevertheless, our experiments suggest that these downsides do not have an important effect on the convolution nor hinder the performance of the NN.
% \todo{new results from Martino: becomes better in BN setting, currently under study}
% @Michael: Sorry no space for that
% \todo{we have less rich operations (compared to the most general linear equivariant map) by restricting our filters to be radial, but does it matter in practice?}
% \todo{isotropic filters provide invariance to the third rotation}
\begin{figure}[t!]
\centering
\includegraphics[width=0.32\linewidth]{result_order1}
\hfill
\includegraphics[width=0.32\linewidth]{result_order2}
\hfill
\includegraphics[width=0.32\linewidth]{result_order4}
\caption{
Classification accuracies.
% of the fully convolutional variant of DeepSphere (DeepSphere FCN), the standard convolutional variant of DeepSphere (DeepSphere CNN), the fully convolutional 2D ConvNet (2D ConvNet FCN), the standard 2D ConvNet (2D ConvNet CNN), the support vector machine (SVM) with the power spectral density (PSD) as features, and the SVM with the histogram as features.
The difficulty of the task depends on the level of noise and the size of a sample (that is, the number of pixels that constitute the sample to classify). Order $o=1$ corresponds to samples whose area is $\frac{1}{12}=8.3\%$ of the sphere ($\approx 1 \times 10^6$ pixels), order $o=2$ to $\frac{1}{12 \times 2^2} = 2.1\%$ ($\approx 260 \times 10^3$ pixels), and order $o=4$ to $\frac{1}{12 \times 4^2} = 0.5\%$ ($\approx 65 \times 10^3$ pixels).
% The standard deviation of the added Gaussian noise varies from zero to $2\times$ the standard deviation of pixel's values in the noiseless maps.
% It is clear from those results that the noise makes the problem harder, and that having more pixels available to classify a sample makes the problem easier (the classifier having more evidence to make a decision).
The FCN variant of DeepSphere beats the CNN variant by being invariant to rotation. Both variants largely beat the 2D ConvNet and the two SVM baselines.
}
\label{fig:results}
\end{figure}
\section{Experiments}
% Preliminary results on shape retrieval (SHREC-17) indicate similar performance as \citet{cohen2018sphericalcnn} and \citet{esteves2017sphericalcnn}.
% \todo{(while being isotropic and invariant to the third rotation)}
\paragraph{Setup.}
The performance of DeepSphere is demonstrated on a discrimination problem: the classification of cosmological convergence maps\footnote{Convergence maps represent the distribution of over- and under-densities of mass in the universe. They were created using the fast lightcone method UFalcon described in \citet{sgier2018fastgeneration}.} into two model classes.
Details about the experimental setup can be found in \citet{perraudin2018deepsphere}.
Code to reproduce the results is in the git repository.
%The code and instructions to reproduce the results are available at \url{https://github.com/SwissDataScienceCenter/DeepSphere}.
% \todo{While we only demonstrate a classification task here, other tasks, such as regression or segmentation, are also possible.}
% We control the difficulty of the classification problem by limiting the number of pixels available to the algorithms, i.e., extracting partial maps from whole sky maps.
% Using the properties of HEALPix, we split maps into $12 \times o^2$ ($o=1,2,4,\dots$) independent samples of $(N_{pix} / o)^2$ pixels. The resulting partial maps are large enough to suffer from the effects of the spherical geometry, and cover 8.3\% ($\approx 1 \times 10^6$ pixels), 2.1\% ($\approx 260 \times 10^3$ pixels), and 0.5\% ($\approx 65 \times 10^3$ pixels) of the sphere for $o=1,2,4$, respectively.
% To make the discrimination harder and the problem more realistic, centered Gaussian noise is added to the simulated maps.
% \todo{The standard deviation of the noise varies from zero (i.e., no noise) to $2\times$ the standard deviation of pixel's values in the noiseless maps.}
We propose two architectures: the ``CNN variant'', where convolutional layers are followed by dense layers, and the ``FCN variant'', where convolutional layers are followed by global average pooling.
The FCN variant is rotation invariant to the extent that the convolution is rotation equivariant.
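As a toy illustration on the regularly sampled circle of \figref{ring} (where rotations act as cyclic shifts and the Laplacian is circulant), any polynomial filter commutes with rotations, so the global average pooling of the FCN head is exactly invariant:
\begin{verbatim}
import numpy as np

N = 12
I = np.eye(N)
L = 2 * I - np.roll(I, 1, axis=0) - np.roll(I, -1, axis=0)  # ring Laplacian
h = lambda s: s + 0.5 * (L @ s) - 0.1 * (L @ (L @ s))  # a filter h(L)
x = np.random.randn(N)
x_rot = np.roll(x, 3)  # rotating the circle permutes the samples
assert np.allclose(h(x_rot), np.roll(h(x), 3))    # convolution: equivariant
assert np.allclose(h(x_rot).mean(), h(x).mean())  # average pooling: invariant
\end{verbatim}
On the sphere, this invariance holds only to the extent that the graph convolution is rotation equivariant, as discussed above.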
\paragraph{Baselines.}
DeepSphere is first compared against two simple yet powerful cosmological baselines, based on two sets of features: (i) the power spectral density (PSD) of the maps, and (ii) the histogram of their pixels \citep{patton2017cosmologicalconstraints}.
DeepSphere is further compared to a classical CNN for 2D Euclidean grids, as in \citep{krachmalnicoff2019convolutional}.
%It can be applied thanks to properties of HEALPix .
To be fed into the 2D ConvNet, partial spherical signals are transformed into flat images.
DeepSphere and the 2D ConvNet have the same number of trainable parameters.
\paragraph{Results.}
\Figref{results} summarizes the results.
% We indeed observe that the problem is made more difficult as the sample size decreases, and the noise increases.
% As fewer pixels are available to make a decision about a sample, the algorithms have access to less information and thus cannot classify as well.
% As the noise increases, the useful information gets diluted in irrelevant data, i.e., the signal-to-noise ratio (SNR) diminishes.
% As the cosmological parameters were chosen for the maps to have similar PSDs (see \figref{psd_sigma3}), it is reassuring to observe that an SVM with those as features has difficulties discriminating the models.
% Other statistics are therefore needed to solve the problem. Using histograms as features gives a significant improvement.
% Performance is very good for larger maps and deteriorates for the smaller cases with increased noise level, reaching 80\% at the highest noise level for $o=4$.
% The histogram features contain information about the distribution of pixels, which clearly varies for the two classes.
% They do not, however, include any spatial information.
% DeepSphere (both the FCN and CNN variants) shows superior performance over all configurations.
Overall, DeepSphere performs best with a gap that widens as the problem gets more difficult.
% This is likely due to DeepSphere learning features that are adapted to the problem instead of relying on hand-designed features.
% The accuracy of DeepSphere is $>97\%$ for orders $o=1$ and $o=2$, for all noise levels, and starts to deteriorate for $o=4$, reaching 90\% for the highest noise level.
% It may seem unfair to compare DeepSphere, who has access to raw pixels, with SVMs who only see limited features (histograms and PSDs).
% We however trained an SVM on the raw pixels and were unable to obtain over $60\%$ accuracy in any of the three noiseless cases.
The FCN variant outperforms the CNN variant as it exploits the rotational symmetry of the task (explained by the cosmological principle of homogeneity and isotropy).
The 2D ConvNet fares in between the SVMs and DeepSphere.
Its lower performance is likely due to a lack of rotation equivariance, and to the geometric distortion introduced by the projection, which the NN has to learn to compensate for.%, which comparatively requires more training data.
% We also observed that the learned convolution kernels were not radial.
% This may seem counterintuitive as the CNN is a generalization of the FCN and hence should be able to learn the same function and provide at least equivalently good results.
% Nevertheless, as discussed in \secref{architecture}, the larger number of parameters incurred by the increased flexibility requires more training data to learn the rotation symmetry.
% The superior performance of the FCN is an empirical validation that these maps are stationary with a small-support radial autocorrelation function, stemming from the fact that the mass distribution is homogeneous and isotropic.
% It also implies that this classification problem is invariant to rotation.
% Hence, a pixel can be statistically classified using only its surrounding, and those local predictions can be averaged to vote for a global consensus.
% The CNN variant, however, may be better for data that does not have this property, as the architecture should always be adapted to the data and task (see \secref{architecture}).
% While testing the FCN and CNN variants of DeepSphere, we made the following empirical observations.
% First, training was more stable (in the sense that the loss decreased more smoothly) when using Chebyshev polynomials, as in \eqnref{graph_convolution_cheby}, rather than monomials, as in \eqnref{graph_convolution_monomial}.
% Nevertheless, we could not observe a significant difference in accuracy after convergence.
% Second, using $\ell_2$ regularization does not help either with performance or training of the models, mostly because the models are not over-fitting.
% Third, we recommend initializing the Chebyshev coefficients with a centered Gaussian distribution with standard deviation $\sqrt{2/(F_{in} \times (K + 0.5))}$, where $K$ is the degree of the Chebyshev polynomials and $F_{in}$ the number of input channels.
% This standard deviation has the property of keeping the energy of the signal more or less constant across layers.\footnote{We derived this rule from the principles presented in \cite{glorot2010understanding}, the Chebyshev polynomial equations, and some empirical experiments.}
% % Nati to Michael: The problem with the previous claim is that it depends on the non-linarity. Furthermore, does it make sense to make such a claim if we use batch norm?
% % MD: I don't know... But that's not too important.
% Finally, we observe that scaling the Laplacian's eigenvalues between $[-a, a]$, where $0 < a \leq 1$, significantly helps in stabilizing the optimization. We use $a = 0.75$ in our experiments.
% \section{Conclusion} % & perspective
% \todo{Generalization vs specialization: most general is to assume no symmetries (fully connected NN).
% NNs can be specialized by adding some equivariance and invariance to symmetry groups such as translation, rotation, flip, etc.
% More specialized NNs are more limited in the class of functions they can approximate, but requires less samples.
% Again, use the right symmetries.}
% exploit symmetries in the data by being equivariant (or invariant) to a select group of symmetric transformations (symmetry group).
% In this paper, we argue that graph neural networks are a flexible model (albeit not the most general) to exploit such symmetries.
% \todo{As equivariance is not the Graal either. We expect the desired amount of equivariance to depend on the data and task. As usual, practitioners should adopt a NN architecture that exploit the symmetries of their problem. \citep{coors2018sphericalcnn, jiang2019sphericalcnn}.}
% \todo{future directions: equivalence / convergence to spherical harmonics, other tasks, comparison with other spherical CNNs, graph CNNs (especially those for manifolds), boundary conditions on partial sphere}
% \textbf{Long term vision.}
% We hope to establish graphs as a generic support for processing and learning over known and unknown manifolds.
% \todo{Applications are plenty:
% known manifolds (1D, 2D, 3D Euclidean spaces, circle, sphere)
% % Minkowski spacetime
% unknown manifolds (shapes, point clouds, feature sets)
% }
% \todo{
% * known manifolds: grid, sphere, spacetime (a pseudo-Riemannian 4-manifold), surfaces defined by NURBS, anything else?
% * unknown manifolds: meshes (human body), point clouds => what are the symmetries? local isometry
% * non-manifold: all networks (brain, social, transportation, telecom, etc.) and relations (author-papers, user-products)
% }
% \todo{Similar goal: graph CNNs for group equivariant convolution.}
% The vision is to push the use of graphs as the support for computation on manifolds. The advantages of graphs are to relax constraints on the sampling (e.g., allow an irregular or partial sampling), and be computationally more efficient (while being exact w.r.t. the continuous case).
% Part of a broader research effort to explore the use of GSP to solve continuous problems.
\subsubsection*{Acknowledgments}
We thank Pierre Vandergheynst for advice and helpful discussions, and Andreas Loukas for having processed the raw GHCN data.
% We used the Python Graph Signal Processing package (PyGSP) \citep{pygsp} for computations and plots.
The Python Graph Signal Processing package (PyGSP) \citep{pygsp} was used to build graphs, compute the Laplacian and Fourier basis, and perform graph convolutions.
\bibliography{refs}
\bibliographystyle{iclr2019}
\end{document}