functional_manifold_data/index.html at gh-pages · suswei/functional_manifold_data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<title>The hidden manifold distance for functional data</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/bootstrap.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/default.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
if (window.hljs) {
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();
  if (document.readyState && document.readyState === "complete") {
    window.setTimeout(function() { hljs.initHighlighting(); }, 0);
  }
}
</script>


<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
  height: auto;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 51px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 56px;
  margin-top: -56px;
}

.section h2 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h3 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h4 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h5 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h6 {
  padding-top: 56px;
  margin-top: -56px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // active menu anchor
  href = window.location.pathname
  href = href.substr(href.lastIndexOf('/') + 1)
  if (href === "")
    href = "index.html";
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- code folding -->


<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">The hidden manifold distance for functional manifold data</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Paper</a>
</li>
<li>
  <a href="sim_functional_data.html">Functional manifold simulation scenarios</a>
</li>
<li>
  <a href="test_analytic_geodesic_distance.html">Verifying analytic geodesic distance derivation</a>
</li>
<li>
  <a href="README.html">About</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">

      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div class="fluid-row" id="header">


<h1 class="title toc-ignore">The hidden manifold distance for functional data</h1>

</div>


<div id="introduction" class="section level1">
<h1>Introduction</h1>
<p>Many statistical and machine learning methods rely on some measure of distance. For example, clustering and classification of functional data, discretely-sampled curves varying over a continuum, is a common learning task in which distance plays a crucial role. In a naive treatment, the data may be simply processed as vectors in Euclidean space. Doing so however risks losing the richness of functional data; indeed, one may do better to regard the data as functions. The challenge of analyzing infinite-dimensional objects can be ameliorated if the curves are smooth since the dimensionality is then only artificially high. This insight underpins functional data analysis, a subfield of statistics that studies functional data.</p>
<!-- This has many implications for the processing of functional data.  -->
<!-- For example, the $L^2$ distance between the smoothed functional observations has advantages over the Euclidean distance between noisy functional observations treated as vectors.  -->
<p>In this work, we advocate that for functional manifold data, i.e. functional data that lie on a manifold, the geodesic distance is more appropriate than the <span class="math inline">\(L^2\)</span> distance. Indeed, when the manifold hypothesis is plausible, e.g. for classes of probability density functions and classes of warped curves of a common template function, clustering and classification using the geodesic distance may give better results than using the <span class="math inline">\(L^2\)</span> distance.</p>
<p>The manifold setting presents certain challenges since, for nonlinear manifolds, even basic operations such as addition and subtraction require special consideration. We will carefully lay out a technique for estimating the geodesic distance between noisy functional manifold data.</p>
<p>Let <span class="math inline">\(X_1,\ldots,X_n\)</span> be a sample of <span class="math inline">\(n\)</span> independent realizations of a random variable <span class="math inline">\(X\)</span> that takes value in the Hilbert space <span class="math inline">\(L^2([a,b],{\mathbb{R}})\)</span>. Suppose additionally that the function <span class="math inline">\(X\)</span> belongs to a Riemannian manifold <span class="math inline">\({\mathcal{M}}\subset L^2([a,b],{\mathbb{R}})\)</span> with Riemannian metric tensor <span class="math inline">\(g\)</span> which are often used to assign a metric on the manifold as follows . For each point <span class="math inline">\(X\)</span> on the manifold, the Riemannian metric tensor <span class="math inline">\(g\)</span> has an inner product <span class="math inline">\(g_X\)</span> on the tangent space <span class="math inline">\(T_X {\mathcal{M}}\)</span>. The norm of a tangent vector <span class="math inline">\(V \in T_X {\mathcal{M}}\)</span> is defined as <span class="math display">\[\|V\| = \sqrt{g_X(V,V)}.\]</span> The geodesic distance between two functions <span class="math inline">\(X_i,X_j\)</span> on the manifold <span class="math inline">\({\mathcal{M}}\)</span>, based on this metric tensor <span class="math inline">\(g\)</span>, is defined as <span class="math display">\[ d_{{\mathcal{M}}}(X_i,X_j):=\inf_{\gamma:[0,1] \to {\mathcal{M}}, \gamma(0) = X_i, \gamma(1) = X_j} l(\gamma),\]</span> where <span class="math display">\[ l(\gamma) := \int_0^1 \left\| \frac{\,d\gamma}{\,d t}(t) \right\| \,dt\]</span> is the length of the piecewise-smooth curve <span class="math inline">\(\gamma\)</span>. <!-- ???Is it clear how $l(\gamma)$ depends on $g$??? --></p>
<!-- ???insert example that shows pairwise geodesic distances give visually more interesting results than pairwise $L^2$ distance.??? -->
<!-- (gives examples and references for functional manifolds). -->
<!-- Before continuing, some diambiguities are called for. Contrast existing work on functional data where either the domain is a manifold or the range is a manifold, or both.  -->
<!-- There exist many nonlinear dimension reduction methods for manifolds embedded in Euclidean space, also known as manifold learning methods, which are particularly popular in computer vision. However, the success of these methods typically require the data to be observed with a high signal-to-noise ratio. (is it true? ref?)  -->
<p>Ideally, the functions <span class="math inline">\(X_i\)</span> and <span class="math inline">\(X_j\)</span> are observed on a very dense domain grid with no measurement error. Then the geodesic distance <span class="math inline">\(d_{\mathcal{M}}(X_i,X_j)\)</span> can be estimated using any of a variety of shortest-path algorithms, e.g. the Floyd-Warhsall  algorithm. However, most shortest-path algorithms critically assume observations live near the manifold with little noise  and are thus likely to fail for functional data which can be observed with low signal-to-noise ratio. <!-- A preprocessing step where the nosiy functional manifold data is smoothed is also likely to fail. This is because smoothing techniques in functional data analysis are designed for the $L^2$ recovery of the original curve, not to ensure the recovered functional versions of the data lie on or close to the functional manifold $\M$. This presents challenges to the application of many manifold learning techniques which assume fully observed nearly-noiseless inputs. --> <!-- Estimating the manifold $\M$ from a sample of functions observed discretely and with measurements is a challenging task (results depend on curvature of the manifold, need a lot of data). Even after recovering a functional version of the data, the chances are pretty high that $\tilde X_1,\ldots,\tilde X_n$ do not lie exactly on $\M$.  --> In this work, we put forth a technique for the recovery of the <span class="math inline">\(n{\times}n\)</span> matrix <span class="math inline">\(G\)</span> of pairwise geodesic distances: <span class="math display">\[
G(i,j)=G(j,i) = \begin{cases}
      d_{{\mathcal{M}}} (X_i,X_j) &amp; \textrm{if $i\neq j$,} \\
      0 &amp; \textrm{otherwise.}
   \end{cases}
\]</span> In lieu of <span class="math inline">\(X_i\)</span> and <span class="math inline">\(X_j\)</span>, we have access only to discretely-sampled noisy functional observations <span class="math inline">\(Y_i\)</span> and <span class="math inline">\(Y_j\)</span> that possibly lie off the true manifold.</p>
<!-- Pairwise distances are important for many downstream tasks, e.g.  classification, clustering, manifold learning (ISOMAP), nonparametric regression (kernel regression). Certainly, pairwise geodesic distances are legitimate objects of interest in themselves but our motivation is more general in nature. Specifically, we use pairwise distances as an illuminating example to advocate and promote the nonlinear perspective in FDA.  -->
<p>The work of  was among the first in functional data analysis to consider the manifold hypothesis for functional data. A notion of the mean and variation of functional manifold data was introduced there. Of particular relevance to this work is their proposed P-ISOMAP procedure which allows for noisy functional observations. This is in contrast to the classic ISOMAP algorithm  which assumes observations lie exactly on the manifold. This drawback to ISOMAP was also realised in  who proposed a procedure we will call robust-ISOMAP for functional data that is less sensitive to outliers. We will discuss P-ISOMAP and robust-ISOMAP in more details below.</p>
<!-- \cite{LinYao2017]  -->
</div>
<div id="proposed-method-for-estimating-geodesic-distances" class="section level1">
<h1>Proposed method for estimating geodesic distances</h1>
<p>Suppose that each curve <span class="math inline">\(X_i \in ({\mathcal{M}},g)\)</span> is observed with measurement error on a time grid <span class="math inline">\(T_i=(t_{i1},\ldots,t_{iK}), a \le t_{i1} &lt; \ldots &lt; t_{iK} \le b\)</span>, i.e. we observe a sample of <span class="math inline">\(K\)</span>-dimensional vectors <span class="math inline">\(Y_1,\ldots,Y_n\)</span> with <span class="math inline">\(Y_{ij} = X_i(t_{ij}) + \epsilon_{ij}\)</span>. Let the random variables <span class="math inline">\(\epsilon_{ij}\)</span> have mean zero and be uncorrelated with each other. We assume that each observation grid <span class="math inline">\(T_1,\ldots,T_n\)</span> is dense, i.e. <span class="math inline">\(K\)</span> is large.</p>
We begin by converting the discretely-sampled functional observations into continuous ones. Let <span class="math inline">\(\tilde X_1,\ldots,\tilde X_n\)</span> denote the functional versions of the raw data obtained by some smoothing method. For example, we may employ spline smoothing  to recover the functional versions of the raw data, i.e.<br />

<span class="math display">\[\begin{equation}\label{eq_spline_smoothing}
\tilde X_i = \arg \min_{f\in C^2[0,1]}\left\{\sum_{j=1}^{K}\left(f(t_{ij})-Y_{ij}\right)^2+\lambda \|\partial^2_tf\|^2_{L^2}\right\},
\end{equation}\]</span>
<p>where <span class="math inline">\(\lambda&gt;0\)</span> is a tuning parameter controlling the smoothness of <span class="math inline">\(\tilde X_i\)</span>. <!-- The proposed methodology for estimating the pairwise geodesic distances $\{ d_\M (X_i,X_j)\}_{i>j}$ is agnostic to the smoothing method employed. --> Our method is based on the idea that the underlying functional manifold <span class="math inline">\({\mathcal{M}}\)</span> can be sufficiently well-recovered by the subspace-constrained mean-shift (SCMS) algorithm  which we now review briefly in the Euclidean case. Suppose <span class="math inline">\(Z_1,\ldots,Z_n\)</span> is a sample of realisations of a random vector <span class="math inline">\(Z \in {\mathcal{M}}\)</span>. We can use a kernel density estimator <span class="math inline">\(\hat f\)</span> based on this data to estimate the probability density function of <span class="math inline">\(Z\)</span>. Let the set <span class="math inline">\(\hat M\)</span> be comprised of the basins of attraction of <span class="math inline">\(\hat f\)</span>. The SCMS algorithm sends each point <span class="math inline">\(Z_i\)</span> to its destination in <span class="math inline">\(\hat M\)</span>. Theoretical justification that <span class="math inline">\(\hat M\)</span> as estimated by SCMS is a reasonable surrogate for the true manifold <span class="math inline">\({\mathcal{M}}\)</span> can be found in .</p>
<p>Before presenting our procedure, we also need to review the ISOMAP algorithm , a three-step procedure that takes as input a set of points <span class="math inline">\(x_1,\ldots,x_n\)</span> in a submanifold <span class="math inline">\({\mathcal{M}}\subset {\mathbb{R}}^D\)</span> and produces an embedding in the space <span class="math inline">\({\mathbb{R}}^d\)</span> with <span class="math inline">\(d&lt;D\)</span> that preserves pairwise geodesic distances. The procedure is as follows.</p>
<ol style="list-style-type: decimal">
<li>Construct a weighted graph <span class="math inline">\(\mathcal{G}\)</span> with nodes corresponding to the observations <span class="math inline">\(x_1,\ldots,x_n\in {\mathbb{R}}^D\)</span>. Two nodes <span class="math inline">\(x_i\)</span> and <span class="math inline">\(x_j\)</span> are connected by an edge <span class="math inline">\(e_{ij}\)</span> with weight <span class="math inline">\(d_{ij} 1\{d_{ij} \le \varepsilon \}\)</span> where <span class="math inline">\(d_{ij}=\|x_i-x_j \|_{2}\)</span> and <span class="math inline">\(\varepsilon &gt; 0\)</span>.</li>
<li>Estimate the pairwise geodesic distances <span class="math inline">\(d_{\mathcal{M}}(x_i,x_j)\)</span> based on <span class="math inline">\(\mathcal{G}\)</span> using shortest-path algorithms. Specifically, this is the sum of the weights of the edges forming the shortest path, which is calculated either with the Floyd-Warshall algorithm  or with the Dijkstra algorithm .</li>
<li>Use multidimensional scaling to obtain an embedding in <span class="math inline">\({\mathbb{R}}^d\)</span> that preserves the pairwise geodesic distances estimated above.</li>
</ol>
<p>Note that shortest-path algorithms such as the Floyd-Warshall algorithm are used to find the smallest path  a weighted graph but they do not produce the weighted graphs themselves. Both P-ISOMAP and robust-ISOMAP are modifications of ISOMAP in the sense that they change the construction of the weighted graph in step 1 above. <!-- (which is the difficult part, depend on number of neighbors, many method out there to calculate that graph in a robust way, P-ISOMAP is one of these). --></p>
<p>In what follows, we call IsoGeo the two-step procedure which performs a modified version of the first step of ISOMAP followed by the original second step of ISOMAP. The first step of IsoGeo constructs a weighted graph <span class="math inline">\(\mathcal{G}\)</span> according to  as follows. First we construct a complete weighted graph <span class="math inline">\(\mathcal{G}_c\)</span>, meaning that every pair of nodes <span class="math inline">\(x_i\)</span> and <span class="math inline">\(x_j\)</span> is connected by an edge of weight <span class="math inline">\(d_{ij}\)</span>, so that the graph is independent of the tuning parameter <span class="math inline">\(\varepsilon\)</span>. Next, we obtain the minimal spanning tree associated with <span class="math inline">\(\mathcal{G}_c\)</span> and denote its set of edges by <span class="math inline">\(E_s\)</span>. The graph <span class="math inline">\(\mathcal{G}\)</span> is obtained by adding all edges <span class="math inline">\(e_{ij}\)</span> to <span class="math inline">\(E_s\)</span> for which the following condition is true: <span class="math display">\[ \overline{x_ix_j} \subset \bigcup_{i=1}^n B(x_i,\varepsilon_i),\]</span> where <span class="math inline">\(B(x_i,\varepsilon_i)\)</span> is the open ball of center <span class="math inline">\(x_i\)</span> and radius <span class="math inline">\(\epsilon_i = \max_{e_{ij}\in E_s}d_{ij}\)</span>, and <span class="math inline">\(\overline{x_ix_j}= \{x\in R^D \ | \ \exists \lambda \in [0,1], x=\lambda x_i + (1-\lambda)x_j \}\)</span>. Let the distance estimated by IsoGeo be denoted <span class="math inline">\(\text{IsoGeo}(x_i,x_j)\)</span>. Note that IsoGeo has no tuning parameters.</p>
<p>We are now ready to describe our procedure. Since SCMS is based on kernel density estimation, we first reduce the dimension of our data with multidimensional scaling before applying SCMS to avoid the curse of dimensionality.</p>
<!-- 1. [Susan: I think the smoothing step should be independent of our proposed methodology] Transform each vector $Y_i$ into a function $\tilde X_i$ by spline smoothing: -->
<!-- $$ \tilde X_i = \arg\min_{f\in C^2[0,1]}\left\{\sum_{j=1}^{K}\left(f(t_{ij})-Y_{ij}\right)^2+\lambda \|\partial^2_tf\|^2_{L^2}\right\}$$ -->
<!-- where $\lambda>0$ is a tuning parameter controlling the smoothness of $\tilde X_i$.  -->
<ol style="list-style-type: decimal">
<li>Discretise the functions <span class="math inline">\(\tilde X_1,\ldots,\tilde X_n\)</span> by evaluating them on a dense common grid <span class="math inline">\(\tilde T=(t_{1},\ldots,t_{D})\)</span>. Let <span class="math inline">\(s\)</span> be a positive integer, much smaller than <span class="math inline">\(D\)</span>. Obtain <span class="math inline">\(\tilde X^s_1,\ldots,\tilde X^s_n \in \mathbb R^s\)</span> using multidimensional scaling so that the pairwise <span class="math inline">\(\mathbb R^D\)</span> Euclidean distances of the discretised functions <span class="math inline">\(\tilde X_i\)</span> can be preserved as much as possible.</li>
<li>Apply the subspace constrained mean-shift algorithm  to each of <span class="math inline">\(\tilde X^s_i\)</span> to obtain its destination <span class="math inline">\(\tilde X^{s,\hat {\mathcal{M}}}_i\)</span>.</li>
<li>Our geodesic distance estimator is the <span class="math inline">\(n{\times}n\)</span> matrix <span class="math inline">\(\hat G\)</span> whose elements are given by <span class="math display">\[
\hat G(i,j)=\hat G(j,i) = \left\{ \begin{array}{ll}
 \text{IsoGeo}(\tilde X^{s,\hat {\mathcal{M}}}_i,\tilde X^{s,\hat {\mathcal{M}}}_j) &amp; \textrm{if $i\neq j$,}\\
 0 &amp; \textrm{otherwise.}
  \end{array} \right.
\]</span></li>
</ol>
<p>There are two tuning parameters to our procedure. First, there is the dimension <span class="math inline">\(s\)</span> in step 1. In our simulations we try <span class="math inline">\(s \in \{1,2,3\}\)</span>. The other parameter is the bandwidth <span class="math inline">\(h\)</span> in the subspace constrained mean-shift. In our simulations we set <span class="math inline">\(h\)</span> according to the heuristic in Equation (A1) of : <span class="math display">\[h = \left(\frac{4}{s+2}\right)^{\frac{1}{s+4}}n^{\frac{-1}{s+4}}\sigma_{\min}, \]</span> where <span class="math inline">\(\sigma_{\min}\)</span> is the minimum over <span class="math inline">\(j=1,\ldots,s\)</span> of the standard deviation of <span class="math inline">\(\tilde X^s_{1,j},\ldots, \tilde X^s_{n,j}\)</span>, with <span class="math inline">\(\tilde X^s_{i,j}\)</span> being the <span class="math inline">\(j\)</span>th entry of the vector <span class="math inline">\(\tilde X^s_{i}\)</span>. In reality, depending on the downstream task, both <span class="math inline">\(s\)</span> and <span class="math inline">\(h\)</span> can be tuned in a data-adaptive way, say using cross-validation.</p>
<!-- ##############  -->
</div>
<div id="simulation-study" class="section level1">
<h1>Simulation study</h1>
<p>We perform simulations to study various estimators of pairwise geodesic distances. We compare the performance of our method to each of the following four alternatives. The first alternative, denoted <strong>RD</strong>, takes the naive approach of applying IsoGeo directly on the raw vectors <span class="math inline">\(Y_1,\ldots,Y_n \in R^K\)</span>. Note that this procedure is only sensible if all the time grids <span class="math inline">\(T_1,\ldots,T_n\)</span> are the same.</p>
<p>For the next two alternatives, we first smooth the discretely-sampled noisy functional data using splines as described in (). Then, we either estimate pairwise distance between the smoothed <span class="math inline">\(\tilde X_1,\ldots,\tilde X_n\)</span> evaluated on a common time grid of <span class="math inline">\(D\)</span> points using IsoGeo or the <span class="math inline">\(L^2\)</span> distance. The former method will be denoted <strong>Spline IsoGeo</strong> and the latter <strong>Spline L2</strong>.</p>
<p>The final alternative we consider is P-ISOMAP which is the two-step procedure developed in  for handling noisy functional data. In the first step, a penalty is incorporated to robustify the construction of the weighted graph. This is followed by the original second step of ISOMAP. <!--* **(Random Projection) RP**  same method as our but change step 2 by : obtain $s$-dimensional representation by random projection and setp 4 by obtain $\hat G$ using a ensemble method.--></p>
<p>The code for P-ISOMAP was provided by  in MATLAB. We ran P-ISOMAP and tuned for the parameter <span class="math inline">\(\varepsilon\)</span> and the penalty parameter <span class="math inline">\(\delta\)</span> using the relative Frobenius assessment metric, described below. We found that the penalty parameter chosen in this oracle fashion was always zero, thus reducing the P-ISOMAP procedure to standard ISOMAP. Given this, we did not make further comparison to P-ISOMAP in our simulation study. We also do not present the results for the RD method which always produced poor estimates of the geodesic distance.</p>
<p>Three different metrics are used to assess the quality of a pairwise geodesic distance estimator. The first metric assesses near-<span class="math inline">\(\epsilon\)</span> isometry, which we define as the percentage of estimated pairwise geodesic distances between <span class="math inline">\(1-\epsilon\)</span> and <span class="math inline">\(1+\epsilon\)</span> of the corresponding true pairwise geodesic distance. We form the receiver operating curve with <span class="math inline">\(\epsilon\)</span> on the <span class="math inline">\(x\)</span>-axis and near-<span class="math inline">\(\epsilon\)</span> isometry on the <span class="math inline">\(y\)</span>-axis. The metric is the area under this receiver operating curve. Thus a good estimator has high values for the near-isometry metric.</p>
<p>The second metric <span class="math inline">\(\|G-\hat G\|/\|G\|\)</span> is the Frobenius norm of the estimation error matrix <span class="math inline">\(G-\hat G\)</span>, relative to the Frobenius norm of the true distance matrix <span class="math inline">\(G\)</span>. The third metric is the Pearson correlation coefficient between the upper diagonal of <span class="math inline">\(G\)</span> and <span class="math inline">\(\hat G\)</span>. A good geodesic distance estimator has low Frobenius norm and high Pearson correlation coefficient.</p>
<p>We shall consider three functional manifolds in our simulation study. Throughout, we make use of a helpful property of one-dimensional manifolds. If <span class="math inline">\(X \in {\mathcal{M}}\)</span>, with the <span class="math inline">\(L^2\)</span> metric tensor, depends on a scalar parameter <span class="math inline">\(\theta \in \mathbb R\)</span>, then <span class="math display">\[
d_{\mathcal{M}}(X_{\theta_1},X_{\theta_2}) = \int_{\theta_1}^{\theta_2} \left\| \frac{\partial X}{\partial \theta}(t)\right\|_{L^2} \,d\theta.
\]</span></p>
<p>We also need to make a disclaimer about our sampling technique below. Proper sampling from a manifold remains an open problem. Even in the Euclidean case, it is not obvious how sampling should be done . For now, to safeguard against sampling unevenly on the functional manifold, we sample on a very narrow support of the manifold parameters.</p>
<div id="isometric-functional-manifold-of-normal-density-functions" class="section level2">
<h2>Isometric functional manifold of normal density functions</h2>
<!-- Scenario 2 in sim_functional_data -->
<p>We take Manifold 2 in  and set the variance of the normal density to be <span class="math inline">\(1\)</span>. Let <span class="math inline">\(X_\beta: [a,b] \to {\mathbb{R}}\)</span> be given by <span class="math inline">\(X_\beta(t) = \frac{1}{\sqrt{2\pi}} \exp{[-\frac{1}{2}(t-\beta)^2]}\)</span>. Consider <span class="math display">\[{\mathcal{M}}= \left \{X_\beta: \beta \in [-1,1], t \in [a,b]\right \},\]</span> with the <span class="math inline">\(L^2\)</span> inner product as the metric tensor. First, note that the “straight” line connecting <span class="math inline">\(X_{\beta_1}\)</span> and <span class="math inline">\(X_{\beta_2}\)</span> does not always stay inside of <span class="math inline">\({\mathcal{M}}\)</span>. Thus the geodesic distance will not coincide with the <span class="math inline">\(L^2\)</span> distance.</p>

We set <span class="math inline">\(a=-4\)</span> and <span class="math inline">\(b=4\)</span> and sample <span class="math inline">\(\beta\)</span> according to a truncated standard normal with support on <span class="math inline">\([-1,1]\)</span>. The geodesic distance between the curves <span class="math inline">\(X_{\beta_1}\)</span> and <span class="math inline">\(X_{\beta_2}\)</span> is given by
<span class="math display">\[\begin{eqnarray*}
d_\M(X_{\beta_1},X_{\beta_2}) &amp;=&amp; \int_{\beta_1}^{\beta_2} \left \| \frac{d X_\beta (t)}{d\beta} \right\|_{L^2} d\beta \\
% &amp;=&amp;  \int_{\beta_1}^{\beta_2} \sqrt{ \frac{1}{2\sqrt{\pi}} \int_{-4}^4 \frac{1}{\sqrt{\pi}} \exp\{-(t-\beta)^2\}(t-\beta)^2 dt  }  \  d\beta \\
&amp;=&amp;   \int_{\beta_1}^{\beta_2} \sqrt{ \frac{1}{2\sqrt{\pi}} \int_{-4}^4(t-\beta)^2 f(t) dt  }  \  d\beta   \\
&amp;\approx&amp;  \int_{\beta_1}^{\beta_2}  \sqrt{ \frac{1}{2\sqrt{\pi}} \frac{1}{2}} \ d\beta \\
&amp;=&amp; (\beta_2-\beta_1) \frac{1}{2\pi^{1/4}},
\end{eqnarray*}\]</span>
<p>where <span class="math inline">\(f\)</span> is the density of a N<span class="math inline">\((\beta,1/2)\)</span> and the approximation in the next to last step comes from integrating on <span class="math inline">\([a,b]=[-4,4]\)</span> rather than <span class="math inline">\({\mathbb{R}}\)</span>. We can see this manifold is isometric, since the geodesic distance between <span class="math inline">\(X_{\beta_1}\)</span> and <span class="math inline">\(X_{\beta_2}\)</span> in <span class="math inline">\({\mathcal{M}}\)</span> is proportional to <span class="math inline">\(\beta_2-\beta_1\)</span>.</p>
</div>
<div id="functional-manifold-of-square-root-velocity-functions" class="section level2">
<h2>Functional manifold of square root velocity functions</h2>
<!-- Scenario 4 in sim_functional_data -->
<p>It was shown in  that the square root representation of probability density functions gives rise to a simple closed-form geodesic distance. They consider the manifold <span class="math display">\[ {\mathcal{M}}= \{ \psi:[0,1] \to {\mathbb{R}}: \psi \ge 0, \int_0^1 \psi^2(s) \,ds = 1 \},\]</span> with the Fisher-Rao metric tensor <span class="math display">\[ &lt;v_1,v_2&gt; = \int_0^1 v_1(s) v_2(s) \,ds ,\]</span> for two tangent vectors <span class="math inline">\(v_1,v_2 \in T_\psi({\mathcal{M}})\)</span>. Note that this coincides with the <span class="math inline">\(L^2[0,1]\)</span> inner product. It was shown in  that the geodesic distance between any two <span class="math inline">\(\psi_1\)</span> and <span class="math inline">\(\psi_2\)</span> in <span class="math inline">\({\mathcal{M}}\)</span> is simply <span class="math display">\[d_{\mathcal{M}}(\psi_1,\psi_2) = \cos^{-1}&lt;\psi_1,\psi_2&gt;.\]</span></p>

<p>We will specifically examine the square root of <span class="math inline">\(Beta(\alpha,\beta)\)</span> densities, which are of course supported on <span class="math inline">\([0,1]\)</span>. That is, consider <span class="math display">\[ {\mathcal{M}}= \big\{ \sqrt{\psi_{\alpha,\beta}}: 1 \le \alpha \le 5, 2 \le \beta \le 5\big\}, \]</span> where <span class="math inline">\(\psi_{\alpha,\beta}: [0,1] \to {\mathbb{R}}\)</span> is the probability density function of a Beta random variable with shape parameters <span class="math inline">\(\alpha\)</span> and <span class="math inline">\(\beta\)</span>. We sample <span class="math inline">\(\alpha\)</span> according to a truncated normal with mean 3 and variance 0.09 with support on <span class="math inline">\([1,5]\)</span>. We sample <span class="math inline">\(\beta\)</span> according to a truncated normal with mean 3.5 and variance 0.09 with support on <span class="math inline">\([2,5]\)</span>.</p>
</div>
<div id="functional-manifold-of-warped-functions" class="section level2">
<h2>Functional manifold of warped functions</h2>
<!-- Scenario 5 in sim_functional_data -->

<p>Let <span class="math inline">\(X_\alpha(t) = \mu(h_\alpha(t))\)</span> be defined on <span class="math inline">\([-3,3]\)</span> where <span class="math display">\[ \mu(t) = \exp\{(t-1.5)^2/2\}+\exp\{(t+1.5)^2/2\},\]</span> and <span class="math display">\[ h_\alpha(t) = 6\frac{\exp\{\alpha(t+3)/6\}-1}{\exp\{\alpha\}-1},\]</span> if <span class="math inline">\(\alpha \ne 0\)</span> and <span class="math inline">\(h_\alpha(t) = t\)</span> otherwise.</p>
<p>Consider the manifold <span class="math display">\[ {\mathcal{M}}= \{X_\alpha: -1 \le \alpha \le 1\}.\]</span> We sample <span class="math inline">\(\alpha\)</span> according to a truncated standard normal with support on <span class="math inline">\([-1,1]\)</span>. This manifold is based on Equations 17 and 18 in  but with <span class="math inline">\(z_{i1}, z_{i2}\)</span> therein both set to <span class="math inline">\(1\)</span>. (Note that Equation 17 therein has a typo where the exponentials are missing negative signs.)</p>
<p>The geodesic distance is <span class="math display">\[ d_{\mathcal{M}}(X_{\alpha_1},X_{\alpha_2}) = \int_{\alpha_1}^{\alpha_2} \left \| \frac{d X_\alpha (t)}{d\alpha} \right\|_{L^2} d\alpha.\]</span> We estimate this using numerical integration.</p>
</div>
<div id="results" class="section level2">
<h2>Results</h2>
<p>For each manifold <span class="math inline">\({\mathcal{M}}\)</span> described above, we simulate 100 Monte Carlo samples of <span class="math inline">\(100\)</span> functions <span class="math inline">\(X_i\in{\mathcal{M}}\)</span>. We then construct vectors <span class="math inline">\(Y_i\in {\mathbb{R}}^K\)</span> such that <span class="math inline">\(Y_{ij}=X_i(t_j) + \epsilon_{ij}\)</span> with <span class="math inline">\(T=(t_1,\ldots,t_K)\)</span> a common grid of equally spaced points and <span class="math inline">\(\epsilon_{ij}\sim N(0,\sigma_\epsilon^2)\)</span>. The tuning parameters for the proposed method, <span class="math inline">\(s\)</span> and <span class="math inline">\(h\)</span>, were chosen as described at the end of the methodology section. In step 1 of our procedure, we take <span class="math inline">\(D = 100\)</span> for the size of the common time grid. We will investigate how two parameters affect the performance of the various methods in consideration. The first one is the signal-to-noise ratio <span class="math display">\[SNR = 10 log_{10}\left(\frac{\sigma^2_{sig}}{\sigma^2_\epsilon}\right),\]</span> where <span class="math inline">\(\sigma^2_{sig}\)</span> is the variance of the signal. We fix SNR at either <span class="math inline">\(0.1\)</span> (low) or <span class="math inline">\(0.5\)</span> (high). The second parameter is <span class="math inline">\(K\)</span>, the dimension of the discretely-observed noisy <span class="math inline">\(Y_i\)</span>’s. We set <span class="math inline">\(K\)</span> to either <span class="math inline">\(30\)</span> (sparse) or <span class="math inline">\(100\)</span> (abundant).</p>
<p>We will present the results for the high signal-to-noise and sparse <span class="math inline">\(K\)</span> setting. We show the performance of the Spline IsoGeo and Spline L2 method versus our method for three dimensions <span class="math inline">\(s=1,2,3\)</span>. We can see from Figures  and  that spline smoothing followed by either IsoGeo or the <span class="math inline">\(L^2\)</span> distance estimates the pairwise geodesic distances reasonably well though not as well as the proposed method for <span class="math inline">\(s=2\)</span> and <span class="math inline">\(s=3\)</span>. Notably, when <span class="math inline">\(s\)</span> in our method is not selected properly, for instance when <span class="math inline">\(s=1\)</span>, our estimator performs poorly in terms of the relative Frobenius norm although still better than Spline IsoGeo and Spline L2 in the other two metrics. It may be possible that for certain downstream tasks such as clustering or classification, precise estimation of the geodesic distance is less important than one that respects the ordering of the distances. In such cases, the near-isometry and Pearson correlation metrics may give a better indication of downstream performance of the geodesic distance estimator.</p>
<p>The result of the square root velocity manifold scenario is presented in Figure . The Spline L2 method has similar performance to our proposed method. This could be due to the fact that we did not sample the manifold broadly enough so that the sample lives in a locally flat region where the <span class="math inline">\(L^2\)</span> distance holds.</p>


<!--

## The cost of employing proposed method when manifold is flat [Susan/Marie?]

Suppose we take scenario 1 where the geodesic distance coincides with the $L^2$ distance. Then what is the cost of employing our distance estimator compared to smoothing the data and doing numerical integration to find the $L^2$ distance?

-->
<!-- # Distance-based functional classification [Marie] -->
<!-- In this section, we explore whether our geodesic distance estimator has benefits for downstream analysis task. There are many tasks we could consider here such as distance-based nonparametric regression and distanced-based functional clustering, but we will focus on distance-based functional classification. It must be noted that while curve alignment, also known as curve registration, is necessarily performed as a preprocessing technique prior to clustering and classification, our geodesic distance estimator allows one to forsake this step.  -->
<!-- For simplicity, assume the task is binary classification. Associated to each functional object $x$ is a binary $y$ indicating class membership. -->
<!-- Consider the classifier proposed in \cite{Ferraty2006} which is a functional version of the Nadaraya-Watson kernel estimator of class membership probabilities: -->
<!-- $$ -->
<!-- \hat p(y = 0 | x) \frac{ \sum_{i=1}^n K[h^{-1} d(x,x_i)] 1(y_i = 0) }{ \sum_{i=1}^n K[h^{-1} d(x,x_i)] } -->
<!-- $$ -->
<!-- We shall compare our method to using $L^2$ distance, possibly weighted, and with curve registration already accomplished. Describe alternative methods in detail.  -->
<!-- The bandwidth in the classifier should be tuned individually for each method. Also we might need to tune MDS dimension $s$ since in real data, the dimension of the manifold might be much higher than encountered in the simulation scenarios where it never goes above 2.  -->
<!-- Datasets used by functional classification papers -->
<!-- \begin{itemize} -->
<!-- \item Wheat, rainfall and phoneme in Aurore's paper "Achieving near-perfect classification for functional data" -->
<!-- \item Berkeley growth curves in \cite{ChenReiss2014}. -->
<!-- \item Tecator and phoneme in \cite{Galeano2015} Mahalanobis technometrics paper. -->
<!-- \item yeast cell cycle gene expression (can't find this publicly) in \cite{LengMuller2005} "Classification using functional data analysis for temporal gene expression data" -->
<!-- \end{itemize} -->
<!-- Datasets used in functional manifold papers -->
<!-- \begin{itemize} -->
<!-- \item Berkeley growth, yeast cell cycle gene expression (can't find this publicly) in \cite{ChenMuller2012} -->
<!-- \item Tecator in \cite{LinYao2017} contamination paper -->
<!-- \item Berkeley growth, gait cycle in \cite{Dimeglio2014} robust isomap paper -->
<!-- \end{itemize} -->
<!-- ## Distance-based functional clustering -->
<!-- Describe the $k$-medoids clustering method. -->
<!-- ### Simulated data -->
<!-- ###  Real data -->
<!-- # Acknowledgement -->
<!-- The authors would like to acknowledge support from the ARC Centre of Excellence for Mathematical & Statistical Frontiers (ACEMS) which enabled the two authors to work on the project at the University of Melbourne.  -->
<!-- # Appendices -->
<!-- The result of the square root velocity manifold scenario is presented in Figure \ref{fig:betas}. The Spline L2 method has similar performance to our proposed method. This could be due to the fact that we did not sample the manifold broadly enough so that the sample lives in a locally flat region where the $L^2$ distance holds.  -->
<!-- \begin{figure} -->
<!-- \includegraphics[height=0.3\textheight]{sim_results/sce=4_SNR=high_Kobs=30_MSE} -->
<!-- \includegraphics[height=0.3\textheight]{sim_results/sce=4_SNR=high_Kobs=30_isometry} -->
<!-- \includegraphics[height=0.3\textheight]{sim_results/sce=4_SNR=high_Kobs=30_Pearson} -->
<!-- \label{fig:betas} -->
<!-- \caption{Manifold of square root Beta densities. Besides Spline IsoGeo, all methods perform comparably well.} -->
<!-- \end{figure} -->
</div>
</div>


</div>

<script>

// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
  $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>