Skip to content

Commit 1c0e8c3

Browse files
alexbarrosaquemy
authored andcommitted
feat: first version of the gap analysis tab for ts (#1410)
* feat: first version of the gap analysis tab for ts * feat: add gap stats table * fix: adjust the gap plot image size * feat: new gap analysis visualization
1 parent b860476 commit 1c0e8c3

File tree

5 files changed

+192
-7
lines changed

5 files changed

+192
-7
lines changed

src/ydata_profiling/model/pandas/describe_timeseries_pandas.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from statsmodels.tsa.stattools import adfuller
88

99
from ydata_profiling.config import Settings
10+
from ydata_profiling.model.pandas.utils_pandas import get_period_and_frequency
1011
from ydata_profiling.model.summary_algorithms import (
1112
describe_numeric_1d,
1213
describe_timeseries_1d,
@@ -141,6 +142,49 @@ def get_fft_peaks(
141142
return threshold, orig_peaks, peaks
142143

143144

145+
def compute_gap_stats(series: pd.Series) -> dict:
    """Compute the intervals between observations, normalized by a base unit.

    The sampling period is estimated from the full index of ``series``, while
    the intervals are measured between consecutive index values that hold a
    non-null observation. Every interval longer than the period is reported
    as a gap.

    Args:
        series (pd.Series): time series data to analyse.

    Returns:
        A dict with the keys ``period``, ``min``, ``max``, ``mean`` and ``std``
        (each divided by the base frequency), ``series`` (the input series),
        ``gaps`` (a list of two-element arrays with the index values that
        delimit each gap) and, only for a ``DatetimeIndex``, ``frequency``.
    """
    # Read the observed positions straight from the index. Going through
    # ``reset_index()[name]`` picks the data column instead of the index when
    # the series itself is named "index", and raises when the series name
    # equals the index name.
    gap = series.dropna().index.to_series().reset_index(drop=True)

    has_datetime_index = isinstance(series.index, pd.DatetimeIndex)
    if has_datetime_index:
        period, frequency = get_period_and_frequency(series.index)
        period = pd.Timedelta(f"{period} {frequency}")
        base_frequency = pd.Timedelta(f"1 {frequency}")
    else:
        period = np.abs(np.diff(series.index)).mean()
        base_frequency = 1

    diff = gap.diff()
    # ``gap`` carries a positional index, so the selected labels are
    # positions. The first diff is NaN and never selected, hence i >= 1.
    anchors = gap[diff > period].index
    gaps = [gap.iloc[[i - 1, i]].values for i in anchors]

    stats = {
        "period": period / base_frequency,
        "min": diff.min() / base_frequency,
        "max": diff.max() / base_frequency,
        "mean": diff.mean() / base_frequency,
        "std": diff.std() / base_frequency,
        "series": series,
        "gaps": gaps,
    }
    if has_datetime_index:
        stats["frequency"] = frequency
    return stats
186+
187+
144188
@describe_timeseries_1d.register
145189
@series_hashable
146190
@series_handle_nulls
@@ -164,5 +208,6 @@ def pandas_describe_timeseries_1d(
164208
stats["stationary"] = is_stationary and not stats["seasonal"]
165209
stats["addfuller"] = p_value
166210
stats["series"] = series
211+
stats["gap_stats"] = compute_gap_stats(series)
167212

168213
return config, series, stats

src/ydata_profiling/model/pandas/timeseries_index_pandas.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pandas.api.types import is_numeric_dtype
55

66
from ydata_profiling.config import Settings
7+
from ydata_profiling.model.pandas.utils_pandas import get_period_and_frequency
78
from ydata_profiling.model.timeseries_index import get_time_index_description
89

910

@@ -21,10 +22,7 @@ def pandas_get_time_index_description(
2122
start = df.index.min()
2223
end = df.index.max()
2324
if isinstance(df.index, pd.DatetimeIndex):
24-
freq = df.index.inferred_freq
25-
delta = abs(np.diff(df.index)).mean()
26-
delta = delta.astype(f"timedelta64[{df.index.inferred_freq}]")
27-
period = delta.astype(float)
25+
period, freq = get_period_and_frequency(df.index)
2826
else:
2927
freq = None
3028
period = abs(np.diff(df.index)).mean()

src/ydata_profiling/model/pandas/utils_pandas.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
from typing import Tuple
2+
13
import numpy as np
4+
import pandas as pd
25

36

47
def weighted_median(data: np.ndarray, weights: np.ndarray) -> int:
@@ -25,3 +28,21 @@ def weighted_median(data: np.ndarray, weights: np.ndarray) -> int:
2528
else:
2629
w_median = s_data[idx + 1]
2730
return w_median
31+
32+
33+
def get_period_and_frequency(index: pd.DatetimeIndex) -> Tuple[float, str]:
    """Estimate the sampling period of a datetime index and its unit.

    The period is the mean absolute difference between consecutive index
    values, expressed in the coarsest unit (days, seconds, microseconds or
    nanoseconds) for which the matching ``Timedelta`` component is non-zero.

    Args:
        index (pd.DatetimeIndex): the datetime index to inspect.

    Returns:
        A tuple ``(period, frequency)`` where ``period`` is the mean interval
        expressed in ``frequency`` units and ``frequency`` is one of "Days",
        "Seconds", "Microseconds" or "Nanoseconds".
    """
    delta = pd.Timedelta(abs(np.diff(index)).mean())
    if delta.days > 0:
        frequency, unit = "Days", pd.Timedelta(days=1)
    elif delta.seconds > 0:
        frequency, unit = "Seconds", pd.Timedelta(seconds=1)
    elif delta.microseconds > 0:
        frequency, unit = "Microseconds", pd.Timedelta(microseconds=1)
    else:
        frequency, unit = "Nanoseconds", pd.Timedelta(nanoseconds=1)
    # Divide the Timedelta itself: dividing the integer ``delta.nanoseconds``
    # by a Timedelta is not a supported operation and raises TypeError.
    return delta / unit, frequency

src/ydata_profiling/report/structure/variables/render_timeseries.py

Lines changed: 77 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,80 @@
1414
VariableInfo,
1515
)
1616
from ydata_profiling.report.structure.variables.render_common import render_common
17-
from ydata_profiling.visualisation.plot import histogram, mini_ts_plot, plot_acf_pacf
17+
from ydata_profiling.visualisation.plot import (
18+
histogram,
19+
mini_ts_plot,
20+
plot_acf_pacf,
21+
plot_timeseries_gap_analysis,
22+
)
23+
24+
25+
def _render_gap_tab(config: Settings, summary: dict) -> Container:
    """Build the "Gap analysis" tab of a time series variable.

    The tab is a grid holding a table with the interval statistics and an
    image with the gap plot.

    Args:
        config: report settings; the numeric precision, the HTML style and the
            plot image format are read from it.
        summary: variable summary; must provide ``varid`` and a ``gap_stats``
            mapping with the keys ``period``, ``min``, ``max``, ``mean``,
            ``std``, ``series``, ``gaps`` and optionally ``frequency``.

    Returns:
        The container with the statistics table and the gap plot.
    """
    stats = summary["gap_stats"]

    def _numeric_row(label: str, key: str) -> dict:
        # One table row with the statistic formatted at the report precision.
        return {
            "name": label,
            "value": fmt_numeric(stats[key], precision=config.report.precision),
        }

    gap_stats = [_numeric_row("period", "period")]
    # The frequency unit is only present in the stats when it was computed.
    if "frequency" in stats:
        gap_stats.append({"name": "frequency", "value": stats["frequency"]})
    gap_stats.extend(
        [
            _numeric_row("min interval", "min"),
            _numeric_row("max interval", "max"),
            _numeric_row("mean interval", "mean"),
            _numeric_row("interval std", "std"),
        ]
    )
    gap_table = Table(
        gap_stats,
        name="Intervals statistics",
        style=config.html.style,
    )

    gap_plot = Image(
        plot_timeseries_gap_analysis(config, stats["series"], stats["gaps"]),
        image_format=config.plot.image_format,
        alt="Gap plot",
        name="",
        anchor_id=f"{summary['varid']}_gap_plot",
    )
    return Container(
        [gap_table, gap_plot],
        image_format=config.plot.image_format,
        sequence_type="grid",
        name="Gap analysis",
        anchor_id=f"{summary['varid']}_gap_analysis",
    )
1891

1992

2093
def render_timeseries(config: Settings, summary: dict) -> dict:
@@ -289,8 +362,10 @@ def render_timeseries(config: Settings, summary: dict) -> dict:
289362
anchor_id=f"{varid}_ts_plot",
290363
)
291364

365+
ts_gap = _render_gap_tab(config, summary)
366+
292367
template_variables["bottom"] = Container(
293-
[statistics, hist, ts_plot, fq, evs, acf_pacf],
368+
[statistics, hist, ts_plot, ts_gap, fq, evs, acf_pacf],
294369
sequence_type="tabs",
295370
anchor_id=f"{varid}bottom",
296371
)

src/ydata_profiling/visualisation/plot.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from matplotlib.colors import Colormap, LinearSegmentedColormap, ListedColormap, rgb2hex
1212
from matplotlib.dates import AutoDateLocator, ConciseDateFormatter
1313
from matplotlib.patches import Patch
14-
from matplotlib.ticker import FuncFormatter
14+
from matplotlib.ticker import FuncFormatter, MaxNLocator
1515
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
1616
from typeguard import typechecked
1717
from wordcloud import WordCloud
@@ -557,6 +557,52 @@ def _format_ts_date_axis(
557557
return axis
558558

559559

560+
@manage_matplotlib_context()
def plot_timeseries_gap_analysis(
    config: Settings,
    series: Union[pd.Series, List[pd.Series]],
    gaps: Union[pd.Series, List[pd.Series]],
    figsize: tuple = (6, 3),
) -> matplotlib.figure.Figure:
    """Draw a time series line plot with its gaps highlighted as shaded bands.

    Args:
        config: report settings; the comparison colors and, when several
            series are given, their labels are read from it.
        series: the series to plot, or a list of series to overlay.
        gaps: the gap intervals to shade; when ``series`` is a list, one
            collection of intervals per series.
        figsize: The size of the figure (width, height) in inches,
            default (6, 3).

    Returns:
        The result of ``plot_360_n0sc0pe(config)`` for the current figure.
    """
    figure = plt.figure(figsize=figsize)
    axis = figure.add_subplot(111)

    palette = create_comparison_color_list(config)
    if not isinstance(series, list):
        series.plot(ax=axis)
        _format_ts_date_axis(series, axis)
        axis.yaxis.set_major_locator(MaxNLocator(integer=True))

        # Shade every gap over the full value range of the series.
        for interval in gaps:
            axis.fill_between(
                x=interval,
                y1=series.min(),
                y2=series.max(),
                color=palette[0],
                alpha=0.25,
            )
    else:
        # All bands share the value range spanned by every series.
        lower = min(s.min() for s in series)
        upper = max(s.max() for s in series)
        labels = config.html.style._labels
        for current, intervals, shade, label in zip(series, gaps, palette, labels):
            current.plot(
                ax=axis,
                label=label,
                color=shade,
                alpha=0.65,
            )
            _format_ts_date_axis(current, axis)
            axis.yaxis.set_major_locator(MaxNLocator(integer=True))
            for interval in intervals:
                axis.fill_between(
                    x=interval, y1=lower, y2=upper, color=shade, alpha=0.25
                )

    return plot_360_n0sc0pe(config)
604+
605+
560606
@manage_matplotlib_context()
561607
def plot_overview_timeseries(
562608
config: Settings,

0 commit comments

Comments
 (0)