TADKit - Interactive Anomaly Detection demonstrator

Loading use case

This is where you import your data. Here we simply generate some synthetic timeseries. The data can use either the synchronous format, where each timeseries is a column whose name is its label, or the asynchronous format, with two columns (“sensor”, “data”) where “sensor” holds the timeseries labels and “data” their corresponding values.

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import matplotlib.pyplot as plt; plt.rcParams['figure.figsize'] = [11, 5]
import numpy as np
import pandas as pd

from tadkit.utils.synthetic_ornstein_uhlenbeck import synthetise_ornstein_uhlenbeck_data

# Generate the synthetic use case: a 20-column dataframe X plus a companion
# series y (plotted next to X below; presumably the anomaly ground truth --
# confirm against synthetise_ornstein_uhlenbeck_data).
X, y = synthetise_ornstein_uhlenbeck_data(n_rows=10000, n_cols_x=20)
display(X)

# Asynchronous layout: unpivot X so that each row holds one (sensor, data)
# pair while keeping the original index.
sensor_column_names = {"variable": "sensor", "value": "data"}
async_X = X.melt(
    value_vars=X.columns.tolist(),
    ignore_index=False,
).rename(columns=sensor_column_names)
display(async_X)

# Last expression of the cell: plot all columns of X together with y.
pd.concat([X, y], axis=1).plot()
X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19
2021-01-01 00:00:00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2021-01-01 01:00:00 0.001661 0.007820 0.008523 -0.007071 -0.009317 0.008867 -0.002218 0.003817 -0.007726 0.008630 -0.002813 -0.009319 -0.004985 0.007643 0.001915 -0.006188 0.016657 0.017382 0.011822 0.011194
2021-01-01 02:00:00 0.008823 -0.010364 0.019886 -0.007703 -0.007919 0.002534 -0.008656 0.010696 -0.018860 0.009704 -0.013080 -0.013086 0.008871 0.004319 -0.010231 -0.009694 0.017622 0.026136 0.014752 0.005832
2021-01-01 03:00:00 -0.002155 -0.019065 0.011051 -0.003709 -0.007369 -0.005963 0.001210 0.013652 -0.001756 0.005926 -0.015816 -0.024475 0.004826 -0.004678 -0.014786 -0.007505 0.024207 0.014618 0.013044 -0.004295
2021-01-01 04:00:00 0.013071 -0.016399 -0.006838 0.002436 0.000271 -0.016800 0.013201 -0.000847 0.009786 0.001089 -0.022053 -0.047641 -0.001618 -0.011110 -0.010734 -0.026655 0.029129 0.024849 0.024760 -0.006651
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-02-21 11:00:00 0.480157 1.036571 0.079168 -1.287169 2.634510 -0.210555 -0.207754 2.790915 1.994599 1.263292 -0.291272 0.304258 0.506799 0.362485 2.633189 0.370203 -1.510844 1.593467 -1.383251 0.287831
2022-02-21 12:00:00 0.472759 1.028854 0.078058 -1.292116 2.634895 -0.217651 -0.222453 2.803378 2.005869 1.255607 -0.292545 0.306667 0.504882 0.369449 2.632624 0.357541 -1.516517 1.592511 -1.402227 0.300942
2022-02-21 13:00:00 0.484448 1.035954 0.077118 -1.286552 2.640465 -0.218561 -0.227952 2.794754 2.003116 1.252523 -0.297669 0.301481 0.492823 0.380544 2.612742 0.354779 -1.513238 1.600206 -1.388866 0.297647
2022-02-21 14:00:00 0.478087 1.037739 0.066010 -1.273569 2.644860 -0.223948 -0.225738 2.787982 2.008781 1.265143 -0.302747 0.285736 0.491451 0.382196 2.613877 0.356197 -1.517774 1.585939 -1.385370 0.288772
2022-02-21 15:00:00 0.474328 1.052743 0.068460 -1.276385 2.637026 -0.225501 -0.232017 2.779689 2.006385 1.254136 -0.304361 0.281531 0.502489 0.386042 2.608148 0.352264 -1.510109 1.595649 -1.392864 0.284664

10000 rows × 20 columns

sensor data
2021-01-01 00:00:00 X0 0.000000
2021-01-01 01:00:00 X0 0.001661
2021-01-01 02:00:00 X0 0.008823
2021-01-01 03:00:00 X0 -0.002155
2021-01-01 04:00:00 X0 0.013071
... ... ...
2022-02-21 11:00:00 X19 0.287831
2022-02-21 12:00:00 X19 0.300942
2022-02-21 13:00:00 X19 0.297647
2022-02-21 14:00:00 X19 0.288772
2022-02-21 15:00:00 X19 0.284664

200000 rows × 2 columns

<Axes: >
../../_images/2c189d20fd60e934b6f36a719cba54dcb2749e4e3be5adf7a02bd4a5e427a2fa.png

Formalizer: given a use case, define how to formalize the dataset for ML

Select a Formalizer: a tadkit-provided object adapted to the type of use case (synchronous or asynchronous) you want to learn from.

from tadkit.catalog.formalizers import PandasFormalizer
from tadkit.utils.widget import select_formalizer

# One formalizer per supported dataframe layout, keyed "<layout>_formalizer".
formalizers = {
    f"{layout}_formalizer": PandasFormalizer(data_df=frame, dataframe_type=layout)
    for layout, frame in (("synchronous", X), ("asynchronous", async_X))
}

# Report the properties each formalizer exposes. The `=` specifier echoes the
# expression text, so the loop variable names are part of the printed output.
for formalizer_name, formalizer in formalizers.items():
    print(f"{formalizer_name} has {formalizer.available_properties=}")

# Build the selector; as the last expression of the cell it is what the
# notebook displays.
formalizer_selector = select_formalizer(formalizers)
formalizer_selector
[TADKit-Catalog]
Class learner_name='cnndrad' is registered in TADKit.
cnndrad returns err=ModuleNotFoundError("No module named 'cnndrad'").
Class learner_name='sbad' is registered in TADKit.
sbad returns err=ModuleNotFoundError("No module named 'sbad_fnn'").
Class learner_name='kcpd' is registered in TADKit.
kcpd returns err=ModuleNotFoundError("No module named 'kcpdi'").
Class learner_name='tdaad' is registered in TADKit.
tdaad returns err=ModuleNotFoundError("No module named 'tdaad'").
Class learner_name='isolation-forest' is registered in TADKit.
isolation-forest is operational in this environment.
isolation-forest is implicit child of TADLearner.
Class learner_name='kernel-density' is registered in TADKit.
kernel-density is operational in this environment.
kernel-density is implicit child of TADLearner.
Class learner_name='scaled-kernel-density' is registered in TADKit.
scaled-kernel-density is operational in this environment.
scaled-kernel-density is implicit child of TADLearner.
synchronous_formalizer has formalizer.available_properties=['pandas', 'fixed_time_step']
asynchronous_formalizer has formalizer.available_properties=['pandas', 'fixed_time_step']

This lets you select a training interval, a resampling resolution and the set of target sensors for learning.

from tadkit.utils.widget import query_widget_selection

# Read the formalizer picked in the selector, then build the query controls
# from that formalizer's own query description.
formalizer_name = formalizer_selector.value
selected_query_description = formalizers[formalizer_name].query_description
query_dict = query_widget_selection(formalizer_name, selected_query_description)
'synchronous_formalizer'
# Snapshot the current value of every query control into plain keyword
# arguments for the formalizer.
query_translated = {name: control.value for name, control in query_dict.items()}
active_formalizer = formalizers[formalizer_name]

# Training set: formalized with the full query, including "target_period".
X_train = active_formalizer.formalize(**query_translated)

# Test set: same query without "target_period" (presumably this yields the
# whole series -- confirm against the formalizer's documentation).
del query_translated["target_period"]
X_test = active_formalizer.formalize(**query_translated)

# Plot the test data and shade the span covered by the training index.
ax = X_test.plot()
train_start, train_end = X_train.index[0], X_train.index[-1]
ax.axvspan(train_start, train_end, alpha=0.2)
plt.show()
../../_images/e856e5cf67c96b0260642996c6c2438a307f52264685b7467b7a597a3cf71744.png

The dataset is now formalized and ready for learning.

TADLearner: detect anomalies

from tadkit.catalog.learners import installed_learner_classes

Select a set of learners that are compatible with the formalizing.

from tadkit.utils.widget import select_matching_available_learners

# Offer the installed learner classes for selection, given the currently
# selected formalizer; the chosen names are read later through `.value`.
matching_available_learners = select_matching_available_learners(
    formalizers[formalizer_name],
    installed_learner_classes,
)

Finally, set the parameters of the chosen learners.

from tadkit.utils.widget import parameter_widget_selection

# For every learner picked above, build its parameter controls from the
# learner class's own parameter description. Result: learner name -> controls.
learners_params = {
    chosen_name: parameter_widget_selection(
        tad_object_name=chosen_name,
        params_description=installed_learner_classes[chosen_name].params_description,
    )
    for chosen_name in matching_available_learners.value
}
  >>> Parameters for learner isolation-forest
  >>> Parameters for learner kernel-density
  >>> Parameters for learner scaled-kernel-density
# Instantiate every selected learner with the values currently held by its
# parameter controls. Result: learner name -> learner instance.
#
# The class is looked up by subscript rather than `.get()`: an unknown name
# then raises a KeyError naming the learner instead of an opaque
# "'NoneType' object is not callable", and the lookup matches the one used
# when the parameter controls were built.
learners = {
    learner_class_name: installed_learner_classes[learner_class_name](
        **{param: selection.value for param, selection in params_dict.items()}
    )
    for learner_class_name, params_dict in learners_params.items()
}
# Last expression of the cell: show the instantiated learners.
learners
{'isolation-forest': IsolationForest(n_estimators=10),
 'kernel-density': KernelDensity(kernel='epanechnikov'),
 'scaled-kernel-density': ScaledKernelDensityLearner(scaling='quantile_normal')}

Fit on the training query, then score on the entire series.

# One anomaly-score column per learner, aligned on the test index.
anomalies = pd.DataFrame(index=X_test.index)

# Fit loop: every learner is trained on the training query only.
for learner_name, learner_object in learners.items():
    print(f"Fitting {learner_name}")
    learner_object.fit(X_train)

# Score loop: every fitted learner scores the whole test set; the column is
# labelled with the learner's string representation.
for learner_name, fitted_learner_object in learners.items():
    print(f"Scoring with {learner_name}")
    anom_score = fitted_learner_object.score_samples(X_test)
    anomalies[str(fitted_learner_object)] = anom_score

# One subplot per score column plus y, each with the training span shaded.
axes = pd.concat([anomalies, y], axis=1).plot(subplots=True)
# Plain loop instead of a side-effect-only list comprehension; a dedicated
# loop name avoids rebinding the `ax` created when plotting the test data.
for subplot_ax in axes:
    subplot_ax.axvspan(X_train.index[0], X_train.index[-1], alpha=0.2)
plt.show()
Fitting isolation-forest
Fitting kernel-density
Fitting scaled-kernel-density
Scoring with isolation-forest
Scoring with kernel-density
Scoring with scaled-kernel-density
../../_images/90ffdf3113c675d9c36ede98a23a1aa8ac85b0dc2a7323f443e62597905c4fe9.png
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *
init_notebook_mode(connected=True)         # initiate notebook for offline plot

# Min-max rescale each score column independently so the learners' scores
# can be compared on one chart.
normalized_scores = anomalies.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

# Switch the pandas plotting backend to plotly only for this call. The
# context manager restores the previous backend even if plotting raises, so
# a failure here cannot leave later matplotlib cells running against plotly.
with pd.option_context("plotting.backend", "plotly"):
    fig = pd.concat([normalized_scores, y], axis=1).plot()

fig.update_layout(
    legend=dict(
        orientation="v",
        entrywidth=100,
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
    ),
    width=1000,
    height=600,
)
fig.show()