import sys
from pprint import pprint

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [11, 5]

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Put the parent directory at the front of the import path. The membership test
# keeps the entry from being inserted again when this cell is re-run.
# Presumably this lets the `tadkit` imports below resolve from a source
# checkout -- confirm against the repository layout.
PROJECT_DIR = '..'
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)

from tadkit.base import TADLearner
from tadkit.catalog.formalizers import PandasFormalizer

Synthetic data simulation

from tadkit.utils.synthetic_ornstein_uhlenbeck import synthetise_ornstein_uhlenbeck_data

# Simulate 1000 rows with 5 feature columns. X holds the simulated signals;
# y is presumably the matching anomaly ground truth -- confirm in tadkit.
X, y = synthetise_ornstein_uhlenbeck_data(n_cols_x=5, n_rows=1000)

# Show every simulated column together with y on one figure.
pd.concat([X, y], axis="columns").plot()

<Axes: >

../../_images/ed979b81c7e40d9c69a81ea4902a4a243bd372ada9c5eb7b8ba8e273ece5175e.png

# Euclidean norm of each row of X (square root of the row-wise sum of squares),
# plotted next to y.
pd.concat([(X ** 2).sum(axis=1) ** 0.5, y], axis=1).plot()

<Axes: >

../../_images/cf30e551d0193be012e77cbfc8a8502cd4ceb5898d9b458250e8a770d713f742.png

Formalizer

# Wrap the simulated frame in a PandasFormalizer, declared as "synchronous" data.
formalizer = PandasFormalizer(data_df=X, dataframe_type="synchronous")

# Display the properties this formalizer advertises for the data it produces.
formalizer.available_properties

['pandas', 'fixed_time_step']

# Pretty-print the description of every query parameter (defaults, family, bounds).
pprint(formalizer.query_description)

{'resampling': {'default': False,
                'description': 'Resampling of the target query',
                'family': 'bool'},
 'resampling_resolution': {'default': 120,
                           'description': 'If resampling, resampling '
                                          'resolution in seconds.',
                           'family': 'time',
                           'start': 60,
                           'stop': 3600},
 'target_period': {'default': (Timestamp('2021-01-01 00:00:00'),
                               Timestamp('2021-02-11 15:00:00')),
                   'description': 'Time period for your query.',
                   'family': 'time_interval',
                   'start': Timestamp('2021-01-01 00:00:00'),
                   'stop': Timestamp('2021-02-11 15:00:00')},
 'target_space': {'default': Index(['X0', 'X1', 'X2', 'X3', 'X4'], dtype='object'),
                  'description': 'List of sensors used for your query.',
                  'family': 'space',
                  'set': Index(['X0', 'X1', 'X2', 'X3', 'X4'], dtype='object')}}

# Query the formalizer for all sensors, resampled at a 60-second resolution.
# NOTE(review): `columns_0` and `time_interval` are not among the keys listed in
# `formalizer.query_description` (`target_period`, `target_space`, `resampling`,
# `resampling_resolution`), and the frame returned below spans the whole default
# period with all five columns. These two arguments look ignored;
# `target_period` may be what was intended -- TODO confirm against the
# PandasFormalizer API.
formalizer.formalize(data=X, columns_0=['X1', 'X2'],
                     time_interval=['2021-01-01 02:00:00', '2021-02-02'],
                     target_space=X.columns,
                     resampling=True,
                     resampling_resolution=60)

	X0	X1	X2	X3	X4
2021-01-01 00:00:00	0.000000	0.000000	0.000000	0.000000	0.000000
2021-01-01 00:01:00	0.000088	0.000412	0.000449	-0.000373	-0.000491
2021-01-01 00:02:00	0.000175	0.000824	0.000898	-0.000745	-0.000982
2021-01-01 00:03:00	0.000263	0.001236	0.001348	-0.001118	-0.001473
2021-01-01 00:04:00	0.000350	0.001649	0.001797	-0.001491	-0.001964
...	...	...	...	...	...
2021-02-11 14:56:00	-0.584958	0.942160	0.571343	-0.897471	0.755402
2021-02-11 14:57:00	-0.585516	0.941845	0.571196	-0.897901	0.755519
2021-02-11 14:58:00	-0.586074	0.941530	0.571048	-0.898330	0.755637
2021-02-11 14:59:00	-0.586633	0.941214	0.570901	-0.898760	0.755755
2021-02-11 15:00:00	-0.587191	0.940899	0.570754	-0.899189	0.755873

59941 rows × 5 columns

Learners

from tadkit.utils.tadlearner_factory import tadlearner_factory

# The plain scikit-learn class object is tested against TADLearner first ...
print(isinstance(IsolationForest, TADLearner))

# ... then the class returned by the factory is tested the same way.
IsolationForestLearner = tadlearner_factory(IsolationForest, [], {})
print(isinstance(IsolationForestLearner, TADLearner))

False
True

from tadkit.utils.decomposable_tadlearner import decomposable_tadlearner_factory

# Compose a StandardScaler with the isolation-forest learner.
StandardForestLearner = decomposable_tadlearner_factory(
    StandardScaler,
    IsolationForestLearner,
    [],
    {},
)

learner = StandardForestLearner()
# NOTE(review): scikit-learn estimators only accept non-negative integer seeds;
# whether this attribute reaches the wrapped IsolationForest is not visible here.
learner.random_state = -1
learner.fit(X)

# The sign is flipped so that larger values read as "more anomalous"
# (presumably score_samples follows scikit-learn's lower-is-more-abnormal
# convention -- confirm in tadkit).
anom_score = -learner.score_samples(X)
iso_score = pd.DataFrame(
    anom_score,
    index=X.index,
    columns=["iso forest anomaly score"],
)

# Decision threshold on the anomaly score.
ceil = 0.55

# Keep the score where it exceeds the threshold and NaN elsewhere, under a
# separate column name so both series can be plotted together.
iso_anomalies = iso_score.where(iso_score > ceil).rename(
    columns={"iso forest anomaly score": "iso forest anomalies"}
)

pd.concat([iso_score, iso_anomalies, y], axis=1).plot()
plt.axhline(y=ceil, color='r', linestyle='-')
learner

StandardScalerIsolationForest()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

../../_images/534fb26c7f0faa10031bdb89eb9e08761182a7cc0d5b37666a27d259d054a97c.png

class ExponantialMean:
    """Exponential moving averages of each column at several time constants.

    For every column ``x`` of the input frame and every lag ``tau`` in
    ``lags``, :meth:`transform` evaluates the recurrence::

        m[0]     = x[0]
        m[i + 1] = w * m[i] + (1 - w) * x[i + 1],   w = exp(-(t[i+1] - t[i]) / tau)

    where ``t[i+1] - t[i]`` is the gap between consecutive index entries in
    seconds, so irregular sampling is handled.

    Parameters
    ----------
    lags : scalar or array-like
        Time constants, in seconds. Any shape with at most one axis of
        length different from 1 is accepted; it is flattened to 1-D.
    sub_X : bool, default False
        When true, the raw values are subtracted from every average, so the
        output is ``m - x`` instead of ``m``.

    Raises
    ------
    ValueError
        If ``lags`` has more than one axis whose length is not 1.
    """

    def __init__(self, lags, sub_X=False) -> None:
        lags = np.array(lags)
        # Explicit exception rather than `assert`, which is stripped under -O.
        if sum(dim != 1 for dim in lags.shape) > 1:
            raise ValueError(
                f"lags must be one-dimensional (up to singleton axes), got shape {lags.shape}"
            )
        self.lags = lags.reshape(-1)
        self.sub_X = sub_X

    def transform(self, X):
        """Return the exponential moving averages of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame. A datetime-like index (datetime64 / timedelta64) is
            read in its own resolution; any other index is cast to int64 and
            interpreted as nanoseconds.

        Returns
        -------
        pandas.DataFrame
            Same index as ``X``; one column ``f'{colname}_ema{i}'`` per input
            column and per lag, grouped by input column. An empty ``X`` gives
            an empty frame with these columns.
        """
        lags_len, = self.lags.shape
        index = X.index
        columns = [
            f'{colname}_ema{i}'
            for colname in X.columns
            for i in range(lags_len)
        ]
        values = X.values
        n, d = values.shape
        result = np.empty((n, d, lags_len))
        if n:
            # Gaps between consecutive samples, in seconds.
            stamps = index.values
            if stamps.dtype.kind in 'mM':
                # Unit-independent: valid whatever the datetime resolution is.
                gaps = np.diff(stamps) / np.timedelta64(1, 's')
            else:
                gaps = np.diff(np.asarray(index.astype('int64')) * 1e-9)
            # One decay factor per (step, lag), computed once outside the loop.
            weights = np.exp(-gaps[:, None] / self.lags)
            # The averages start at the first observation.
            result[0] = values[0, :, None]
            for i in range(n - 1):
                weight = weights[i]
                result[i + 1] = result[i] * weight + values[i + 1, :, None] * (1 - weight)
        if self.sub_X:
            result -= values[:, :, None]
        # Explicit width: reshape cannot infer -1 on an empty array.
        return pd.DataFrame(
            result.reshape((n, d * lags_len)),
            index=index,
            columns=columns,
        )

# Durations expressed in seconds.
MINUTE = 60
HOUR = MINUTE * 60
DAY = HOUR * 24

# One-day and three-day exponential means; with sub_X=True the raw values are
# subtracted from each average.
exponential_mean = ExponantialMean(lags=[DAY, 3 * DAY], sub_X=True)
exponential_mean.transform(X)

	X0_ema0	X0_ema1	X1_ema0	X1_ema1	X2_ema0	X2_ema1	X3_ema0	X3_ema1	X4_ema0	X4_ema1
2021-01-01 00:00:00	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
2021-01-01 01:00:00	-0.005038	-0.005180	-0.023719	-0.024387	-0.025852	-0.026580	0.021447	0.022051	0.028259	0.029055
2021-01-01 02:00:00	-0.031711	-0.032745	-0.048762	-0.050794	-0.001294	-0.002048	0.037732	0.039390	0.037896	0.039748
2021-01-01 03:00:00	-0.015200	-0.016647	-0.069807	-0.073777	-0.007043	-0.007985	0.054845	0.058025	-0.014293	-0.012868
2021-01-01 04:00:00	-0.067253	-0.070574	-0.048571	-0.053855	-0.062409	-0.065097	0.030707	0.034709	0.041473	0.044046
...	...	...	...	...	...	...	...	...	...	...
2021-02-11 11:00:00	-0.051206	-0.211574	0.117038	0.144954	-0.001470	0.090219	-0.091695	-0.115633	0.091504	0.073529
2021-02-11 12:00:00	-0.028717	-0.187681	0.081779	0.111612	0.058907	0.150990	-0.060468	-0.085780	0.081817	0.066394
2021-02-11 13:00:00	-0.007902	-0.164897	0.104148	0.136504	-0.004352	0.086338	-0.138787	-0.167659	0.096796	0.084312
2021-02-11 14:00:00	0.077915	-0.074720	0.158832	0.195215	0.010635	0.100374	-0.104210	-0.135619	0.077579	0.067452
2021-02-11 15:00:00	0.106862	-0.040657	0.170498	0.211182	0.018675	0.107702	-0.075242	-0.108337	0.067626	0.059545

1000 rows × 10 columns

# Compose the ExponantialMean transform with the isolation-forest learner. The
# last argument describes the transform's constructor parameters (presumably
# consumed by tadkit to expose them -- confirm in the factory).
# NOTE(review): 'preprossing' and the 'sub_X' description look like typos; they
# are left untouched because tadkit may match on these strings.
EMAForestLearner = decomposable_tadlearner_factory(ExponantialMean, IsolationForestLearner, [], {
    'lags': {
        'description': 'Lags typical time of exponential mean',
        'family': 'preprossing',
        'value_type': 'set',
        'start': 0,
    },
    'sub_X': {
        'description': 'Value substract from to the lags',
        'family': 'preprossing',
        'value_type': 'bool',
    },
})
# Ten time constants spread geometrically between 0.1 day and 2 days.
learner = EMAForestLearner(lags=np.geomspace(.1 * DAY, 2 * DAY, 10), sub_X=True)
# NOTE(review): scikit-learn estimators only accept non-negative integer seeds;
# whether this attribute reaches the wrapped IsolationForest is not visible here.
learner.random_state = -1
learner.fit(X)
anom_score = -learner.score_samples(X)
ema_score = pd.DataFrame(anom_score, index=X.index, columns=["ema iso forest anomaly score"])

ceil = .5

# Threshold the EMA score with a mask built from the EMA score itself. A boolean
# DataFrame mask is aligned on column labels, so a mask taken from a frame with
# a different column name would blank out every value.
ema_anomalies = ema_score[ema_score > ceil]
ema_anomalies = ema_anomalies.rename(
    columns={"ema iso forest anomaly score": "ema iso forest anomalies"}
)

learner

ExponantialMeanIsolationForest()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# Overlay the EMA-based score, the flagged values and y, then mark the
# threshold with a horizontal red line.
ema_overview = pd.concat([ema_score, ema_anomalies, y], axis=1)
ema_overview.plot()
plt.axhline(ceil, color='r', linestyle='-')

<matplotlib.lines.Line2D at 0x29528e1e630>

../../_images/72f498cc100e3a50b56f7d114cb1dfd428b60bb30e7f9c2c665114aba1d8bee9.png