Synthetic data simulation

import numpy as np
import pandas as pd

from ouagen import ornstein_uhlenbeck_anomaly

# Normalised time grid: 1000 evenly spaced values in [0, 1).
t = np.arange(1000)
t = t / t.shape[0]
# Draw the synthetic data from the project's generator. It returns two arrays,
# used below as the feature matrix (pre_X) and the target column (pre_y).
# NOTE(review): the meaning of each keyword is defined in `ouagen`, which is
# not visible here; confirm against its documentation before tuning.
pre_X, pre_y = ornstein_uhlenbeck_anomaly(
    t,
    size=20,
    noise_scale=1,
    mean_reverting=3,
    anomaly_freq=.1,
    anomaly_duration=0.005,
    anomaly_scale=40,
)
# From here on `t` is rebound to an hourly timestamp index starting at
# 2021-01-01, with one timestamp per simulated step.
t = pd.to_datetime('2021-01-01') + pd.Timedelta(1, 'h') * np.arange(t.shape[0])
# Wide layout: one column per simulated series, named X0, X1, ...
X = pd.DataFrame(pre_X, index=t, columns=[f'X{i}' for i in range(pre_X.shape[1])])
# Target as a single-column frame ('y') on the same timestamp index.
y = pd.DataFrame(pre_y[:, None], index=t, columns=['y'])
display(X)
# Long layout: one row per (timestamp, sensor) pair with columns
# 'sensor' and 'data'; the timestamps are kept as the index.
async_X = X.melt(value_vars=X.columns.tolist(), ignore_index=False).rename(
    columns={"variable": "sensor", "value": "data"}
)
display(async_X)
# Plot all series together with the target column on one set of axes.
pd.concat([X, y], axis=1).plot()
X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19
2021-01-01 00:00:00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2021-01-01 01:00:00 0.005252 0.024728 0.026952 -0.022360 -0.029462 0.028039 -0.007014 0.012071 -0.024431 0.027290 -0.008894 -0.029470 -0.015765 0.024170 0.006056 -0.019567 0.052675 0.054966 0.037385 0.035398
2021-01-01 02:00:00 0.027888 -0.032840 0.062813 -0.024298 -0.024962 0.007937 -0.027353 0.033791 -0.059576 0.030615 -0.041340 -0.041302 0.028096 0.013593 -0.032369 -0.030601 0.055585 0.082502 0.046551 0.018346
2021-01-01 03:00:00 -0.006903 -0.060266 0.034705 -0.011602 -0.023157 -0.018953 0.003918 0.043048 -0.005327 0.018585 -0.049878 -0.077205 0.015227 -0.014894 -0.046688 -0.023597 0.076257 0.045855 0.041023 -0.013725
2021-01-01 04:00:00 0.041263 -0.051675 -0.021960 0.007861 0.001067 -0.053171 0.041826 -0.002919 0.031187 0.003237 -0.069468 -0.150257 -0.005190 -0.035196 -0.033748 -0.084091 0.091617 0.078086 0.077962 -0.021141
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-02-11 11:00:00 -0.045880 -0.710766 -0.097687 1.036003 0.674646 0.726933 0.529645 0.031269 0.890464 -0.861574 -0.490439 -0.239189 -0.129772 -0.522085 0.307080 0.145893 0.437884 -0.842048 0.190714 0.357574
2021-02-11 12:00:00 -0.042093 -0.749522 -0.106187 1.042480 0.636715 0.668605 0.487211 0.003060 0.909014 -0.847666 -0.548754 -0.219306 -0.158270 -0.566806 0.305682 0.200074 0.402115 -0.870675 0.148192 0.400314
2021-02-11 13:00:00 -0.039539 -0.816215 -0.103101 1.023325 0.670355 0.665394 0.430242 0.038435 0.844820 -0.885687 -0.529422 -0.166633 -0.132188 -0.563205 0.275267 0.167789 0.399349 -0.837403 0.114430 0.363303
2021-02-11 14:00:00 0.045796 -0.803112 -0.105205 1.049967 0.643303 0.664459 0.465075 0.069509 0.930086 -0.889746 -0.534425 -0.147937 -0.141524 -0.562967 0.289132 0.150164 0.399101 -0.806114 0.096359 0.326373
2021-02-11 15:00:00 -0.002016 -0.804371 -0.100574 1.069136 0.639176 0.651138 0.487456 0.093443 0.904328 -0.890001 -0.555217 -0.210177 -0.177874 -0.493351 0.298163 0.221578 0.377257 -0.785403 0.082574 0.288254

1000 rows × 20 columns

sensor data
2021-01-01 00:00:00 X0 0.000000
2021-01-01 01:00:00 X0 0.005252
2021-01-01 02:00:00 X0 0.027888
2021-01-01 03:00:00 X0 -0.006903
2021-01-01 04:00:00 X0 0.041263
... ... ...
2021-02-11 11:00:00 X19 0.357574
2021-02-11 12:00:00 X19 0.400314
2021-02-11 13:00:00 X19 0.363303
2021-02-11 14:00:00 X19 0.326373
2021-02-11 15:00:00 X19 0.288254

20000 rows × 2 columns

<Axes: >
../../_images/e810d12ddcbfbf0d7dd462d56081082f73670cc56efef7fc8c2016a285fb84aa.png
# Plot the row-wise Euclidean norm of X (square root of the sum of squared
# columns) alongside the target column y.
pd.concat([((X ** 2).sum(axis=1) ** .5), y], axis=1).plot()
<Axes: >
../../_images/7ebabb2c83bfff31380861656e5b1266d2099ab583dff68fa418d3063febc2f2.png
from tadkit.catalog.rawtowideformatter import RawToWideFormatter

# Wrap the wide frame X in the formatter, using the pandas backend.
# The optional `timestamps=X.index` argument is intentionally not passed.
formatter = RawToWideFormatter(data=X, backend="pandas")
# Last expression of the cell: shows the properties the formatter reports.
formatter.available_properties
['synchronous', 'pandas_backend', 'fixed_time_step']
# Show the formatter's query description: the parameters that can be passed
# to `formatter.format` below.
print(formatter.query_description)
{'target_period': {'description': 'Time period to select', 'family': 'time_interval', 'start': np.datetime64('2021-01-01T00:00:00.000000000'), 'stop': np.datetime64('2021-02-11T15:00:00.000000000'), 'default': (np.datetime64('2021-01-01T00:00:00.000000000'), np.datetime64('2021-02-11T15:00:00.000000000'))}, 'target_space': {'description': 'Columns or sensors to select', 'family': 'space', 'set': ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19'], 'default': ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19']}, 'resample': {'description': 'Resample data?', 'family': 'bool', 'default': False}, 'resample_freq': {'description': 'Resampling frequency in seconds.', 'family': 'time', 'start': 60, 'default': 120, 'stop': 3600}}
from query_selection import query_widget_selection

# Build the query inputs from the formatter's description. The helper returns
# a mapping whose values expose a `.value` attribute (read on the next line).
# NOTE(review): presumably these are interactive notebook widgets; confirm in
# `query_selection`.
query_dict = query_widget_selection(formatter.query_description)
# Collapse each entry to its current value so it can be passed as keyword
# arguments.
query_translated = {key: widget.value for key, widget in query_dict.items()}
# Switch the output backend before formatting; the result is shown as the
# cell's last expression.
formatter.backend = "numpy"
query = formatter.format(**query_translated)
query
array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00525208,  0.02472789,  0.02695162, ...,  0.05496634,
         0.03738528,  0.03539777],
       [ 0.0278881 , -0.03284016,  0.06281325, ...,  0.08250195,
         0.04655068,  0.01834615],
       ...,
       [-0.09361403, -0.88180157,  0.00947269, ..., -0.88462024,
         0.12293536,  0.69455948],
       [-0.07396143, -0.9167913 , -0.04366985, ..., -0.81078897,
         0.15956877,  0.69222348],
       [-0.08287074, -0.90335556, -0.11737827, ..., -0.84533462,
         0.08865186,  0.68593266]], shape=(985, 20))

Learners

from sklearn.ensemble import IsolationForest
from tadkit.base.tadlearner import TADLearner

# Check IsolationForest against the TADLearner interface.
# NOTE(review): the class object itself (not an instance) is passed to
# isinstance; confirm this is intended rather than
# `isinstance(IsolationForest(), TADLearner)` or an issubclass check.
isinstance(IsolationForest, TADLearner)
True
import matplotlib.pyplot as plt
# Fit an isolation forest on the wide data. score_samples is negated before
# use as the anomaly score.
learner = IsolationForest()
learner.fit(X)
anom_score = -learner.score_samples(X)

iso_score = pd.DataFrame(
    anom_score,
    index=X.index,
    columns=["iso forest anomaly score"],
)

# Detection threshold on the anomaly score.
ceil = .55

# Keep only the scores above the threshold (everything else becomes NaN) and
# give the column its own legend label.
iso_anomalies = iso_score.where(iso_score > ceil).rename(
    columns={"iso forest anomaly score": "iso forest anomalies"}
)

# Scores, flagged points and target on one plot, with the threshold as a red
# horizontal line.
pd.concat([iso_score, iso_anomalies, y], axis=1).plot()
plt.axhline(y=ceil, color='r', linestyle='-')
learner
IsolationForest()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
../../_images/a01789de1b7efe58b288fb6761ada2cce1b5ab64db3a53cc425ae93b1560742d.png
class ExponantialMean:
    """Exponential moving averages of every column, for several time scales.

    For each column of the input frame and each lag ``tau`` in ``lags`` the
    transform computes the recursion::

        ema[0]   = x[0]
        w        = exp(-(t[k+1] - t[k]) / tau)
        ema[k+1] = w * ema[k] + (1 - w) * x[k+1]

    where ``t`` is the frame's index expressed in seconds, so uneven time
    steps are weighted by their actual duration.

    The class name keeps its original spelling so existing callers keep
    working.

    Parameters
    ----------
    lags : array-like
        Time constants, in seconds. May have any shape with at most one
        dimension different from 1; it is flattened to 1-D.
    sub_X : bool, default False
        If true, the input value is subtracted from each average, so the
        output is ``ema - x`` instead of ``ema``.

    Raises
    ------
    ValueError
        If ``lags`` has more than one non-singleton dimension.
    """

    def __init__(self, lags, sub_X=False) -> None:
        lags = np.array(lags)
        # Explicit check instead of `assert`, which is stripped under `-O`
        # and would let a 2-D grid of lags be flattened silently.
        if sum(dim != 1 for dim in lags.shape) > 1:
            raise ValueError(
                "lags must have at most one non-singleton dimension, "
                f"got shape {lags.shape}"
            )
        self.lags = lags.reshape(-1)
        self.sub_X = sub_X

    def transform(self, X):
        """Return the exponential moving averages of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame. Its index supplies the time stamps: a
            ``DatetimeIndex`` of any resolution, or any index whose
            ``astype('int64')`` yields nanoseconds.

        Returns
        -------
        pandas.DataFrame
            Same index as ``X``, with one column ``'<col>_ema<i>'`` per input
            column and lag position ``i``, ordered column-major (all lags of
            the first column, then all lags of the second, ...). An empty
            input gives an empty frame with the same columns.
        """
        index = X.index
        n_lags = self.lags.shape[0]
        columns = [
            f'{colname}_ema{i}' for colname in X.columns for i in range(n_lags)
        ]
        values = X.to_numpy()
        n, d = values.shape

        # Integer nanosecond stamps. A DatetimeIndex is normalised to
        # nanoseconds first, because `astype('int64')` returns counts in the
        # index's own resolution (s/ms/us/ns).
        if isinstance(index, pd.DatetimeIndex):
            stamps = index.values.astype('datetime64[ns]').astype('int64')
        else:
            stamps = np.asarray(index.astype('int64'))
        # Difference the integers before scaling to seconds, so the step
        # lengths do not suffer float rounding of large epoch values.
        steps = np.diff(stamps) * 1e-9
        # Decay weight for every (step, lag) pair, computed once up front.
        weights = np.exp(-steps[:, None] / self.lags)

        result = np.empty((n, d, n_lags))
        if n:
            # Seed every average with the first observation.
            result[0] = values[0, :, None]
        for i in range(n - 1):
            w = weights[i]
            result[i + 1] = result[i] * w + values[i + 1, :, None] * (1 - w)
        if self.sub_X:
            result -= values[:, :, None]
        # Explicit width: `-1` cannot be inferred when there are zero rows.
        return pd.DataFrame(
            result.reshape((n, d * n_lags)),
            index=index,
            columns=columns,
        )
# Durations expressed in seconds, the unit ExponantialMean.transform uses for
# its time axis (it scales the int64 index by 1e-9 before dividing by lags).
MINUTE = 60
HOUR = 60 * MINUTE
DAY = 24 * HOUR
# Demo: one-day and three-day averages, with the input subtracted
# (sub_X=True), displayed as the cell's last expression.
exponential_mean = ExponantialMean([DAY, 3* DAY], sub_X=True)
exponential_mean.transform(X)
X0_ema0 X0_ema1 X1_ema0 X1_ema1 X2_ema0 X2_ema1 X3_ema0 X3_ema1 X4_ema0 X4_ema1 ... X15_ema0 X15_ema1 X16_ema0 X16_ema1 X17_ema0 X17_ema1 X18_ema0 X18_ema1 X19_ema0 X19_ema1
2021-01-01 00:00:00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2021-01-01 01:00:00 -0.005038 -0.005180 -0.023719 -0.024387 -0.025852 -0.026580 0.021447 0.022051 0.028259 0.029055 ... 0.018768 0.019297 -0.050525 -0.051948 -0.052723 -0.054208 -0.035860 -0.036870 -0.033953 -0.034910
2021-01-01 02:00:00 -0.026544 -0.027432 0.032468 0.032724 -0.059195 -0.061580 0.022432 0.023659 0.022790 0.024216 ... 0.028586 0.029913 -0.051255 -0.054102 -0.076983 -0.080616 -0.043187 -0.045400 -0.016212 -0.017612
2021-01-01 03:00:00 0.007910 0.007258 0.057449 0.059319 -0.029818 -0.033011 0.009338 0.010811 0.020129 0.022103 ... 0.020701 0.022593 -0.068992 -0.073743 -0.038690 -0.043363 -0.036123 -0.039322 0.015212 0.014260
2021-01-01 04:00:00 -0.038613 -0.040344 0.046865 0.050029 0.025752 0.023329 -0.009712 -0.008532 -0.003929 -0.002092 ... 0.077882 0.081941 -0.080909 -0.087874 -0.068027 -0.074551 -0.070080 -0.075210 0.021705 0.021378
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-02-11 11:00:00 0.036233 0.162953 -0.157976 -0.312027 -0.054504 -0.208378 0.062005 0.036372 0.167697 0.220710 ... 0.100072 0.190646 0.084005 0.003536 -0.137007 -0.136692 0.030745 0.084516 0.230806 0.086975
2021-02-11 12:00:00 0.031122 0.156971 -0.114354 -0.269502 -0.044127 -0.197121 0.053261 0.029482 0.197236 0.255073 ... 0.044017 0.134582 0.114886 0.038763 -0.103956 -0.106573 0.070277 0.125286 0.180390 0.043624
2021-02-11 13:00:00 0.027402 0.152287 -0.045716 -0.200011 -0.045285 -0.197445 0.069461 0.047966 0.156919 0.218379 ... 0.073188 0.164565 0.112851 0.040956 -0.131628 -0.137917 0.099793 0.156854 0.208530 0.079523
2021-02-11 14:00:00 -0.055569 0.066028 -0.056419 -0.210175 -0.041419 -0.192647 0.041071 0.021030 0.176463 0.242045 ... 0.087108 0.179678 0.108483 0.040636 -0.156268 -0.166872 0.113054 0.172512 0.235442 0.114847
2021-02-11 15:00:00 -0.007441 0.112270 -0.052909 -0.206035 -0.044171 -0.194557 0.021008 0.001835 0.173221 0.242778 ... 0.015053 0.106770 0.125008 0.061618 -0.169757 -0.184996 0.121663 0.183728 0.262397 0.150856

1000 rows × 40 columns

from tadkit.utils.decomposable_tadlearner import decomposable_tadlearner_factory

# Build a learner class from the ExponantialMean transform and IsolationForest.
# How the two are combined is defined by tadkit's factory, not visible here.
EMAForestLearner = decomposable_tadlearner_factory(ExponantialMean, IsolationForest)
# Ten geometrically spaced lags between 0.1 and 2 days; sub_X=True makes the
# features `ema - x` rather than the raw averages.
learner = EMAForestLearner(lags=np.geomspace(.1 * DAY, 2 * DAY, 10), sub_X=True)
# NOTE(review): it is not visible here whether this attribute reaches the
# inner IsolationForest; scikit-learn seeds are normally non-negative, so
# confirm that -1 is meaningful to the composed learner.
learner.random_state = -1
learner.fit(X)
# Negate score_samples before using it as the anomaly score.
anom_score = -learner.score_samples(X)
ema_score = pd.DataFrame(anom_score, index=X.index, columns=["ema iso forest anomaly score"])

# Detection threshold on the EMA-forest anomaly score.
ceil = .5

# Threshold the EMA scores with their own mask. A mask built from another
# frame has a different column label, so alignment would turn every entry
# into NaN (and would judge this model by the other model's scores).
ema_anomalies = ema_score[ema_score > ceil].rename(
    columns={"ema iso forest anomaly score": "ema iso forest anomalies"}
)

learner
<tadkit.utils.decomposable_tadlearner.ExponantialMeanIsolationForest at 0x221da1c5e80>
# Plot the EMA-forest scores, the flagged points and the target together,
# with the detection threshold drawn as a red horizontal line.
pd.concat([ema_score, ema_anomalies, y], axis=1).plot()
plt.axhline(y=ceil, color='r', linestyle='-')
<matplotlib.lines.Line2D at 0x221dba8c4a0>
../../_images/a153f0c00c2b55dfce166d472b2d8e51bcc00e6732a5d3b297d3a68f92a6d56d.png