Coverage for tdaad/anomaly_detectors.py: 84%

44 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2025-06-13 13:45 +0000

1"""Topological Anomaly Detectors.""" 

2 

3# Author: Martin Royer 

4 

5from typing import Sequence 

6from numbers import Integral 

7 

8import pandas as pd 

9 

10from sklearn.utils._param_validation import Interval 

11from sklearn.base import _fit_context, TransformerMixin 

12from tdaad.utils.remapping_functions import score_flat_fast_remapping 

13from tdaad.topological_embedding import TopologicalEmbedding 

14from tdaad.utils.local_elliptic_envelope import EllipticEnvelope 

15 

16 

class TopologicalAnomalyDetector(EllipticEnvelope, TransformerMixin):
    """Detect anomalies based on Topological Embedding and an EllipticEnvelope.

    This object analyzes multiple time series data through the following operations:

    - run a sliding window algorithm and represent each time series window with
      topological features, see :ref:`Topological Embedding <topological_embedding>`,

    - use a MinCovDet algorithm to robustly estimate the data mean and covariance in
      the embedding space, and use these to derive an embedding mahalanobis distance
      and associated outlier detection procedure,
      see :ref:`Elliptic Envelope <elliptic_envelope>`.

    After fitting, it is able to produce an anomaly score from a time series
    describing normal / abnormal time segments (the lower, the more abnormal).
    The predict method (inherited from EllipticEnvelope) allows to transform that
    score into binary normal / anomaly labels.

    Read more in the :ref:`User Guide <topological_anomaly_detection>`.

    Parameters
    ----------
    window_size : int, default=100
        Size of the sliding window algorithm to extract subsequences as input to
        named_pipeline. Must be at least 2.
    step : int, default=5
        Size of the sliding window steps between each window. Must be at least 1.
    tda_max_dim : int, default=2
        The maximum dimension of the topological feature extraction.
        Must be 0, 1 or 2.
    n_centers_by_dim : int, default=5
        The number of centroids to generate by dimension for vectorizing topological
        features. The resulting embedding will have total dimension
        <= tda_max_dim * n_centers_by_dim. The resulting embedding dimension might be
        smaller because of the KMeans algorithm in the Archipelago step.
    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. If None, the minimum value of support_fraction will
        be used within the algorithm: `[n_sample + n_features + 1] / 2`.
        Range is (0, 1).
    contamination : float, default=0.1
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Range is (0, 0.5]. Only matters for computing
        the decision function.
    random_state : int, RandomState instance or None, default=42
        Determines the pseudo random number generator for shuffling
        the data. Pass an int for reproducible results across multiple function
        calls.

    Attributes
    ----------
    topological_embedding_ : object
        TopologicalEmbedding transformer object that is fitted at `fit`.
    padding_length_ : int
        Number of trailing timestamps of the most recently scored input that are
        not covered by any sliding window. Updated on every scoring call.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> n_timestamps = 1000
    >>> n_sensors = 20
    >>> timestamps = pd.to_datetime('2024-01-01', utc=True) + pd.Timedelta(1, 'h') * np.arange(n_timestamps)
    >>> X = pd.DataFrame(np.random.random(size=(n_timestamps, n_sensors)), index=timestamps)
    >>> X.iloc[n_timestamps//2:,:10] = -X.iloc[n_timestamps//2:,10:20]
    >>> detector = TopologicalAnomalyDetector(n_centers_by_dim=2, tda_max_dim=1).fit(X)
    >>> anomaly_scores = detector.score_samples(X)
    >>> decision = detector.decision_function(X)
    >>> anomalies = detector.predict(X)
    """

    required_properties: Sequence[str] = ["multiple_time_series"]

    _parameter_constraints: dict = {
        **EllipticEnvelope._parameter_constraints,
        "window_size": [Interval(Integral, left=2, right=None, closed="left")],
        "step": [Interval(Integral, left=1, right=None, closed="left")],
        "tda_max_dim": [Interval(Integral, left=0, right=3, closed="left")],
        "n_centers_by_dim": [Interval(Integral, left=1, right=None, closed="left")],
    }

    def __init__(
        self,
        window_size: int = 100,
        step: int = 5,
        tda_max_dim: int = 2,
        n_centers_by_dim: int = 5,
        support_fraction: float = None,
        contamination: float = 0.1,
        random_state: int = 42,
    ):
        super().__init__(
            support_fraction=support_fraction,
            contamination=contamination,
            random_state=random_state,
        )
        self.window_size = window_size
        self.step = step
        self.tda_max_dim = tda_max_dim
        self.n_centers_by_dim = n_centers_by_dim

    @staticmethod
    def _as_indexed_frame(X):
        """Return X unchanged if it carries an index, else wrap it in a DataFrame.

        ``list`` and ``tuple`` expose a callable ``index`` *method*, so a plain
        ``hasattr(X, "index")`` check would wrongly treat them as indexed data.
        """
        if hasattr(X, "index") and not callable(X.index):
            return X
        # The default index is a RangeIndex over the rows, i.e. range(n_timestamps).
        return pd.DataFrame(data=X)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the TopologicalAnomalyDetector model.

        Args
        ----
        X : {array-like, sparse matrix} of shape (n_timestamps, n_sensors)
            Multiple time series to transform, where `n_timestamps` is the number of
            timestamps in the series X, and `n_sensors` is the number of sensors.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.

        Warns
        -----
        RuntimeWarning
            If the first envelope fit raises ``ValueError`` and the fit is retried
            with ``support_fraction=1``.
        """
        import warnings

        self.topological_embedding_ = TopologicalEmbedding(
            window_size=self.window_size,
            step=self.step,
            n_centers_by_dim=self.n_centers_by_dim,
            tda_max_dim=self.tda_max_dim,
        )
        embedding = self.topological_embedding_.fit_transform(self._as_indexed_frame(X))
        try:
            super().fit(embedding)
        except ValueError as e:
            warnings.warn(
                f"Envelope fit failed with {e!r}; retrying with support_fraction=1.",
                RuntimeWarning,
            )
            # Constructor parameters must survive `fit` untouched: override the value
            # only for the retry and always put the user's setting back afterwards.
            requested_support_fraction = self.support_fraction
            self.support_fraction = 1
            try:
                super().fit(embedding)
            finally:
                self.support_fraction = requested_support_fraction
        return self

    def _warped_score_samples(self, X, y=None):  # this exists to retrieve scores before remapping
        """Compute the negative Mahalanobis distances of the TopologicalEmbedding of X.

        Also records ``padding_length_``, the number of trailing timestamps of X left
        over once the sliding windows have been laid out.

        Args
        ----
        X : {array-like, sparse matrix} of shape (n_timestamps, n_sensors)
            Multiple time series to transform, where `n_timestamps` is the number of
            timestamps in the series X, and `n_sensors` is the number of sensors.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        negative_mahal_distances : array-like
            Opposite of the Mahalanobis distances, one value per row of the
            topological embedding of X.
        """
        import logging

        X = self._as_indexed_frame(X)

        n_timestamps = X.shape[0]
        last_window_index = (n_timestamps - self.window_size) // self.step
        covered_length = self.step * last_window_index + self.window_size
        self.padding_length_ = n_timestamps - covered_length
        logging.getLogger(__name__).debug(
            "n_timestamps=%s, window_size=%s, step=%s, padding_length_=%s",
            n_timestamps, self.window_size, self.step, self.padding_length_,
        )

        embedding = self.topological_embedding_.transform(X=X)
        return super().score_samples(embedding)

    def score_samples(self, X, y=None):
        """Compute anomaly scores for X, remapped from windows back to the series.

        The window-level negative Mahalanobis distances are computed first, then
        passed through ``score_flat_fast_remapping`` together with the window size,
        the step and the padding length of X.

        Args
        ----
        X : {array-like, sparse matrix} of shape (n_timestamps, n_sensors)
            Multiple time series to transform, where `n_timestamps` is the number of
            timestamps in the series X, and `n_sensors` is the number of sensors.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        unwarped_scores : array-like
            Remapped opposite of the Mahalanobis distances
            (the lower, the more abnormal).
        """
        warped_score_samples = self._warped_score_samples(X)
        return score_flat_fast_remapping(
            warped_score_samples,
            window_size=self.window_size,
            stride=self.step,
            padding_length=self.padding_length_,
        )


# Expose the remapped anomaly score under the transformer API name as well.
# Kept as a post-class assignment, exactly as in the original module.
TopologicalAnomalyDetector.transform = TopologicalAnomalyDetector.score_samples