Coverage for tdaad/topological_embedding.py: 100%

37 statements  


1"""Topological Embedding Transformers.""" 

2 

3# Author: Martin Royer 

4 

5from functools import partial 

6 

7from sklearn.base import BaseEstimator, TransformerMixin 

8from sklearn.pipeline import Pipeline 

9from sklearn.preprocessing import StandardScaler 

10from sklearn.preprocessing import FunctionTransformer 

11from sklearn.compose import ColumnTransformer 

12from sklearn.cluster import KMeans 

13 

14from gudhi.representations.vector_methods import Atol 

15 

16from tdaad.utils.tda_functions import transform_to_persistence_diagram 

17from tdaad.utils.window_functions import sliding_window_ppl_pp 

18 

19 

atol_vanilla_fit = Atol.fit


def local_atol_fit(self, X, y=None, sample_weight=None):
    """local modification to prevent FutureWarning triggered by np.concatenate(X) when X is a pd.Series."""
    if hasattr(X, "values"):
        X = X.values
    return atol_vanilla_fit(self=self, X=X)


Atol.fit = local_atol_fit
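
# Note (added commentary, not in the original source): the pd.Series case most likely
# arises from the "Archipelago" ColumnTransformer below -- when a single column of
# persistence diagrams is selected from a pandas DataFrame, Atol.fit receives it as a
# pd.Series, and ``X.values`` recovers the plain object array that np.concatenate expects.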

class TopologicalEmbedding(BaseEstimator, TransformerMixin):
    """Topological embedding for multiple time series.

    Slices the time series into smaller time series windows, forms an affinity matrix on
    each window and applies a Rips procedure to produce a persistence diagram for each
    affinity matrix. Then applies Atol [ref:Atol] to each homology dimension, through the
    "Archipelago" ColumnTransformer step, to produce a topological vectorization.

    Read more in the :ref:`User Guide <topological_embedding>`.

    Parameters
    ----------
    window_size : int, default=40
        Size of the sliding windows used to extract subsequences fed to the internal pipeline.
    step : int, default=5
        Step size between consecutive sliding windows.
    n_centers_by_dim : int, default=5
        Number of centroids to generate per homology dimension for vectorizing topological features.
        The resulting embedding has total dimension <= (tda_max_dim + 1) * n_centers_by_dim;
        it can be smaller because of the KMeans quantization in the Archipelago step.
    tda_max_dim : int, default=2
        Maximum homology dimension of the topological feature extraction.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> n_timestamps = 100
    >>> n_sensors = 5
    >>> timestamps = pd.to_datetime('2024-01-01', utc=True) + pd.Timedelta(1, 'h') * np.arange(n_timestamps)
    >>> X = pd.DataFrame(np.random.random(size=(n_timestamps, n_sensors)), index=timestamps)
    >>> TopologicalEmbedding(n_centers_by_dim=2, tda_max_dim=1).fit_transform(X)
    """

    def __init__(
        self,
        window_size: int = 40,
        step: int = 5,
        tda_max_dim: int = 2,
        n_centers_by_dim: int = 5,
    ):
        self.window_size = window_size
        self.step = step
        self.tda_max_dim = tda_max_dim
        self.n_centers_by_dim = n_centers_by_dim

    def _build_pipeline(self):
        steps = []
        steps.append(("Standard scaler", StandardScaler()))
        func = partial(transform_to_persistence_diagram, tda_max_dim=self.tda_max_dim)
        steps.append(
            (
                "Sliding persistence diagram transformer",
                FunctionTransformer(
                    func=sliding_window_ppl_pp,
                    kw_args={
                        "window_size": self.window_size,
                        "step": self.step,
                        "func": func,
                    },
                ),
            )
        )
        steps.append(
            (
                "Archipelago",
                ColumnTransformer(
                    [
                        (
                            f"Atol{i}",
                            Atol(
                                quantiser=KMeans(
                                    n_clusters=self.n_centers_by_dim,
                                    random_state=202312,
                                    n_init="auto",
                                )
                            ),
                            i,
                        )
                        for i in range(self.tda_max_dim + 1)
                    ]
                ),
            )
        )
        return Pipeline(steps).set_output(transform="pandas")
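
    # Note (added commentary, not in the original source): the sliding-window step is
    # expected to emit one column of persistence diagrams per homology dimension
    # (0 .. tda_max_dim); the Archipelago ColumnTransformer then routes column ``i`` to
    # the i-th Atol vectorizer, so the embedding stacks one block of centers per dimension.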

    def fit(self, X, y=None):
        """
        Fit the internal pipeline to the data.

        Parameters
        ----------
        X : pandas.DataFrame
            Input feature matrix.

        y : array-like, optional
            Target values (not used here, but accepted for compatibility with sklearn).

        Returns
        -------
        self : object
            Fitted transformer.
        """
        self.pipeline_ = self._build_pipeline()
        self.pipeline_.fit(X, y)
        return self

    def transform(self, X):
        """
        Apply transformations to the input data using the fitted pipeline.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data to transform.

        Returns
        -------
        X_transformed : array-like or DataFrame
            Transformed data.
        """
        return self.pipeline_.transform(X)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Returns
        -------
        X_transformed : array-like
        """
        return self.fit(X, y).transform(X)
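

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original module): builds a
# random multivariate time series and computes its topological embedding, mirroring
# the class doctest above. Assumes numpy and pandas are available.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    n_timestamps, n_sensors = 200, 5
    timestamps = pd.to_datetime("2024-01-01", utc=True) + pd.Timedelta(1, "h") * np.arange(n_timestamps)
    X = pd.DataFrame(np.random.random(size=(n_timestamps, n_sensors)), index=timestamps)

    embedding = TopologicalEmbedding(
        window_size=40, step=5, tda_max_dim=1, n_centers_by_dim=2
    ).fit_transform(X)
    # One row per sliding window, at most (tda_max_dim + 1) * n_centers_by_dim columns.
    print(embedding.shape)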