Coverage for tdaad/topological_embedding.py: 100%

22 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2025-06-13 13:45 +0000

1"""Topological Embedding Transformers.""" 

2 

3# Author: Martin Royer 

4 

5from sklearn.preprocessing import StandardScaler, FunctionTransformer 

6from sklearn.compose import ColumnTransformer 

7from sklearn.cluster import KMeans 

8 

9from gudhi.representations.vector_methods import Atol 

10 

11from tdaad.persistencediagram_transformer import PersistenceDiagramTransformer 

12from tdaad.utils.local_pipeline import LocalPipeline 

13from tdaad.utils.window_functions import sliding_window_ppl 

14 

15 

# Keep a reference to the original (unpatched) Atol.fit so the patched
# version defined below can delegate to it.
atol_vanilla_fit = Atol.fit

17 

18 

def local_atol_fit(self, X, y=None, sample_weight=None):
    """Patched ``Atol.fit`` tolerant to pandas containers.

    Local modification to prevent the FutureWarning triggered by
    ``np.concatenate(X)`` in the vanilla ``Atol.fit`` when ``X`` is a
    ``pd.Series``: the container is unwrapped to its underlying numpy
    values before delegating.

    Parameters
    ----------
    X : array-like or pd.Series
        Input measures to fit on; a pandas container is unwrapped via
        ``.values`` before delegation.
    y : Ignored or array-like
        Forwarded unchanged to the vanilla ``Atol.fit``.
    sample_weight : array-like or None
        Optional weights, forwarded unchanged to the vanilla ``Atol.fit``.

    Returns
    -------
    The value returned by the vanilla ``Atol.fit`` (the fitted estimator).
    """
    if hasattr(X, "values"):  # pd.Series / pd.DataFrame -> plain ndarray
        X = X.values
    # Fix: previously y and sample_weight were silently discarded here,
    # so weighted fits lost their weights when going through this patch.
    return atol_vanilla_fit(self=self, X=X, y=y, sample_weight=sample_weight)

24 

25 

# Monkeypatch: route every Atol.fit call through the pandas-safe wrapper above.
Atol.fit = local_atol_fit

27 

28 

class TopologicalEmbedding(LocalPipeline):
    """Topological embedding for multiple time series.

    The embedding slices the series into sliding windows, builds an affinity
    matrix per window, runs a Rips procedure on each matrix to obtain
    persistence diagrams, and vectorizes those diagrams with one Atol
    [ref:Atol] instance per homology dimension (through the
    gudhi.representation.Archipelago representation).

    Read more in the :ref:`User Guide <topological_embedding>`.

    Parameters
    ----------
    window_size : int, default=40
        Size of the sliding window algorithm to extract subsequences as input to named_pipeline.
    step : int, default=5
        Size of the sliding window steps between each window.
    n_centers_by_dim : int, default=5
        The number of centroids to generate by dimension for vectorizing topological features.
        The resulting embedding will have total dimension <= tda_max_dim * n_centers_by_dim.
        The resulting embedding dimension might be smaller because of the KMeans algorithm in the Archipelago step.
    tda_max_dim : int, default=2
        The maximum dimension of the topological feature extraction.

    Examples
    --------
    >>> n_timestamps = 100
    >>> n_sensors = 5
    >>> timestamps = pd.to_datetime('2024-01-01', utc=True) + pd.Timedelta(1, 'h') * np.arange(n_timestamps)
    >>> X = pd.DataFrame(np.random.random(size=(n_timestamps, n_sensors)), index=timestamps)
    >>> TopologicalEmbedding(n_centers_by_dim=2, tda_max_dim=1).fit_transform(X)
    """

    def __init__(
            self,
            window_size: int = 40,
            step: int = 5,
            tda_max_dim: int = 2,
            n_centers_by_dim: int = 5,
    ):
        self.window_size = window_size
        self.step = step
        self.tda_max_dim = tda_max_dim
        self.n_centers_by_dim = n_centers_by_dim

        # Shared diagram pipeline applied to every extracted window.
        diagram_pipeline = PersistenceDiagramTransformer(tda_max_dim=self.tda_max_dim)

        def _atol_column(dim):
            # One Atol vectorizer per homology dimension, each backed by its
            # own deterministic KMeans quantiser.
            quantiser = KMeans(
                n_clusters=self.n_centers_by_dim, random_state=202312, n_init="auto")
            return f"Atol{dim}", Atol(quantiser=quantiser), dim

        scaling = ("StandardScaler", StandardScaler())
        windowing = (
            "SlidingPersistenceDiagramTransformer",
            FunctionTransformer(
                func=sliding_window_ppl,
                kw_args={
                    "window_size": self.window_size,
                    "step": self.step,
                    "pipeline": diagram_pipeline,
                },
            ),
        )
        vectorizing = (
            "Archipelago",
            ColumnTransformer(
                [_atol_column(dim) for dim in range(self.tda_max_dim + 1)]),
        )

        super().__init__(steps=[scaling, windowing, vectorizing])
        super().set_output(transform="pandas")