Coverage for tadkit / catalog / sklearners.py: 97%

61 statements  

« prev     ^ index     » next       coverage.py v7.13.3, created at 2026-02-03 15:41 +0000

1from typing import Callable 

2 

3import numpy as np 

4 

5from sklearn.neighbors import KernelDensity 

6from sklearn.mixture import GaussianMixture 

7 

8from tadkit.base.basedensitydetector import BaseDensityOutlierDetector 

9 

10 

class KDEOutlierDetector(BaseDensityOutlierDetector):
    """
    Outlier detector whose density model is scikit-learn's ``KernelDensity``.

    Every keyword except ``contamination`` is stored on the instance as-is
    and handed unchanged to ``KernelDensity`` when the density is fitted.

    Parameters
    ----------
    bandwidth : float, default=1.0
        Bandwidth of the kernel.
    algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto'
        Tree algorithm used by the estimator.
    kernel : str, default='gaussian'
        Kernel name. Valid kernels are
        ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'].
    metric : str, default='euclidean'
        Distance metric.
    atol : float, default=0
        Desired absolute tolerance of the result.
    rtol : float, default=0
        Desired relative tolerance of the result.
    breadth_first : bool, default=True
        If true, use a breadth-first approach to the problem.
    leaf_size : int, default=40
        Leaf size passed to BallTree or KDTree.
    metric_params : dict, default=None
        Additional parameters for the metric function.
    contamination : float, default=0.1
        Proportion of outliers in the data set; forwarded to the base class.

    Attributes
    ----------
    kde_ : KernelDensity
        Estimator created and fitted by ``_fit_density``.
    """

    _parameter_constraints = KernelDensity._parameter_constraints.copy()

    def __init__(
        self,
        bandwidth=1.0,
        algorithm="auto",
        kernel="gaussian",
        metric="euclidean",
        atol=0,
        rtol=0,
        breadth_first=True,
        leaf_size=40,
        metric_params=None,
        contamination: float = 0.1,
    ):
        # The base class owns ``contamination``.
        super().__init__(contamination=contamination)

        # Hyper-parameters are kept verbatim under their own names; they are
        # only consumed later, when the KernelDensity is built.
        self.bandwidth = bandwidth
        self.algorithm = algorithm
        self.kernel = kernel
        self.metric = metric
        self.atol = atol
        self.rtol = rtol
        self.breadth_first = breadth_first
        self.leaf_size = leaf_size
        self.metric_params = metric_params

    def _fit_density(self, X: np.ndarray):
        """Build a ``KernelDensity`` from the stored settings and fit it on ``X``."""
        forwarded_names = (
            "bandwidth",
            "algorithm",
            "kernel",
            "metric",
            "atol",
            "rtol",
            "breadth_first",
            "leaf_size",
            "metric_params",
        )
        kde_kwargs = {name: getattr(self, name) for name in forwarded_names}
        self.kde_ = KernelDensity(**kde_kwargs)
        self.kde_.fit(X)

    def _score_density(self, X: np.ndarray) -> np.ndarray:
        """Return ``score_samples(X)`` computed by the fitted ``KernelDensity``."""
        fitted_kde = self.kde_
        return fitted_kde.score_samples(X)

84 

85 

class GMMOutlierDetector(BaseDensityOutlierDetector):
    """
    Outlier detector whose density model is scikit-learn's ``GaussianMixture``.

    Every keyword except ``contamination`` is stored on the instance as-is
    and handed unchanged to ``GaussianMixture`` when the density is fitted.

    Parameters
    ----------
    n_components : int, default=1
        Number of mixture components.
    covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'
        Type of covariance parameters to use.
    tol : float, default=1e-3
        Convergence threshold.
    reg_covar : float, default=1e-6
        Non-negative regularization added to the diagonal of covariance matrices.
    max_iter : int, default=100
        Number of EM iterations to perform.
    n_init : int, default=1
        Number of initializations to perform. The best result is kept.
    init_params : {'kmeans', 'random'}, default='kmeans'
        Method used to initialize the weights, means, and precisions.
    weights_init : array-like of shape (n_components,), default=None
        User-provided initial weights.
    means_init : array-like of shape (n_components, n_features), default=None
        User-provided initial means.
    precisions_init : array-like, default=None
        User-provided initial precisions.
    random_state : int, RandomState instance, default=None
        Controls the random seed.
    warm_start : bool, default=False
        If True, reuse the solution of the last fitting.
    verbose : int, default=0
        Enable verbose output.
    verbose_interval : int, default=10
        Number of iteration steps between printing progress.
    contamination : float, default=0.1
        Proportion of outliers in the dataset; forwarded to the base class.

    Attributes
    ----------
    gmm_ : GaussianMixture
        Estimator created and fitted by ``_fit_density``.
    """

    _parameter_constraints = GaussianMixture._parameter_constraints.copy()

    def __init__(
        self,
        n_components=1,
        covariance_type="full",
        tol=1e-3,
        reg_covar=1e-6,
        max_iter=100,
        n_init=1,
        init_params="kmeans",
        weights_init=None,
        means_init=None,
        precisions_init=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10,
        contamination: float = 0.1,
    ):
        # The base class owns ``contamination``.
        super().__init__(contamination=contamination)

        # Hyper-parameters are kept verbatim under their own names; they are
        # only consumed later, when the GaussianMixture is built.
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.weights_init = weights_init
        self.means_init = means_init
        self.precisions_init = precisions_init
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def _fit_density(self, X: np.ndarray):
        """Build a ``GaussianMixture`` from the stored settings and fit it on ``X``."""
        forwarded_names = (
            "n_components",
            "covariance_type",
            "tol",
            "reg_covar",
            "max_iter",
            "n_init",
            "init_params",
            "weights_init",
            "means_init",
            "precisions_init",
            "random_state",
            "warm_start",
            "verbose",
            "verbose_interval",
        )
        gmm_kwargs = {name: getattr(self, name) for name in forwarded_names}
        self.gmm_ = GaussianMixture(**gmm_kwargs)
        self.gmm_.fit(X)

    def _score_density(self, X: np.ndarray) -> np.ndarray:
        """Return ``score_samples(X)`` computed by the fitted ``GaussianMixture``."""
        fitted_gmm = self.gmm_
        return fitted_gmm.score_samples(X)

183 

184 

class CustomScoreOutlierDetector(BaseDensityOutlierDetector):
    """
    Outlier detector driven by a user-supplied scoring function.

    No model is fitted: ``score_func`` is called directly on the data every
    time scores are requested.

    Parameters
    ----------
    score_func : callable
        Function X -> scores (higher = inliers). Must accept 2D array and
        return a 1D array holding one score per row of X.
    contamination : float, default=0.1
        Proportion of outliers. Must be in (0, 0.5).

    Raises
    ------
    ValueError
        If ``score_func`` is not callable.
    """

    score_func: Callable[[np.ndarray], np.ndarray]

    def __init__(
        self, score_func: Callable[[np.ndarray], np.ndarray], contamination: float = 0.1
    ):
        super().__init__(contamination=contamination)
        if not callable(score_func):
            raise ValueError("score_func must be callable")
        self.score_func = score_func

    def _fit_density(self, X: np.ndarray):
        """Do nothing; the scoring function needs no fitting."""
        # Nothing to fit
        pass

    def _score_density(self, X: np.ndarray) -> np.ndarray:
        """
        Score ``X`` with the user-supplied ``score_func``.

        Returns
        -------
        numpy.ndarray
            The 1D array of scores produced by ``score_func``.

        Raises
        ------
        ValueError
            If the result is not one-dimensional, or if its length differs
            from the number of rows in ``X``.
        """
        scores = np.asarray(self.score_func(X))
        if scores.ndim != 1:
            raise ValueError("score_func must return a 1D array")

        # A 1D result of the wrong length would silently misalign scores and
        # samples further down the pipeline, so reject it here. The check is
        # skipped when X exposes no leading dimension to compare against.
        x_shape = np.shape(X)
        if x_shape and scores.shape[0] != x_shape[0]:
            raise ValueError(
                "score_func must return one score per sample: "
                f"got {scores.shape[0]} scores for {x_shape[0]} samples"
            )
        return scores