Coverage for tadkit / catalog / sklearners.py: 97%
61 statements
« prev ^ index » next coverage.py v7.13.3, created at 2026-02-03 15:41 +0000
« prev ^ index » next coverage.py v7.13.3, created at 2026-02-03 15:41 +0000
1from typing import Callable
3import numpy as np
5from sklearn.neighbors import KernelDensity
6from sklearn.mixture import GaussianMixture
8from tadkit.base.basedensitydetector import BaseDensityOutlierDetector
class KDEOutlierDetector(BaseDensityOutlierDetector):
    """
    Outlier detector whose density model is a kernel density estimate.

    Fitting trains a ``sklearn.neighbors.KernelDensity`` estimator with the
    hyper-parameters stored on this object; scoring returns that estimator's
    ``score_samples`` output (per-sample log-density).

    Parameters
    ----------
    bandwidth : float, default=1.0
        Kernel bandwidth.
    algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto'
        Tree algorithm used by the estimator.
    kernel : str, default='gaussian'
        Kernel name. Valid kernels are
        ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'].
    metric : str, default='euclidean'
        Distance metric.
    atol : float, default=0
        Desired absolute tolerance of the result.
    rtol : float, default=0
        Desired relative tolerance of the result.
    breadth_first : bool, default=True
        If true, use a breadth-first approach to the problem.
    leaf_size : int, default=40
        Leaf size passed to BallTree or KDTree.
    metric_params : dict, default=None
        Additional parameters for the metric function.
    contamination : float, default=0.1
        Proportion of outliers in the data set. Forwarded to the base class.

    Attributes
    ----------
    kde_ : KernelDensity
        The underlying estimator, created by ``_fit_density``.
    """

    # Shallow copy of KernelDensity's declared constraints, so the dict on
    # KernelDensity itself is not shared with this class.
    # NOTE(review): `contamination` is not a KernelDensity parameter, so it is
    # not covered by this mapping -- confirm where it gets validated.
    _parameter_constraints = KernelDensity._parameter_constraints.copy()

    def __init__(
        self,
        bandwidth=1.0,
        algorithm="auto",
        kernel="gaussian",
        metric="euclidean",
        atol=0,
        rtol=0,
        breadth_first=True,
        leaf_size=40,
        metric_params=None,
        contamination: float = 0.1,
    ):
        super().__init__(contamination=contamination)

        # Hyper-parameters are kept verbatim under their own names; the
        # KernelDensity instance itself is only built in `_fit_density`.
        # -- kernel definition
        self.bandwidth = bandwidth
        self.algorithm = algorithm
        self.kernel = kernel
        self.metric = metric
        # -- numerical tolerances
        self.atol = atol
        self.rtol = rtol
        # -- tree traversal / construction
        self.breadth_first = breadth_first
        self.leaf_size = leaf_size
        self.metric_params = metric_params

    def _fit_density(self, X: np.ndarray):
        """Create ``self.kde_`` from the stored hyper-parameters and fit it on X."""
        forwarded = (
            "bandwidth",
            "algorithm",
            "kernel",
            "metric",
            "atol",
            "rtol",
            "breadth_first",
            "leaf_size",
            "metric_params",
        )
        kde_kwargs = {name: getattr(self, name) for name in forwarded}
        self.kde_ = KernelDensity(**kde_kwargs)
        self.kde_.fit(X)

    def _score_density(self, X: np.ndarray) -> np.ndarray:
        """Return ``self.kde_.score_samples(X)``."""
        log_density = self.kde_.score_samples(X)
        return log_density
class GMMOutlierDetector(BaseDensityOutlierDetector):
    """
    Outlier detector whose density model is a Gaussian mixture.

    Fitting trains a ``sklearn.mixture.GaussianMixture`` estimator with the
    hyper-parameters stored on this object; scoring returns that estimator's
    ``score_samples`` output (per-sample log-likelihood).

    Parameters
    ----------
    n_components : int, default=1
        Number of mixture components.
    covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'
        Type of covariance parameters to use.
    tol : float, default=1e-3
        Convergence threshold.
    reg_covar : float, default=1e-6
        Non-negative regularization added to the diagonal of covariance matrices.
    max_iter : int, default=100
        Number of EM iterations to perform.
    n_init : int, default=1
        Number of initializations to perform. The best result is kept.
    init_params : {'kmeans', 'random'}, default='kmeans'
        Method used to initialize the weights, means, and precisions.
    weights_init : array-like of shape (n_components,), default=None
        User-provided initial weights.
    means_init : array-like of shape (n_components, n_features), default=None
        User-provided initial means.
    precisions_init : array-like, default=None
        User-provided initial precisions.
    random_state : int, RandomState instance, default=None
        Controls the random seed.
    warm_start : bool, default=False
        If True, reuse the solution of the last fitting.
    verbose : int, default=0
        Enable verbose output.
    verbose_interval : int, default=10
        Number of iteration steps between printing progress.
    contamination : float, default=0.1
        Proportion of outliers in the dataset. Forwarded to the base class.

    Attributes
    ----------
    gmm_ : GaussianMixture
        The underlying estimator, created by ``_fit_density``.
    """

    # Shallow copy of GaussianMixture's declared constraints, so the dict on
    # GaussianMixture itself is not shared with this class.
    # NOTE(review): `contamination` is not a GaussianMixture parameter, so it
    # is not covered by this mapping -- confirm where it gets validated.
    _parameter_constraints = GaussianMixture._parameter_constraints.copy()

    def __init__(
        self,
        n_components=1,
        covariance_type="full",
        tol=1e-3,
        reg_covar=1e-6,
        max_iter=100,
        n_init=1,
        init_params="kmeans",
        weights_init=None,
        means_init=None,
        precisions_init=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10,
        contamination: float = 0.1,
    ):
        super().__init__(contamination=contamination)

        # Hyper-parameters are kept verbatim under their own names; the
        # GaussianMixture instance itself is only built in `_fit_density`.
        # -- model structure
        self.n_components = n_components
        self.covariance_type = covariance_type
        # -- EM optimisation
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        # -- initialisation
        self.init_params = init_params
        self.weights_init = weights_init
        self.means_init = means_init
        self.precisions_init = precisions_init
        # -- run control / reporting
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def _fit_density(self, X: np.ndarray):
        """Create ``self.gmm_`` from the stored hyper-parameters and fit it on X."""
        forwarded = (
            "n_components",
            "covariance_type",
            "tol",
            "reg_covar",
            "max_iter",
            "n_init",
            "init_params",
            "weights_init",
            "means_init",
            "precisions_init",
            "random_state",
            "warm_start",
            "verbose",
            "verbose_interval",
        )
        gmm_kwargs = {name: getattr(self, name) for name in forwarded}
        self.gmm_ = GaussianMixture(**gmm_kwargs)
        self.gmm_.fit(X)

    def _score_density(self, X: np.ndarray) -> np.ndarray:
        """Return ``self.gmm_.score_samples(X)``."""
        log_likelihood = self.gmm_.score_samples(X)
        return log_likelihood
class CustomScoreOutlierDetector(BaseDensityOutlierDetector):
    """
    Outlier detector driven by a user-supplied scoring function.

    No model is trained: ``_fit_density`` is a no-op and ``_score_density``
    simply calls ``score_func`` on the input and validates its output.

    Parameters
    ----------
    score_func : callable
        Function X -> scores (higher = inliers). Must accept 2D array and
        return a 1D array holding one score per row of X.
    contamination : float, default=0.1
        Proportion of outliers. Must be in (0, 0.5). Forwarded to the base
        class; this class performs no range check itself.

    Raises
    ------
    ValueError
        At construction time, if ``score_func`` is not callable.
    """

    score_func: Callable[[np.ndarray], np.ndarray]

    def __init__(
        self, score_func: Callable[[np.ndarray], np.ndarray], contamination: float = 0.1
    ):
        super().__init__(contamination=contamination)
        if not callable(score_func):
            raise ValueError("score_func must be callable")
        self.score_func = score_func

    def _fit_density(self, X: np.ndarray):
        """Do nothing: the scoring function is supplied ready to use."""
        # Nothing to fit
        pass

    def _score_density(self, X: np.ndarray) -> np.ndarray:
        """
        Score X with ``score_func`` and validate the result.

        Returns
        -------
        scores : ndarray of shape (n_samples,)
            Output of ``score_func(X)`` converted with ``np.asarray``.

        Raises
        ------
        ValueError
            If the result is not 1D, or if it does not contain exactly one
            score per row of X.
        """
        scores = np.asarray(self.score_func(X))
        if scores.ndim != 1:
            raise ValueError("score_func must return a 1D array")

        # A 1D result of the wrong length would misalign scores and samples
        # downstream, so reject it here. np.shape works for arrays as well as
        # array-likes that only expose `.shape`.
        x_shape = np.shape(X)
        if x_shape and scores.shape[0] != x_shape[0]:
            raise ValueError(
                "score_func must return one score per sample: "
                f"got {scores.shape[0]} scores for {x_shape[0]} samples"
            )
        return scores