#####################################################################################
# Source link to Preprocessor class:
# A preprocessing pipeline can combine several Preprocessors.
# We suggest splitting: Raw_data -> (Raw_data_preprocessing) -> Clean_data -> (ML-Preprocessor_Processing) -> ML-Dataset.
# Several ML-Datasets can then be produced from the same Clean_data.
# The cache mechanism aims to avoid redoing the same preprocessing computations.
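#
# For instance (a hypothetical sketch; the preprocessor and query names are
# illustrative, not part of this module):
#
#     clean_data = raw_data_preprocessor.transform(raw_data, query)
#     dataset_a = ml_preprocessor_a.transform(clean_data, query_a)
#     dataset_b = ml_preprocessor_b.transform(clean_data, query_b)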

import numpy as np

import uqmodels.preprocessing.features_processing as FE_proc
from uqmodels.preprocessing.Preprocessor import Preprocessor


class dict_to_TS_Dataset(Preprocessor):
    def __init__(self, name="dict_to_TS_Dataset"):
        """Init a Preprocessor that turns a dict_data containing all preprocessed info into a dataset."""
        super().__init__(name=name)

    def fit(self, data, query):
        """Do nothing."""
        super().fit(data, query)

    def transform(self, data, query):
        """Provide the dataset as a tuple of arrays:
        (X, y, sample_weight, x_split, context, objective, name)."""
        data = super().transform(data, query)
        X = data["X"]
        y = data["Y"].reshape(len(X), -1)

        context = None
        if "context" in data:
            context = data["context"]

        sample_weight = None
        if "sample_weight" in data:
            sample_weight = data["sample_weight"]

        objective = None
        if "objective" in data:
            objective = data["objective"]

        name = "data"
        if "name" in query:
            name = query["name"]

        x_split = np.zeros(len(X))
        if "X_split" in data:
            x_split = data["X_split"]

        if "train" in data:
            x_split = data["train"]

        return (X, y, sample_weight, x_split, context, objective, name)
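

# A minimal usage sketch of dict_to_TS_Dataset (assuming the base Preprocessor's
# fit/transform pass the data through unchanged; the shapes and the "name" query
# key below are illustrative):
#
#     data = {"X": np.zeros((100, 3)), "Y": np.zeros(100)}
#     formalizer = dict_to_TS_Dataset()
#     X, y, sample_weight, x_split, context, objective, name = formalizer.transform(
#         data, query={"name": "demo"}
#     )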


class Generic_Features_processor(Preprocessor):
    def __init__(
        self,
        name="Generic_Features_processor",
        cache=None,
        structure=None,
        update_query=None,
        list_params_features=[],
        list_fit_features=[],
        list_compute_features=[],
        list_update_params_features=None,
        list_params_targets=[],
        list_fit_targets=[],
        list_compute_targets=[],
        list_update_params_targets=None,
        normalise_data=False,
        normalise_context=False,
        dataset_formalizer=None,
        min_size=1,
        concat_features=False,
        concat_targets=True,
        **kwargs
    ):
80 """Preprocessor class (inherit from Processor) : that aim to preprocess data in a (fit/transform) scheme
81 and hold a cache manager functionality to save/load object
83 Args:
84 name (str, optional): Name of processor. Defaults to 'processor'.
85 cache (Cache_manager or None, optional): Cache manager. Defaults to None : no save/load procedure
86 structure (obj or None): structure that contains specification about how data has to be structured
87 after preprocessing
88 update_query (function, optional): Function to update query due to Processor application if needed.
89 Defaults to default_update_query : no update/
90 fit_function = function to apply in fit procedure. Defaults to fit_default that does nothing.
91 transform_function = function to apply in tranform procedure. Defaults to transform_default that
92 does nothing.
94 """

        super().__init__(
            name=name,
            cache=cache,
            structure=structure,
            update_query=update_query,
            list_params_features=list_params_features,
            list_fit_features=list_fit_features,
            list_compute_features=list_compute_features,
            list_update_params_features=list_update_params_features,
            list_params_targets=list_params_targets,
            list_fit_targets=list_fit_targets,
            list_compute_targets=list_compute_targets,
            list_update_params_targets=list_update_params_targets,
            normalise_data=normalise_data,
            normalise_context=normalise_context,
            dataset_formalizer=dataset_formalizer,
            min_size=min_size,
            concat_features=concat_features,
            concat_targets=concat_targets,
            **kwargs
        )

    def fit(self, data, query={}, **kwargs):
        """Fit the preprocessing steps on data using their fit functions.

        Args:
            data (obj or (obj, obj)): Data, or a (data, context) pair.
            query (dict): Query that generated the data.
        """
        if len(data) == 2:
            data, context = data
            if self.normalise_context:
                context, context_scaler = FE_proc.normalise_panda(
                    context, mode="fit_transform"
                )
                self.context_scaler = context_scaler
        else:
            context = None

        if self.normalise_data:
            data, data_scaler = FE_proc.normalise_panda(data, mode="fit_transform")
            self.data_scaler = data_scaler

        for n, (params, fit_func) in enumerate(
            zip(self.list_params_features, self.list_fit_features)
        ):
            if fit_func is not None:
                if self.list_update_params_features is not None:
                    if self.list_update_params_features[n] is not None:
                        params = self.list_update_params_features[n](query, params)
                # Fitted state is stored under the "params_" key and forwarded
                # to the matching compute function at transform time.
                params["params_"] = fit_func(data, context, **params)

        for n, (params, fit_func) in enumerate(
            zip(self.list_params_targets, self.list_fit_targets)
        ):
            if fit_func is not None:
                if self.list_update_params_targets is not None:
                    if self.list_update_params_targets[n] is not None:
                        params = self.list_update_params_targets[n](query, params)
                params["params_"] = fit_func(data, context, **params)
        return super().fit(data, query)

    def transform(self, data, query={}, training=True, **kwarg):
        """Apply the compute functions to data to produce features and targets.

        Args:
            data (obj or (obj, obj)): Data, or a (data, context) pair.
            query (dict): Query that generated the data.

        Returns:
            Preprocessed data as (X, y), or the output of dataset_formalizer if one is set.
        """
        if len(data) < self.min_size:
            raise ValueError("not enough data. min_size: " + str(self.min_size))

        if len(data) == 2:
            data, context = data
            if self.normalise_context:
                context = FE_proc.normalise_panda(
                    context, mode="transform", scaler=self.context_scaler
                )
        else:
            context = None

        if self.normalise_data:
            data = FE_proc.normalise_panda(
                data, mode="transform", scaler=self.data_scaler
            )

        list_features = []
        list_targets = []

        # Computation of features
        for n, (params, compute_func) in enumerate(
            zip(self.list_params_features, self.list_compute_features)
        ):
            if compute_func is not None:
                if self.list_update_params_features is not None:
                    if self.list_update_params_features[n] is not None:
                        params = self.list_update_params_features[n](query, params)

                features, _ = compute_func(data, context, **params)
                list_features.append(features)

        # Computation of targets
        for n, (params, compute_func) in enumerate(
            zip(self.list_params_targets, self.list_compute_targets)
        ):
            if compute_func is not None:
                if self.list_update_params_targets is not None:
                    if self.list_update_params_targets[n] is not None:
                        params = self.list_update_params_targets[n](query, params)

                targets, _ = compute_func(data, context, **params)
                list_targets.append(targets)

        X = list_features
        if self.concat_features:
            X = np.concatenate(X, axis=1)

        y = list_targets
        if self.concat_targets:
            y = np.concatenate(list_targets, axis=1)

        if self.dataset_formalizer is not None:
            return self.dataset_formalizer(X, y, query)
        else:
            return (X, y)

    def fit_transform(self, data, query={}, **kwarg):
        """Fit the preprocessing steps, then transform data."""
        self.fit(data, query)
        data = self.transform(data, query)
        return data
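

# A minimal usage sketch of Generic_Features_processor (the step functions below are
# illustrative; they only need to follow the signatures used above:
# fit(data, context, **params) -> fitted state, and
# compute(data, context, **params) -> (array, metadata)):
#
#     def fit_mean(data, context, **params):
#         return {"mean": np.mean(data, axis=0)}
#
#     def compute_center(data, context, params_=None, **params):
#         return data - params_["mean"], None
#
#     processor = Generic_Features_processor(
#         list_params_features=[{}],
#         list_fit_features=[fit_mean],
#         list_compute_features=[compute_center],
#         list_params_targets=[{}],
#         list_fit_targets=[None],
#         list_compute_targets=[lambda data, context, **p: (np.asarray(data), None)],
#         concat_features=True,
#     )
#     X, y = processor.fit_transform(np.random.rand(50, 4))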


def init_Features_processor(
    name="Features_processor",
    dict_params_FE_ctx=None,
    dict_params_FE_dyn=None,
    dict_params_FE_targets=None,
    update_params_FE_ctx=None,
    update_params_FE_dyn=None,
    update_params_FE_targets=None,
    normalise_data=False,
    normalise_context=False,
    dataset_formalizer=None,
    min_size=1,
    structure=None,
    cache=None,
):
    list_params_features = []
    list_fit_features = []
    list_compute_features = []
    list_update_params_features = []

    if dict_params_FE_ctx is not None:
        list_params_features.append({"dict_FE_params": dict_params_FE_ctx})
        list_fit_features.append(FE_proc.fit_feature_engeenering)
        list_compute_features.append(FE_proc.compute_feature_engeenering)
        list_update_params_features.append(update_params_FE_ctx)

    if dict_params_FE_dyn is not None:
        list_params_features.append({"dict_FE_params": dict_params_FE_dyn})
        list_fit_features.append(FE_proc.fit_feature_engeenering)
        list_compute_features.append(FE_proc.compute_feature_engeenering)
        list_update_params_features.append(update_params_FE_dyn)

    list_params_targets = [{"dict_FE_params": dict_params_FE_targets}]
    list_fit_targets = [FE_proc.fit_feature_engeenering]
    list_compute_targets = [FE_proc.compute_feature_engeenering]
    list_update_params_targets = [update_params_FE_targets]

    Formalizer = Generic_Features_processor(
        name=name,
        cache=cache,
        structure=structure,
        list_params_features=list_params_features,
        list_fit_features=list_fit_features,
        list_compute_features=list_compute_features,
        list_update_params_features=list_update_params_features,
        list_params_targets=list_params_targets,
        list_fit_targets=list_fit_targets,
        list_compute_targets=list_compute_targets,
        list_update_params_targets=list_update_params_targets,
        normalise_data=normalise_data,
        normalise_context=normalise_context,
        dataset_formalizer=dataset_formalizer,
        min_size=min_size,
    )

    return Formalizer
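

# A minimal sketch of the factory (assuming the dict_FE_params schemas expected by
# FE_proc.fit_feature_engeenering / FE_proc.compute_feature_engeenering; the
# parameter dicts below are placeholders, not a documented schema):
#
#     processor = init_Features_processor(
#         name="demo_processor",
#         dict_params_FE_dyn={...},      # dynamic feature-engineering parameters
#         dict_params_FE_targets={...},  # target-engineering parameters
#         normalise_data=True,
#     )
#     X, y = processor.fit_transform((data, context), query={})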