Coverage for uqmodels/preprocessing/Preprocessor.py: 55%
77 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-05 14:29 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-05 14:29 +0000
1#####################################################################################
# Source link to the Preprocessor class:
# A preprocessing pipeline can combine several Preprocessors.
# We suggest splitting the flow as: Raw_data -> (raw-data preprocessing) -> Clean_data -> (ML preprocessing) -> ML-Dataset
# Several ML-Datasets can then be produced from the same clean data.
# The cache mechanism aims to avoid repeating the same preprocessing computation.
8import copy
10from uqmodels.preprocessing.structure import Structure
11from uqmodels.processing import Processor
class Preprocessor(Processor):
    """Processor that preprocesses data following a fit/transform scheme.

    It holds a ``structure`` object carrying the specification of the data and
    delegates save/load operations to the parent ``Processor``.
    """

    def __init__(
        self, name="formaliser", cache=None, structure=None, update_query=None, **kwargs
    ):
        """Build the preprocessor and attach its structure.

        Args:
            name (str, optional): Name of the processor. Defaults to 'formaliser'.
            cache (Cache_manager or None, optional): Cache manager.
                Defaults to None: no save/load procedure.
            structure (obj or None): Structure that contains the specification of
                how data has to be structured after preprocessing.
                Defaults to None: a new ``Structure("Data")`` is created.
            update_query (function, optional): Function called as
                ``update_query(query, name)`` to update the query after the
                processor has been applied. Defaults to None: use
                ``default_update_query``.
            **kwargs: Forwarded to the parent constructor and also stored as
                attributes of the structure.
        """
        # The parent constructor receives structure=None; the actual structure
        # is attached just below.
        super().__init__(
            name=name, cache=cache, structure=None, update_query=update_query, **kwargs
        )

        self.structure = Structure("Data") if structure is None else structure

        # Extra keyword arguments are recorded on the structure as attributes.
        for key, value in kwargs.items():
            setattr(self.structure, key, value)

        self.is_fitted = False

    def default_update_query(self, query, name):
        """Return an updated copy of query with name added to its "processing" list.

        Args:
            query (dict or obj): Query that generated the data.
            name (str): Name to append to the "processing" entry.

        Returns:
            new_query: For a dict, a shallow copy whose "processing" entry is a
                new list ending with ``name`` (the caller's query and its list
                are left untouched). Otherwise, the result of the parent
                ``default_update_query``.
        """
        if isinstance(query, dict):
            new_query = query.copy()
            if "processing" in query:
                # query.copy() is shallow: appending in place would also modify
                # the list held by the caller's query, so build a new list.
                new_query["processing"] = list(query["processing"]) + [name]
            else:
                new_query["processing"] = [name]
        else:
            # NOTE(review): `name` is not forwarded to the parent implementation;
            # confirm against the signature of Processor.default_update_query.
            new_query = super().default_update_query(query)
        return new_query

    def get(self, keys, default_value=None):
        """Get an object from the structure using ``structure.get``.

        Args:
            keys: Key or list of keys related to the attributes to get.
            default_value (optional): Value passed to ``structure.get`` as the
                default. Defaults to None.

        Returns:
            The value returned by ``structure.get(keys, default_value)``.
        """
        return self.structure.get(keys, default_value)

    def set(self, key, obj):
        """Set an object in the structure using ``structure.set``.

        Args:
            key: Key of the attribute to set.
            obj: Object to store.
        """
        self.structure.set(key, obj)

    def fit(self, data=None, query={}, save_preprocessor=False):
        """Fit the preprocessor and optionally save it.

        Args:
            data (obj, optional): Data. Defaults to None.
            query (dict): Query that generated the data. It is not modified.
            save_preprocessor (bool, optional): If True, call ``save`` with a
                copy of the query completed with the processor name.
        """
        # NOTE(review): `data` is not forwarded to the parent fit; confirm this
        # is intended.
        super().fit()
        if save_preprocessor:
            # Write the name into the copy, never into the caller's query nor
            # into the shared default dict.
            new_query = copy.copy(query)
            new_query["name"] = self.name
            self.save(new_query)

    def transform(self, data=None, query={}):
        """Apply the preprocessor to data.

        The query is updated with ``update_query``; when a cache manager is
        set, data is saved in the cache under the updated query.

        Args:
            data (obj, optional): Data. Defaults to None.
            query (dict): Query that generated the data.

        Returns:
            data: The data received as input.
        """
        query = self.update_query(query)
        if self.cache is not None:
            self.cache.save(query, data)
        # The value returned by the parent transform is not used.
        super().transform(data)
        return data

    def fit_transform(self, data=None, query={}):
        """Fit the preprocessor, then apply it to data.

        Args:
            data (obj, optional): Data. Defaults to None.
            query (dict): Query that generated the data.

        Returns:
            data: Preprocessed data.
        """
        self.fit(data, query)
        return self.transform(data, query)

    def update_query(self, query={}):
        """Update the query with the function provided at init, if any.

        Args:
            query (dict): Query that generated the data.

        Returns:
            new_query: Query updated by ``default_update_query`` when no
                function was provided, else by the provided function.
        """
        if self._update_query is None:
            return self.default_update_query(query, self.name)
        return self._update_query(query, self.name)

    def use_cache(self, query={}):
        """Ask the parent ``use_cache`` for data already processed for this query.

        Args:
            query (dict): Query that generated the data.

        Raises:
            FileNotFoundError: Raised when the parent call fails with
                FileNotFoundError or NotADirectoryError; the original error is
                kept as ``__cause__``.

        Returns:
            data: The value returned by the parent ``use_cache``.
        """
        try:
            return super().use_cache(query)
        except (FileNotFoundError, NotADirectoryError) as err:
            # Both errors are reported as FileNotFoundError; chain the original
            # one so the failing path is not lost.
            raise FileNotFoundError() from err

    def save(self, query={}, object=None, name="data"):
        """Save an object through the parent ``save``.

        Args:
            query (dict, optional): Query that generated the data.
            object (obj, optional): Object to store. Defaults to None.
            name (str, optional): Filename of the object to store.
                Defaults to "data".
        """
        super().save(query, object, name)

    def load(self, query={}, name="data"):
        """Load an object through the parent ``load``.

        Args:
            query (dict, optional): Query parameters.
            name (str, optional): Filename of the object to load.
                Defaults to "data".

        Returns:
            The object returned by the parent ``load``.
        """
        return super().load(query, name)
174# GENERIC_Preprocessor
def fit_default(self, data, query={}, structure=None):
    """Default fit function: does nothing.

    Args:
        self (obj): Preprocessor on which the function is applied.
        data (obj): Data.
        query (dict): Query that generated the data.
        structure (structure obj, optional): Structure object that provides the
            meta information about data.

    Returns:
        None
    """
    return None
def transform_default(self, data, query={}, structure=None):
    """Default transform function: returns data untouched.

    Args:
        self (obj): Preprocessor on which the function is applied.
        data (obj): Data.
        query (dict): Query that generated the data.
        structure (structure obj, optional): Structure object that provides the
            meta information about data.

    Returns:
        data: The very object received as input.
    """
    unchanged = data
    return unchanged
198# Default Preprocessor :
class Generic_Preprocessor(Preprocessor):
    """Preprocessor whose fit and transform steps are user-provided functions."""

    def __init__(
        self,
        name="Generic_preprocessor",
        cache=None,
        structure=None,
        update_query=None,
        fit_function=fit_default,
        transform_function=transform_default,
        **kwargs
    ):
        """Build a Preprocessor driven by a fit function and a transform function.

        Args:
            name (str, optional): Name of the processor.
                Defaults to 'Generic_preprocessor'.
            cache (Cache_manager or None, optional): Cache manager.
                Defaults to None.
            structure (obj or None): Structure that contains the specification
                of how data has to be structured after preprocessing.
            update_query (function, optional): Function used to update the query
                once the processor has been applied. Defaults to None.
            fit_function (function, optional): Called in ``fit`` as
                ``fit_function(self, data, query, structure)``.
                Defaults to fit_default, which does nothing.
            transform_function (function, optional): Called in ``transform`` as
                ``transform_function(self, data, query, structure)``.
                Defaults to transform_default, which returns data untouched.
            **kwargs: Forwarded to the Preprocessor constructor.
        """
        super().__init__(
            name=name,
            cache=cache,
            structure=structure,
            update_query=update_query,
            **kwargs
        )

        self.fit_function = fit_function
        self.transform_function = transform_function

    def _structure_for(self, query):
        """Return the sub-structure named by query["source"], else the whole structure."""
        if "source" not in query.keys():
            return self.structure
        return self.structure.get_structure(query["source"])

    def fit(self, data, query={}):
        """Apply fit_function to data, then run the parent fit.

        When query has a "source" key, the structure handed to fit_function is
        ``structure.get_structure(query["source"])`` instead of the whole
        structure.

        Args:
            data (obj): Data.
            query (dict): Query that generated the data.

        Returns:
            The value returned by the parent ``fit``.
        """
        target = self._structure_for(query)
        # The functions are stored on the instance, hence the explicit self.
        self.fit_function(self, data, query, target)
        return super().fit(data)

    def transform(self, data, query={}, **kwarg):
        """Apply transform_function to data, then run the parent transform.

        When query has a "source" key, the structure handed to
        transform_function is ``structure.get_structure(query["source"])``
        instead of the whole structure.

        Args:
            data (obj): Data.
            query (dict): Query that generated the data.
            **kwarg: Accepted but not used.

        Returns:
            data: Preprocessed data.
        """
        target = self._structure_for(query)
        transformed = self.transform_function(self, data, query, target)
        return super().transform(transformed, query)