Coverage for uqmodels/preprocessing/Preprocessor.py: 55%

77 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-05 14:29 +0000

1##################################################################################### 

2# Source link to Preprocessor class : 

3# Preprocessing pipeline can combine several Preprocessing. 

4# We suggest to split : Raw_data -> (Raw_data_preprocessing) -> Clean_data -> (ML-Preprocessor_Processing) -> ML-Dataset

5# Several ML-Datasets can then be produced from the same clean_data

6# The cache mechanism aims to avoid repeating the same preprocessing computation

7 

8import copy 

9 

10from uqmodels.preprocessing.structure import Structure 

11from uqmodels.processing import Processor 

12 

13 

class Preprocessor(Processor):
    """Processor that preprocesses data in a fit/transform scheme.

    In addition to what ``Processor`` provides, an instance holds a
    ``structure`` object describing how data has to be structured after
    preprocessing, and exposes cache-backed save/load helpers.
    """

    def __init__(
        self, name="formaliser", cache=None, structure=None, update_query=None, **kwargs
    ):
        """Build the preprocessor and attach its structure.

        Args:
            name (str, optional): Name of the processor. Defaults to "formaliser".
            cache (Cache_manager or None, optional): Cache manager. Defaults to None : no save/load procedure.
            structure (obj or None): Structure that contains the specification about how data has to be
                structured after preprocessing. When None, a new ``Structure("Data")`` is created.
            update_query (function, optional): Function used to update the query after the processor is
                applied. Forwarded to ``Processor.__init__``; presumably stored there as
                ``self._update_query`` (see ``update_query``) - TODO confirm.
            **kwargs: Forwarded to ``Processor.__init__`` and also set as attributes on the structure.
        """
        # The parent deliberately receives structure=None: the structure is attached below.
        super().__init__(
            name=name, cache=cache, structure=None, update_query=update_query, **kwargs
        )

        self.structure = Structure("Data") if structure is None else structure

        for key, value in kwargs.items():
            setattr(self.structure, key, value)

        self.is_fitted = False

    def default_update_query(self, query, name):
        """Return a copy of ``query`` with ``name`` appended to its "processing" entry.

        The caller's query is left untouched, including its "processing" list.

        Args:
            query (dict or obj): Query that generated the data.
            name (str): Name to record in the "processing" entry.

        Returns:
            new_query: Updated copy for dict queries; for any other type, the result of the parent
                ``default_update_query``.
        """
        if isinstance(query, dict):
            new_query = query.copy()
            if "processing" in query:
                # query.copy() is shallow: copy the list as well, otherwise the append
                # would also modify the caller's query.
                steps = copy.copy(query["processing"])
                steps.append(name)
                new_query["processing"] = steps
            else:
                new_query["processing"] = [name]
        else:
            # NOTE(review): the parent is called with the query only (no name) - confirm its signature.
            new_query = super().default_update_query(query)
        return new_query

    def get(self, keys, default_value=None):
        """Get obj from structure using structure.get

        Args:
            keys (_type_): key or list of keys related to attributes to get
            default_value (_type_, optional): default_value if no attribute. Defaults to None.

        Returns:
            Whatever ``self.structure.get(keys, default_value)`` returns.
        """
        return self.structure.get(keys, default_value)

    def set(self, key, obj):
        """Set obj in structure using structure.set

        Args:
            key (_type_): key related to the attribute to set
            obj (_type_): object to store under ``key``
        """
        self.structure.set(key, obj)

    def fit(self, data=None, query={}, save_preprocessor=False):
        """Fit Preprocessing using data

        This base implementation calls the parent ``fit`` without arguments; ``data`` is not used here.

        Args:
            data (obj, optional): data. Defaults to None.
            query: dict_query that generated the data. It is not modified.
            save_preprocessor (bool, optional): boolean flag that inform if we have to save preprocessor or not
        """
        super().fit()
        if save_preprocessor:
            # Tag the copy, not the caller's query (nor the shared default dict).
            new_query = copy.copy(query)
            new_query["name"] = self.name
            self.save(new_query)

    def transform(self, data=None, query={}):
        """Apply Preprocessor to data

        When a cache is configured, ``data`` is saved under the updated query before the parent
        ``transform`` is called.

        Args:
            data (obj, optional): data. Defaults to None.
            query: dict_query that generated the data

        Return
            data : the ``data`` argument itself
        """
        query = self.update_query(query)
        if self.cache is not None:
            self.cache.save(query, data)
        # NOTE(review): the value returned by the parent transform is discarded - confirm this is intended.
        super().transform(data)
        return data

    def fit_transform(self, data=None, query={}):
        """Fit Processor and apply it on data

        Args:
            data (obj, optional): data. Defaults to None.
            query: dict_query that generated the data.

        Return
            data : Preprocessed data
        """
        self.fit(data, query)
        return self.transform(data, query)

    def update_query(self, query={}):
        """Update the query with ``self._update_query`` if set, else with ``default_update_query``.

        Args:
            query (dict): dict_query that generated the data.

        Returns:
            new_query: updated query
        """
        updater = (
            self.default_update_query
            if self._update_query is None
            else self._update_query
        )
        return updater(query, self.name)

    def use_cache(self, query={}):
        """Use the cache manager to check if there is cache linked to data already processed

        Args:
            query (dict): dict_query that generated the data.

        Raises:
            FileNotFoundError: raised when the parent ``use_cache`` raises FileNotFoundError or
                NotADirectoryError; meant to be caught by the method that called use_cache.

        Returns:
            data: if file is found else error
        """
        try:
            data = super().use_cache(query)
        except (FileNotFoundError, NotADirectoryError) as err:
            # Both failures are reported as "cache not found"; keep the original error as the cause.
            raise FileNotFoundError() from err

        return data

    def save(self, query={}, object=None, name="data"):
        """Save method to store object at query+name location using cache_manager

        Args:
            query (dict, optional): dict_query that generated the data.
            object (obj, optional): object to store. Defaults to None.
            name (str, optional): filename of obj to store. Defaults to "data".
        """
        super().save(query, object, name)

    def load(self, query={}, name="data"):
        """Load method to load Preprocessor at query+name location using cache_manager

        Args:
            query (dict, optional): query parameters.
            name (str, optional): filename of obj to load. Defaults to "data".

        Returns:
            The object returned by the parent ``load``.
        """
        loaded = super().load(query, name)
        return loaded

172 

173 

174# GENERIC_Preprocessor 

175 

176 

def fit_default(self, data, query={}, structure=None):
    """Default fit hook: intentionally does nothing.

    Args:
        self (obj): preprocessor the hook is applied to (unused).
        data (obj): data (unused).
        query (dict): dict_query that generated the data (unused).
        structure (structure obj, optional): structure object that provides the meta information
            about data (unused).

    Returns:
        None
    """
    return None

185 

186 

def transform_default(self, data, query={}, structure=None):
    """Default transform hook: hands back ``data`` untouched.

    Args:
        self (obj): preprocessor the hook is applied to (unused).
        data (obj): data.
        query (dict): dict_query that generated the data (unused).
        structure (structure obj, optional): structure object that provides the meta information
            about data (unused).

    Returns:
        obj: the very same ``data`` object that was passed in.
    """
    unchanged = data
    return unchanged

196 

197 

198# Default Preprocessor : 

199 

200 

class Generic_Preprocessor(Preprocessor):
    """Preprocessor whose fit and transform steps are delegated to user-supplied functions."""

    def __init__(
        self,
        name="Generic_preprocessor",
        cache=None,
        structure=None,
        update_query=None,
        fit_function=fit_default,
        transform_function=transform_default,
        **kwargs
    ):
        """Build a preprocessor driven by ``fit_function`` and ``transform_function``.

        Args:
            name (str, optional): Name of processor. Defaults to "Generic_preprocessor".
            cache (Cache_manager or None, optional): Cache manager. Defaults to None : no save/load procedure
            structure (obj or None): structure that contains specification about how data has to be
                structured after preprocessing
            update_query (function, optional): Function to update query due to Processor application if needed.
            fit_function (function, optional): function applied in the fit procedure, called as
                ``fit_function(self, data, query, structure)``. Defaults to fit_default that does nothing.
            transform_function (function, optional): function applied in the transform procedure, called as
                ``transform_function(self, data, query, structure)``.
                Defaults to transform_default that does nothing.
            **kwargs: forwarded to ``Preprocessor.__init__``.
        """
        super().__init__(
            name=name,
            cache=cache,
            structure=structure,
            update_query=update_query,
            **kwargs
        )

        self.fit_function = fit_function
        self.transform_function = transform_function

    def _select_structure(self, query):
        """Return the sub-structure named by query["source"], or self.structure if there is no source."""
        if "source" in query:
            return self.structure.get_structure(query["source"])
        return self.structure

    def fit(self, data, query={}, save_preprocessor=False):
        """Apply fit_function on data with query as query and self.structure as metadata
        if query has a "source" attribute:
            try to access the corresponding substructure by structure.get_structure(query[source])

        Args:
            data (obj): data.
            query: dict_query that generated the data
            save_preprocessor (bool, optional): boolean flag forwarded to ``Preprocessor.fit`` that inform
                if we have to save preprocessor or not. Defaults to False.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        structure = self._select_structure(query)
        self.fit_function(self, data, query, structure)
        # Forward query and the save flag so the parent signature stays usable through this subclass.
        return super().fit(data, query, save_preprocessor)

    def transform(self, data, query={}, **kwarg):
        """Apply transform_function on data with query as query and self.structure as metadata
        if query has a "source" attribute:
            try to access the corresponding substructure by structure.get_structure(query[source])

        Args:
            data (obj): data.
            query: dict_query that generated the data
            **kwarg: accepted but not used by this implementation.

        Return
            data : Preprocessed data
        """
        structure = self._select_structure(query)
        data = self.transform_function(self, data, query, structure)
        return super().transform(data, query)