Coverage for uqmodels/preprocessing/Custom_Preprocessor.py: 28%

116 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-05 14:29 +0000

#####################################################################################
# Source link to Preprocessor class:
# A preprocessing pipeline can combine several Preprocessors.
# We suggest splitting: Raw_data -> (Raw_data_preprocessing) -> Clean_data -> (ML-Preprocessor_processing) -> ML-Dataset
# Then several ML-Datasets can be produced from the same clean_data.
# The cache mechanism aims to avoid repeating the same preprocessing computation.

7 

8 

9import numpy as np 

10 

11import uqmodels.preprocessing.features_processing as FE_proc 

12from uqmodels.preprocessing.Preprocessor import Preprocessor 

13 

14 

class dict_to_TS_Dataset(Preprocessor):
    """Preprocessor that turns a dict of preprocessed arrays into a TS dataset tuple."""

    def __init__(self, name="dict_to_TS_Dataset"):
        """Init Preprocessor that turns a dict_data containing all preprocessed info into a dataset.

        Args:
            name (str, optional): Processor name. Defaults to "dict_to_TS_Dataset".
        """
        super().__init__(name=name)

    def fit(self, data, query):
        """Do nothing (delegates to the parent fit)."""
        super().fit(data, query)

    def transform(self, data, query):
        """Build a dataset tuple from a dict of preprocessed arrays.

        Args:
            data (dict): must contain "X" and "Y"; may also contain "context",
                "sample_weight", "objective", "X_split" and/or "train".
            query (dict): dict_query that generated the data; may contain "name".

        Returns:
            tuple: (X, y, sample_weight, x_split, context, objective, name)
        """
        data = super().transform(data, query)
        X = data["X"]

        # Flatten targets to 2D: one row per observation of X.
        y = data["Y"].reshape(len(X), -1)

        # Optional entries default to None when absent.
        context = data.get("context")
        sample_weight = data.get("sample_weight")
        objective = data.get("objective")

        name = query.get("name", "data")

        # Train/test split indicator: "train" (when present) takes precedence
        # over "X_split"; default is all zeros.
        x_split = np.zeros(len(X))
        if "X_split" in data:
            x_split = data["X_split"]
        if "train" in data:
            x_split = data["train"]

        return (X, y, sample_weight, x_split, context, objective, name)

55 

56 

class Generic_Features_processor(Preprocessor):
    """Preprocessor that runs configurable fit/compute pipelines on features and
    targets, with optional normalisation of data and context."""

    def __init__(
        self,
        name="Generic_Features_processor",
        cache=None,
        structure=None,
        update_query=None,
        list_params_features=None,
        list_fit_features=None,
        list_compute_features=None,
        list_update_params_features=None,
        list_params_targets=None,
        list_fit_targets=None,
        list_compute_targets=None,
        list_update_params_targets=None,
        normalise_data=False,
        normalise_context=False,
        dataset_formalizer=None,
        min_size=1,
        concat_features=False,
        concat_targets=True,
        **kwargs
    ):
        """Preprocessor class (inherits from Processor) that preprocesses data in a
        (fit/transform) scheme and holds a cache-manager functionality to save/load objects.

        Args:
            name (str, optional): Name of processor. Defaults to 'Generic_Features_processor'.
            cache (Cache_manager or None, optional): Cache manager. Defaults to None:
                no save/load procedure.
            structure (obj or None): structure containing the specification of how data
                has to be structured after preprocessing.
            update_query (function, optional): Function to update the query due to
                Processor application if needed. Defaults to None: no update.
            list_params_features (list or None): one params dict per feature pipeline.
                Defaults to None, treated as an empty list.
            list_fit_features (list or None): fit function (or None) per feature pipeline.
            list_compute_features (list or None): compute function per feature pipeline.
            list_update_params_features (list or None): per-pipeline functions that
                update params from the query, or None to disable updating.
            list_params_targets / list_fit_targets / list_compute_targets /
                list_update_params_targets: same as above, for target pipelines.
            normalise_data (bool): if True, fit/apply a scaler on data.
            normalise_context (bool): if True, fit/apply a scaler on context.
            dataset_formalizer (callable or None): final (X, y, query) formatter.
            min_size (int): minimal number of elements required by transform.
            concat_features (bool): if True, concatenate features along axis 1.
            concat_targets (bool): if True, concatenate targets along axis 1.
        """
        # Avoid the mutable-default-argument pitfall: resolve None sentinels to
        # fresh lists so no list object is shared across instances.
        if list_params_features is None:
            list_params_features = []
        if list_fit_features is None:
            list_fit_features = []
        if list_compute_features is None:
            list_compute_features = []
        if list_params_targets is None:
            list_params_targets = []
        if list_fit_targets is None:
            list_fit_targets = []
        if list_compute_targets is None:
            list_compute_targets = []

        super().__init__(
            name=name,
            cache=cache,
            structure=structure,
            update_query=update_query,
            list_params_features=list_params_features,
            list_fit_features=list_fit_features,
            list_compute_features=list_compute_features,
            list_update_params_features=list_update_params_features,
            list_params_targets=list_params_targets,
            list_fit_targets=list_fit_targets,
            list_compute_targets=list_compute_targets,
            list_update_params_targets=list_update_params_targets,
            normalise_data=normalise_data,
            normalise_context=normalise_context,
            dataset_formalizer=dataset_formalizer,
            min_size=min_size,
            concat_features=concat_features,
            concat_targets=concat_targets,
            **kwargs
        )

    def fit(self, data, query=None, **kwargs):
        """Fit the preprocessing using data and the configured fit functions.

        Args:
            data (obj): data, or a length-2 (data, context) pair.
            query (dict, optional): dict_query that generated the data.
        """
        query = {} if query is None else query

        # NOTE(review): a length-2 input is assumed to be a (data, context)
        # pair — any dataset of length 2 would also match this test; confirm
        # with callers.
        if len(data) == 2:
            data, context = data
            if self.normalise_context:
                context, context_scaler = FE_proc.normalise_panda(
                    context, mode="fit_transform"
                )
                self.context_scaler = context_scaler
        else:
            context = None

        if self.normalise_data:
            data, data_scaler = FE_proc.normalise_panda(data, mode="fit_transform")
            self.data_scaler = data_scaler

        # Fit each feature pipeline; fitted state is stored in the params dict
        # under the "params_" key for reuse at transform time.
        for n, (params, fit_func) in enumerate(
            zip(self.list_params_features, self.list_fit_features)
        ):
            if fit_func is not None:
                if self.list_update_params_features is not None:
                    if self.list_update_params_features[n] is not None:
                        params = self.list_update_params_features[n](query, params)
                params["params_"] = fit_func(data, context, **params)

        # Same fitting scheme for the target pipelines.
        for n, (params, fit_func) in enumerate(
            zip(self.list_params_targets, self.list_fit_targets)
        ):
            if fit_func is not None:
                if self.list_update_params_targets is not None:
                    if self.list_update_params_targets[n] is not None:
                        params = self.list_update_params_targets[n](query, params)
                params["params_"] = fit_func(data, context, **params)
        # NOTE(review): the parent fit is called without `query` here, unlike
        # dict_to_TS_Dataset.fit — confirm against Preprocessor.fit's signature.
        return super().fit(data)

    def transform(self, data, query=None, training=True, **kwarg):
        """Apply the configured compute functions to data.

        Args:
            data (obj): data, or a length-2 (data, context) pair.
            query (dict, optional): dict_query that generated the data.
            training (bool): unused here; kept for interface compatibility.

        Returns:
            (X, y) tuple, or the output of dataset_formalizer(X, y, query)
            when a formalizer is configured.

        Raises:
            ValueError: if data holds fewer than `min_size` elements.
        """
        query = {} if query is None else query

        if len(data) < self.min_size:
            raise ValueError("not enough data. min_size: " + str(self.min_size))

        if len(data) == 2:
            data, context = data

            if self.normalise_context:
                context = FE_proc.normalise_panda(
                    context, mode="transform", scaler=self.context_scaler
                )

        else:
            context = None

        if self.normalise_data:
            data = FE_proc.normalise_panda(
                data, mode="transform", scaler=self.data_scaler
            )

        list_features = []
        list_targets = []

        # Computation of features.
        for n, (params, compute_func) in enumerate(
            zip(self.list_params_features, self.list_compute_features)
        ):
            if compute_func is not None:
                if self.list_update_params_features is not None:
                    if self.list_update_params_features[n] is not None:
                        params = self.list_update_params_features[n](query, params)

                features, _ = compute_func(data, context, **params)
                list_features.append(features)

        # Computation of targets.
        for n, (params, compute_func) in enumerate(
            zip(self.list_params_targets, self.list_compute_targets)
        ):
            if compute_func is not None:
                if self.list_update_params_targets is not None:
                    if self.list_update_params_targets[n] is not None:
                        params = self.list_update_params_targets[n](query, params)

                targets, _ = compute_func(data, context, **params)
                list_targets.append(targets)

        X = list_features
        if self.concat_features:
            X = np.concatenate(X, axis=1)

        y = list_targets
        if self.concat_targets:
            y = np.concatenate(list_targets, axis=1)

        if self.dataset_formalizer is not None:
            return self.dataset_formalizer(X, y, query)
        else:
            return (X, y)

    def fit_transform(self, data, query=None, **kwarg):
        """Fit on data, then transform it with the same query."""
        query = {} if query is None else query
        self.fit(data, query)
        data = self.transform(data, query)
        return data

232 

233 

def init_Features_processor(
    name="Features_processor",
    dict_params_FE_ctx=None,
    dict_params_FE_dyn=None,
    dict_params_FE_targets=None,
    update_params_FE_ctx=None,
    update_params_FE_dyn=None,
    update_params_FE_targets=None,
    normalise_data=False,
    normalise_context=False,
    dataset_formalizer=None,
    min_size=1,
    structure=None,
    cache=None,
):
    """Build a Generic_Features_processor from feature-engineering parameter dicts.

    A feature pipeline is registered for the context parameters (when
    dict_params_FE_ctx is provided) and for the dynamic parameters (when
    dict_params_FE_dyn is provided); a single target pipeline is always
    registered from dict_params_FE_targets.

    Returns:
        Generic_Features_processor: the configured processor.
    """
    list_params_features = []
    list_fit_features = []
    list_compute_features = []
    list_update_params_features = []

    # Register one feature pipeline per provided parameter dict, keeping the
    # ctx-then-dyn ordering of the original interface.
    feature_specs = [
        (dict_params_FE_ctx, update_params_FE_ctx),
        (dict_params_FE_dyn, update_params_FE_dyn),
    ]
    for fe_params, fe_updater in feature_specs:
        if fe_params is None:
            continue
        list_params_features.append({"dict_FE_params": fe_params})
        list_fit_features.append(FE_proc.fit_feature_engeenering)
        list_compute_features.append(FE_proc.compute_feature_engeenering)
        list_update_params_features.append(fe_updater)

    # Targets always use a single feature-engineering pipeline.
    list_params_targets = [{"dict_FE_params": dict_params_FE_targets}]
    list_fit_targets = [FE_proc.fit_feature_engeenering]
    list_compute_targets = [FE_proc.compute_feature_engeenering]
    list_update_params_targets = [update_params_FE_targets]

    return Generic_Features_processor(
        name=name,
        cache=cache,
        structure=structure,
        list_params_features=list_params_features,
        list_fit_features=list_fit_features,
        list_compute_features=list_compute_features,
        list_update_params_features=list_update_params_features,
        list_params_targets=list_params_targets,
        list_fit_targets=list_fit_targets,
        list_compute_targets=list_compute_targets,
        list_update_params_targets=list_update_params_targets,
        normalise_data=normalise_data,
        normalise_context=normalise_context,
        dataset_formalizer=dataset_formalizer,
        min_size=min_size,
    )