# uqmodels/preprocessing/features_processing.py

1""" 

2Data preprocessing module. 

3""" 

import inspect

import numpy as np
import pandas as pd
import tsfresh
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters

from uqmodels.preprocessing.preprocessing import downscale_series, upscale_series
from uqmodels.utils import base_cos_freq, convolute_1D, corr_matrix_array


def check_transform_input_to_panda(input, name=""):
    """Check that input is a DataFrame.

    If it is a np.ndarray, turn it into a DataFrame;
    otherwise raise an error.

    Args:
        input (np.ndarray or pd.DataFrame): input to check or transform
        name (str, optional): name of input, used in error messages

    Raises:
        TypeError: input has a wrong type

    Returns:
        pd.DataFrame: the input as a DataFrame
    """
    if isinstance(input, np.ndarray):
        return pd.DataFrame(input)

    if not isinstance(input, pd.DataFrame):
        print("Type issues in " + inspect.stack()[1].function)
        print(name + " should be pd.DataFrame rather than " + str(type(input)))
        raise TypeError
    return input
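
# Illustrative usage sketch (not part of the library API): ndarray inputs are
# silently wrapped, DataFrames pass through unchanged.
#
#   >>> check_transform_input_to_panda(np.zeros((3, 2))).shape
#   (3, 2)
#   >>> isinstance(check_transform_input_to_panda(pd.DataFrame()), pd.DataFrame)
#   True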


def normalise_panda(dataframe, mode, scaler=None):
    """Apply standard normalisation to a dataframe.

    Args:
        dataframe (pd.DataFrame): dataframe to normalise
        mode (str): "fit", "fit_transform" or "transform"
        scaler (StandardScaler, optional): fitted scaler, required for "transform"

    Returns:
        StandardScaler if mode is "fit",
        (pd.DataFrame, StandardScaler) if mode is "fit_transform",
        pd.DataFrame if mode is "transform"
    """
    if mode == "fit":
        scaler = StandardScaler()
        scaler.fit(dataframe.values)
        return scaler
    if mode == "fit_transform":
        scaler = StandardScaler()
        values = scaler.fit_transform(dataframe.values)
        dataframe = pd.DataFrame(
            values, columns=dataframe.columns, index=dataframe.index
        )
        return (dataframe, scaler)
    if mode == "transform":
        values = scaler.transform(dataframe.values)
        dataframe = pd.DataFrame(
            values, columns=dataframe.columns, index=dataframe.index
        )
        return dataframe
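
# Illustrative usage sketch: fit a scaler on a training frame, then reuse it
# on new data.
#
#   >>> df = pd.DataFrame(np.random.randn(100, 3))
#   >>> df_norm, scaler = normalise_panda(df, "fit_transform")
#   >>> df_new_norm = normalise_panda(df, "transform", scaler=scaler)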


# Target selection from data:


def select_tsfresh_params(
    list_keys=["variance", "skewness", "fft", "cwt", "fourier", "mean", "trend"]
):
    """Select the EfficientFCParameters entries whose name contains one of list_keys."""
    dict_params = dict()
    efficient_params = EfficientFCParameters()
    for key in list_keys:
        for k in efficient_params.keys():
            if key in k:
                dict_params[k] = efficient_params[k]
    return dict_params
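
# Illustrative usage sketch: restrict tsfresh to a few cheap feature families.
#
#   >>> params = select_tsfresh_params(["mean", "variance"])
#   >>> # keys such as 'mean', 'mean_abs_change', 'variance', ... are retained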


def select_data(data, context=None, ind_data=None, **kwargs):
    """Select data (and optionally context) columns from indices arrays.

    Args:
        data (ndarray): data
        context (ndarray, optional): context data. Defaults to None.
        ind_data (ind_array, optional): selected columns. Defaults to None: all dims are picked

    Returns:
        data_selected: ndarray containing the concatenation of all selected features
    """
    data_selected = select_data_and_context(
        data, context=context, ind_data=ind_data, **kwargs
    )
    return data_selected


# ----------------------------------------- #
# PCA transformation from data & context


def select_data_and_context(
    data, context=None, ind_data=None, ind_context=None, **kwargs
):
    """Select data and context columns using ind_data & ind_context.

    Args:
        data (ndarray): data
        context (ndarray, optional): context data. Defaults to None.
        ind_data (ind_array, optional): selected data columns.
            Defaults to None: all dims are picked
        ind_context (ind_array, optional): selected context columns.
            Defaults to None: all dims are picked if there is context

    Returns:
        data_selected: ndarray containing the concatenation of all selected features
    """

    data = check_transform_input_to_panda(data, "data")

    if context is not None:
        context = check_transform_input_to_panda(context, "context")

    data_selected = []
    if ind_data is not None:
        data_selected.append(data.loc[:, ind_data])

    if ind_context is not None:
        data_selected.append(context.loc[:, ind_context])

    if len(data_selected) == 0:
        data_selected = data
    else:
        data_selected = pd.concat(data_selected, axis=1)
    return data_selected.values
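
# Illustrative usage sketch: pick column 0 of data and column 1 of context.
#
#   >>> X = np.arange(12).reshape(6, 2)
#   >>> C = np.arange(18).reshape(6, 3)
#   >>> select_data_and_context(X, context=C, ind_data=[0], ind_context=[1]).shape
#   (6, 2)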


def fit_pca(
    data,
    context=None,
    n_components=3,
    data_lag=1,
    ind_data=None,
    ind_context=None,
    **kwargs
):
    """Fit step of the PCA feature generation: fit a PCA on the selected data & context.

    Args:
        data (ndarray): data
        context (ndarray, optional): context data. Defaults to None.
        n_components (int, optional): n_components of the PCA. Defaults to 3.
        data_lag (int, optional): lag applied to data before fitting. Defaults to 1.
        ind_data (ind_array, optional): selected data columns.
            Defaults to None: all dims are picked
        ind_context (ind_array, optional): selected context columns.
            Defaults to None: all dims are picked if there is context
    """
    data = np.roll(data, data_lag, axis=0)
    data_to_reduce = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context, **kwargs
    )
    PCA_model = PCA(n_components=n_components)
    PCA_model.fit(data_to_reduce)
    return PCA_model


def compute_pca(
    data,
    context=None,
    n_components=3,
    data_lag=1,
    ind_data=None,
    ind_context=None,
    params_=None,
    **kwargs
):
    """Compute step of the PCA feature generation:
    apply a PCA to the selected data & context, params_ holding the fitted PCA.
    If params_ is None, call fit_pca to get a fitted PCA_model.

    Args:
        data (ndarray): data
        context (ndarray, optional): context data. Defaults to None.
        n_components (int, optional): n_components of the PCA. Defaults to 3.
        ind_data (ind_array, optional): selected data columns.
            Defaults to None: all dims are picked
        ind_context (ind_array, optional): selected context columns.
            Defaults to None: all dims are picked if there is context

    Returns:
        (data_reduced, PCA_model)
    """
    if params_ is None:
        PCA_model = fit_pca(
            data,
            context=context,
            n_components=n_components,
            data_lag=data_lag,
            ind_data=ind_data,
            ind_context=ind_context,
            **kwargs
        )
    else:
        PCA_model = params_

    data = np.roll(data, data_lag, axis=0)
    data_to_reduce = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context
    )
    data_reduced = PCA_model.transform(data_to_reduce)
    return data_reduced, PCA_model
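
# Illustrative usage sketch of the fit/compute pair: the fitted PCA is returned
# so it can be reused on fresh data via ``params_``.
#
#   >>> X = np.random.randn(200, 5)
#   >>> reduced, pca = compute_pca(X, n_components=2)
#   >>> reduced_again, _ = compute_pca(X, n_components=2, params_=pca)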


def automatic_periods_detection(array, min_lag=4, max_lag=None):
    """Detect the dominant periodicity (in number of samples) of a series as the
    autocorrelation-maximising lag, skipping the first min_lag lags."""
    series = pd.Series(np.asarray(array).ravel())
    if max_lag is None:
        max_lag = len(series) // 2
    autocorrs = [series.autocorr(lag) for lag in range(min_lag, max_lag)]
    return int(np.argmax(autocorrs)) + min_lag  # N_ech periodicity


def fit_compute_periods(
    data,
    context=None,
    ind_data=None,
    ind_context=None,
    periodicities=[1],
    freqs=[1],
    params_=None,
    **kwargs
):
    """Turn a step_scale context column into cos/sin periodic features.

    Args:
        data (ndarray): data
        context (ndarray, optional): context data
        ind_context (ind_array): indices of the step_scale column(s) in context
        periodicities (list, optional): periodicities of the data. Defaults to [1].
        freqs (list, optional): frequencies of the sin/cos basis. Defaults to [1].
    """
    step_scale = select_data_and_context(
        data=data, context=context, ind_data=None, ind_context=ind_context, **kwargs
    )

    list_features = []
    for period in periodicities:
        features_tmp = base_cos_freq((step_scale % period) / period, freqs)
        list_features.append(features_tmp)

    list_features = np.concatenate(list_features, axis=1)

    return (list_features, params_)
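
# Illustrative usage sketch, assuming context column 0 holds a time-step index:
# encode a 24-step daily cycle as cos/sin features at one and two harmonics.
#
#   >>> steps = np.arange(500, dtype=float).reshape(-1, 1)
#   >>> feats, _ = fit_compute_periods(steps, context=steps, ind_context=[0],
#   ...                                periodicities=[24], freqs=[1, 2])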


def fit_compute_lag_values(
    data,
    context=None,
    ind_data=None,
    ind_context=None,
    derivs=[0],
    windows=[1],
    lag=[0],
    delay=0,
    params=None,
    **kwargs
):
    """Compute moving-average/derivative features on the selected columns,
    then turn them into lagged features.

    Args:
        data (ndarray): data
        context (ndarray, optional): context data
        ind_data / ind_context (ind_array, optional): selected columns
        derivs (list, optional): derivation orders. Defaults to [0].
        windows (list, optional): moving-average window sizes. Defaults to [1].
        lag (list, optional): lags to apply. Defaults to [0].
        delay (int, optional): delay added to every lag. Defaults to 0.
    """
    selected_data = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context, **kwargs
    )
    MA_derivate, _ = fit_compute_MA_derivate(
        selected_data, derivs=derivs, windows=windows
    )
    lag_features, _ = fit_compute_lag(MA_derivate, lag=lag, delay=delay)
    return (lag_features, params)


def fit_compute_MA_derivate(
    data,
    context=None,
    ind_data=None,
    ind_context=None,
    windows=[1],
    lags=[0],
    derivs=[0],
    params=None,
    **kwargs
):
    """Compute a moving average of the last ``window`` values, then apply lags,
    then derivatives, and return the resulting features.
    Applies a 1-lag by default.
    """

    data = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context, **kwargs
    )

    dim = data.shape
    features = []
    for w in windows:
        conv_array = np.roll(data, w - 1, axis=0)
        if w > 1:
            filter_ = np.ones(w) / w
            conv_array = convolute_1D(conv_array, filter_)

        for lag in lags:
            lag_array = conv_array
            if lag != 0:
                lag_array = np.roll(conv_array, lag, axis=0)
            for deriv in derivs:
                deriv_array = lag_array
                if deriv != 0:
                    # Prepend deriv copies of the first row so that a deriv-order
                    # np.diff preserves the series length.
                    deriv_array = np.diff(
                        lag_array,
                        deriv,
                        prepend=np.ones((deriv, dim[1])) * data[0],
                        axis=0,
                    )
                features.append(deriv_array)

    deriv_features = np.stack(features).reshape(-1, dim[0], dim[1])
    deriv_features = np.swapaxes(deriv_features, 2, 1).reshape(-1, dim[0]).T
    return deriv_features, params
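
# Illustrative usage sketch: a 5-step moving average of each input dimension,
# together with its first derivative (one output column per combination).
#
#   >>> X = np.random.randn(300, 2)
#   >>> feats, _ = fit_compute_MA_derivate(X, windows=[5], derivs=[0, 1])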


def fit_compute_lag(
    data,
    context=None,
    lag=[0],
    delay=0,
    ind_data=None,
    ind_context=None,
    params=None,
    **kwargs
):
    """Create lag features from a numerical array.

    Args:
        data (float array): target to extract lag-features from
        lag (list, optional): lags to apply. Defaults to [0].
        delay (int, optional): delay added before the first lag feature. Defaults to 0.
    """

    data = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context, **kwargs
    )

    dim = data.shape
    new_features_list = []
    new_features_name = []
    for i in np.array(lag) + delay:
        Y_cur = np.roll(data, i, axis=0)
        if i > 0:
            Y_cur[0:i] = 0
        for g in range(dim[1]):
            new_features_list.append(Y_cur[:, g])
            new_features_name.append("lag_" + str(i) + "_dim:" + str(g))
    new_features_name = np.array(new_features_name)
    return np.array(new_features_list).T, params
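
# Illustrative usage sketch: build 1-step and 2-step lagged copies of a series
# (leading rows are zero-filled).
#
#   >>> X = np.arange(10, dtype=float).reshape(-1, 1)
#   >>> lagged, _ = fit_compute_lag(X, lag=[1, 2])
#   >>> lagged.shape
#   (10, 2)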


def mask_corr_feature_target(X, y, v_seuil=0.05):
    """Mask of features whose absolute correlation with at least one target
    column exceeds the v_seuil threshold."""
    v_corr = np.abs(corr_matrix_array(X, y[:, 0]))
    for i in np.arange(y.shape[1] - 1):
        v_corr = np.maximum(v_corr, np.abs(corr_matrix_array(X, y[:, i + 1])))

    return v_corr > v_seuil


def select_features_from_FI(X, y, model="RF", threesold=0.01, **kwargs):
    """Mask of features whose Random-Forest feature importance exceeds threesold."""
    if model == "RF":
        estimator = RandomForestRegressor(
            ccp_alpha=1e-05,
            max_depth=15,
            max_features=0.5,
            max_samples=0.5,
            min_impurity_decrease=0.0001,
            min_samples_leaf=2,
            min_samples_split=8,
            n_estimators=100,
            random_state=0,
        )

    estimator.fit(X, y)
    features_mask = estimator.feature_importances_ > threesold
    return features_mask
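
# Illustrative usage sketch: keep only features that a Random Forest finds
# informative for the target.
#
#   >>> X = np.random.randn(200, 6)
#   >>> y = X[:, :1] + 0.1 * np.random.randn(200, 1)
#   >>> mask = select_features_from_FI(X, y)
#   >>> X_selected = X[:, mask]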


def build_window_representation(y, step=1, window=10):
    """Flatten a multivariate series into the tsfresh long format: one block of
    ``window`` lagged values per kept time step (every ``step``-th step)."""
    if step > 1:
        mask = np.arange(len(y)) % step == step - 1
    else:
        mask = np.ones(len(y)) == 1

    list_df = []
    for i in range(window):
        dict_df_ts = {
            "id": np.arange(len(y[mask])),
            "time": np.roll(np.arange(len(y[mask])), i),
        }
        for k in range(y.shape[1]):
            dict_df_ts["value_" + str(k + 1)] = np.roll(y[:, k], i, axis=0)[mask]
        df_ts = pd.DataFrame(dict_df_ts)
        list_df.append(df_ts)
    df_ts = pd.concat(list_df)
    y_target = y[mask]
    if len(y) == window:
        df_ts = df_ts[df_ts["id"] == len(y) - 1]
        y_target = y_target[-1:]

    return (df_ts, y_target)
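
# Illustrative usage sketch: a 2-column series turned into the long
# "id/time/value_k" frame that tsfresh's extract_features expects.
#
#   >>> y = np.random.randn(50, 2)
#   >>> df_ts, y_target = build_window_representation(y, step=5, window=10)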


def fit_tsfresh_feature_engeenering(
    data,
    context=None,
    window=10,
    step=None,
    ts_fresh_params=None,
    ind_data=None,
    ind_context=None,
    **kwargs
):
    """Fit step of the tsfresh feature engineering: extract candidate features,
    filter them by correlation and importance, and return the retained
    kind_to_fc_parameters specification."""
    data = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context
    )

    if step is None:
        step = window

    df_ts, y_target = build_window_representation(data, step, window)
    if ts_fresh_params is None:
        ts_fresh_params = EfficientFCParameters()

    X_extracted = extract_features(
        df_ts, default_fc_parameters=ts_fresh_params, column_id="id", column_sort="time"
    )
    filter_ = np.isnan(X_extracted).sum(axis=0) == 0

    X_extracted = X_extracted.loc[:, filter_]
    filter_ = mask_corr_feature_target(X_extracted, y_target)
    X_extracted = X_extracted.loc[:, filter_]

    X_selected = tsfresh.feature_selection.select_features(X_extracted, y_target[:, 0])

    for i in range(data.shape[1] - 1):
        X_selected_tmp = tsfresh.feature_selection.select_features(
            X_extracted, y_target[:, i + 1]
        )

        mask = [
            column_tmp not in X_selected.columns.values.tolist()
            for column_tmp in X_selected_tmp.columns.values
        ]
        X_selected = pd.concat([X_selected, X_selected_tmp.loc[:, mask]], axis=1)

    mask_feature_selection = select_features_from_FI(X_selected.values, y_target)
    name_feature_selected = X_selected.columns[mask_feature_selection].values
    kind_to_fc_parameters = tsfresh.feature_extraction.settings.from_columns(
        name_feature_selected
    )
    return kind_to_fc_parameters


def compute_tsfresh_feature_engeenering(
    data,
    context=None,
    window=10,
    step=10,
    ind_data=None,
    ind_context=None,
    params_=None,
    **kwargs
):
    """Compute step of the tsfresh feature engineering: extract the features
    specified by params_ (a kind_to_fc_parameters dict); fit one if missing."""
    data = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context
    )

    if step is None:
        step = window

    if params_ is None:
        params_ = fit_tsfresh_feature_engeenering(data, window=window, step=step)

    df_ts, y_target = build_window_representation(data, step, window)
    X_extracted = extract_features(
        df_ts,
        kind_to_fc_parameters=params_,
        column_id="id",
        column_sort="time",
        disable_progressbar=True,
    )
    return (X_extracted.values, y_target), params_
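
# Illustrative usage sketch of the fit/compute pair on a univariate series;
# the fitted kind_to_fc_parameters spec can be reused on new data via ``params_``.
#
#   >>> y = np.random.randn(400, 1)
#   >>> spec = fit_tsfresh_feature_engeenering(y, window=20, step=5)
#   >>> (X_feat, y_t), _ = compute_tsfresh_feature_engeenering(
#   ...     y, window=20, step=5, params_=spec)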


def fit_FE_by_estimator(
    data,
    context,
    ind_data=None,
    ind_context=None,
    estimator=None,
    estimator_params=dict(),
    data_lag=[1],
    **kwargs
):
    """Fit step of a generic feature-engineering estimator (e.g. a sklearn
    transformer class): instantiate it with estimator_params and fit it on the
    lagged, selected data."""
    data = pd.DataFrame(np.roll(data.values, data_lag, axis=0), columns=data.columns)
    data = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context, **kwargs
    )
    estimator = estimator(**estimator_params)
    estimator.fit(data)
    return estimator


def compute_FE_by_estimator(
    data,
    context,
    ind_data=None,
    ind_context=None,
    estimator=None,
    estimator_params=dict(),
    data_lag=[1],
    params_=None,
    **kwargs
):
    """Compute step of a generic feature-engineering estimator: transform the
    lagged, selected data with the fitted estimator held in params_; fit one
    if missing."""
    if params_ is None:
        params_ = fit_FE_by_estimator(
            data,
            context,
            ind_data=ind_data,
            ind_context=ind_context,
            estimator=estimator,
            estimator_params=estimator_params,
            data_lag=data_lag,
            **kwargs
        )
    estimator = params_

    data = pd.DataFrame(np.roll(data.values, data_lag, axis=0), columns=data.columns)
    data = select_data_and_context(
        data=data, context=context, ind_data=ind_data, ind_context=ind_context, **kwargs
    )

    feature = estimator.transform(data)
    return (feature, estimator)
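
# Illustrative usage sketch: plug any fit/transform estimator in, here a PCA
# reducing lagged data to two components.
#
#   >>> df = pd.DataFrame(np.random.randn(300, 6))
#   >>> feats, fitted = compute_FE_by_estimator(
#   ...     df, None, estimator=PCA, estimator_params={"n_components": 2})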


def fit_feature_engeenering(data, context=None, dict_FE_params=dict(), **kwargs):
    """Fit step of the feature-engineering pipeline: run every block declared in
    dict_FE_params, store the fitted parameters back into it, and return it."""
    if "resample_data_params" in dict_FE_params.keys():
        resample_data_params = dict_FE_params["resample_data_params"]
        if resample_data_params["type"] == "upscale":
            data = upscale_series(data, **resample_data_params)

        elif resample_data_params["type"] == "downscale":
            data = downscale_series(data, **resample_data_params)

    if "resample_context_params" in dict_FE_params.keys():
        resample_context_params = dict_FE_params["resample_context_params"]
        if resample_context_params["type"] == "upscale":
            context = upscale_series(context, **resample_context_params)

        elif resample_context_params["type"] == "downscale":
            context = downscale_series(context, **resample_context_params)

    list_features = []
    if "raw_selection" in dict_FE_params.keys():
        if not isinstance(dict_FE_params["raw_selection"], list):
            dict_FE_params["raw_selection"] = [dict_FE_params["raw_selection"]]

        for dict_params in dict_FE_params["raw_selection"]:
            feature_tmp = select_data_and_context(
                data=data, context=context, **dict_params
            )
            list_features.append(feature_tmp)

    if "periods" in dict_FE_params.keys():
        if not isinstance(dict_FE_params["periods"], list):
            dict_FE_params["periods"] = [dict_FE_params["periods"]]
        for dict_params in dict_FE_params["periods"]:
            feature_tmp, _ = fit_compute_periods(data, context=context, **dict_params)
            dict_params["params_"] = None
            list_features.append(feature_tmp)

    if "MA_derivate" in dict_FE_params.keys():
        if not isinstance(dict_FE_params["MA_derivate"], list):
            dict_FE_params["MA_derivate"] = [dict_FE_params["MA_derivate"]]
        for dict_params in dict_FE_params["MA_derivate"]:
            feature_tmp, _ = fit_compute_MA_derivate(
                data, context=context, **dict_params
            )
            dict_params["params_"] = None
            list_features.append(feature_tmp)

    if "FE_by_estimator" in dict_FE_params.keys():
        if not isinstance(dict_FE_params["FE_by_estimator"], list):
            dict_FE_params["FE_by_estimator"] = [dict_FE_params["FE_by_estimator"]]

        for dict_params in dict_FE_params["FE_by_estimator"]:
            params_ = fit_FE_by_estimator(data, context=context, **dict_params)
            dict_params["params_"] = params_
            feature_tmp, _ = compute_FE_by_estimator(data, context, **dict_params)
            list_features.append(feature_tmp)

    if "MV_features" in dict_FE_params.keys():
        if not isinstance(dict_FE_params["MV_features"], list):
            dict_FE_params["MV_features"] = [dict_FE_params["MV_features"]]

        for dict_params in dict_FE_params["MV_features"]:
            params_ = fit_MV_features(data, context=context, **dict_params)
            dict_params["params_"] = params_
            feature_tmp, _ = compute_MV_features(data, context, **dict_params)
            list_features.append(feature_tmp)

    if "ctx_features" in dict_FE_params.keys():
        if not isinstance(dict_FE_params["ctx_features"], list):
            dict_FE_params["ctx_features"] = [dict_FE_params["ctx_features"]]

        for dict_params in dict_FE_params["ctx_features"]:
            params_ = fit_ctx_features(data, context=context, **dict_params)
            dict_params["params_"] = params_
            feature_tmp, _ = compute_ctx_features(data, context, **dict_params)
            list_features.append(feature_tmp)

    if "ts_fresh" in dict_FE_params.keys():
        if not isinstance(dict_FE_params["ts_fresh"], list):
            dict_FE_params["ts_fresh"] = [dict_FE_params["ts_fresh"]]

        for dict_params in dict_FE_params["ts_fresh"]:
            ts_fresh_params_ = fit_tsfresh_feature_engeenering(
                data, context=context, **dict_params
            )
            dict_params["params_"] = ts_fresh_params_
            dict_params["step"] = 1
            (feature_tmp, _), _ = compute_tsfresh_feature_engeenering(
                data, context=context, **dict_params
            )
            list_features.append(feature_tmp)

    X = np.concatenate(list_features, axis=1)
    if "selection" in dict_FE_params.keys():
        dict_FE_params["selection"]["params_"] = select_features_from_FI(
            X, data, **dict_FE_params["selection"]
        )
    return dict_FE_params


def compute_feature_engeenering(
    data, context=None, dict_FE_params=dict(), params_=None
):
    """Compute step of the feature-engineering pipeline: apply every block
    declared in dict_FE_params using the fitted parameters stored in it."""
    if "resample_data_params" in dict_FE_params.keys():
        resample_data_params = dict_FE_params["resample_data_params"]
        if resample_data_params["type"] == "upscale":
            data = upscale_series(data, **resample_data_params)

        elif resample_data_params["type"] == "downscale":
            data = downscale_series(data, **resample_data_params)

    if "resample_context_params" in dict_FE_params.keys():
        resample_context_params = dict_FE_params["resample_context_params"]
        if resample_context_params["type"] == "upscale":
            context = upscale_series(context, **resample_context_params)

        elif resample_context_params["type"] == "downscale":
            context = downscale_series(context, **resample_context_params)

    if params_ is None:
        params_ = fit_feature_engeenering(data, context, dict_FE_params=dict_FE_params)
        dict_FE_params = params_

    list_features = []
    if "raw_selection" in dict_FE_params.keys():
        for dict_params in dict_FE_params["raw_selection"]:
            list_features.append(
                select_data_and_context(data=data, context=context, **dict_params)
            )

    if "periods" in dict_FE_params.keys():
        for dict_params in dict_FE_params["periods"]:
            feature_tmp, _ = fit_compute_periods(data, context=context, **dict_params)
            list_features.append(feature_tmp)

    if "MA_derivate" in dict_FE_params.keys():
        for dict_params in dict_FE_params["MA_derivate"]:
            feature_tmp, _ = fit_compute_MA_derivate(
                data, context=context, **dict_params
            )
            list_features.append(feature_tmp)

    if "MV_features" in dict_FE_params.keys():
        for dict_params in dict_FE_params["MV_features"]:
            feature_tmp, _ = compute_MV_features(data, context=context, **dict_params)
            list_features.append(feature_tmp)

    if "ctx_features" in dict_FE_params.keys():
        for dict_params in dict_FE_params["ctx_features"]:
            feature_tmp, _ = compute_ctx_features(data, context=context, **dict_params)
            list_features.append(feature_tmp)

    if "ts_fresh" in dict_FE_params.keys():
        for dict_params in dict_FE_params["ts_fresh"]:
            (feature_tmp, _), _ = compute_tsfresh_feature_engeenering(
                data, context=context, **dict_params
            )
            list_features.append(feature_tmp)

    X = np.concatenate(list_features, axis=1)
    if "selection" in dict_FE_params.keys():
        mask = dict_FE_params["selection"]["params_"]
        X = X[:, mask]

    return X, dict_FE_params
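
# Illustrative usage sketch of the end-to-end pipeline: fit the parameter dict
# once, then reuse it to rebuild the same feature matrix.
#
#   >>> X = np.random.randn(500, 2)
#   >>> C = np.arange(500, dtype=float).reshape(-1, 1)
#   >>> FE_params = {"raw_selection": {"ind_data": [0, 1]},
#   ...              "periods": {"ind_context": [0],
#   ...                          "periodicities": [24], "freqs": [1]}}
#   >>> feats, fitted = compute_feature_engeenering(X, context=C,
#   ...                                             dict_FE_params=FE_params)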


def get_FE_params(delta=None):
    """Provide default parameters for feature engineering.

    Args:
        delta (optional): resample step parameter
    """

    dict_FE = {
        "raw_selection": {"ind_context": [3]},
        "MA_derivate": {
            "windows": [1, 10, 60],
            "lags": [1],
            "derivs": [0, 1, 10],
        },
        "periods": {"ind_context": [0], "periodicities": [10, 100], "freqs": [1, 2]},
        "ts_fresh": {
            "window": 20,
            "step": 5,
            "ts_fresh_params": select_tsfresh_params(["mean", "cwt"]),
        },
    }
    if delta is not None:
        dict_FE["resample_data_params"] = {
            "type": "downscale",
            "delta": delta,
            "mode": "mean",
        }
        dict_FE["resample_context_params"] = {
            "type": "downscale",
            "delta": delta,
            "mode": "first",
        }
    return dict_FE
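
# Illustrative usage sketch: the default spec plugs straight into the pipeline.
#
#   >>> FE_params = get_FE_params()
#   >>> # feats, fitted = compute_feature_engeenering(data, context, FE_params)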


def fit_MV_features(
    data,
    context,
    ind_data=None,
    ind_context=None,
    focus=None,
    n_components=3,
    n_neighboor=4,
    lags=[0],
    derivs=[0],
    windows=[1],
    **kwargs
):
    """Naive multivariate FE function: fit step that ranks the features most
    correlated to the focus target and fits a PCA synthesis of them.

    Args:
        data: data
        context: context data
        ind_data (list, optional): selected data columns. Defaults to None.
        ind_context (list, optional): selected context columns. Defaults to None.
        focus (optional): label (in ind_data) of the focus target. Defaults to None.
        n_components (int, optional): number of PCA components (0 disables). Defaults to 3.
        n_neighboor (int, optional): number of most-correlated neighbour series
            to keep (0 disables). Defaults to 4.
        lags (list, optional): lags to apply. Defaults to [0].
        derivs (list, optional): derivation orders. Defaults to [0].
        windows (list, optional): moving-average window sizes. Defaults to [1].

    Returns:
        (order, scaler, estimator): correlation-based column order, fitted scaler
        and fitted PCA (each may be None when disabled)
    """
    series = select_data(data, context, ind_data=ind_data, ind_context=ind_context)
    ind_focus = ind_data.index(focus)

    order = None
    if n_neighboor > 0:
        corr_matrice = np.corrcoef(series, rowvar=False)
        order = np.argsort(corr_matrice[ind_focus])[::-1][1:]

    estimator = None
    scaler = None
    if n_components > 0:
        estimator = PCA(n_components=n_components)
        scaler = StandardScaler()
        series = scaler.fit_transform(series)
        estimator.fit(series)

    return order, scaler, estimator


def compute_MV_features(
    data,
    context,
    ind_data=None,
    ind_context=None,
    focus=None,
    n_components=3,
    n_neighboor=4,
    lags=[0],
    derivs=[0],
    windows=[1],
    params_=None,
    **kwargs
):
    """Naive multivariate FE function: compute step that gathers the n_neighboor
    series most correlated to the focus target, plus a PCA synthesis of the
    selected series, then applies moving-average/derivative processing.

    Args:
        data: data
        context: context data
        ind_data (list, optional): selected data columns. Defaults to None.
        ind_context (list, optional): selected context columns. Defaults to None.
        focus (optional): label (in ind_data) of the focus target. Defaults to None.
        n_components (int, optional): number of PCA components (0 disables). Defaults to 3.
        n_neighboor (int, optional): number of most-correlated neighbour series
            to keep (0 disables). Defaults to 4.
        lags (list, optional): lags to apply. Defaults to [0].
        derivs (list, optional): derivation orders. Defaults to [0].
        windows (list, optional): moving-average window sizes. Defaults to [1].
        params_ (tuple, optional): (order, scaler, estimator) from fit_MV_features.
            Defaults to None: fitted on the fly.

    Returns:
        (features, params_)
    """
    if params_ is None:
        params_ = fit_MV_features(
            data,
            context,
            ind_data=ind_data,
            ind_context=ind_context,
            focus=focus,
            n_components=n_components,
            n_neighboor=n_neighboor,
        )

    order, scaler, estimator = params_

    series = select_data(data, context, ind_data=ind_data, ind_context=ind_context)

    list_series = []
    if n_neighboor > 0:
        list_series.append(series[:, order[:n_neighboor]])

    if n_components > 0:
        series = scaler.transform(series)
        pca_series = estimator.transform(series)
        list_series.append(pca_series)

    if len(list_series) == 1:
        list_series = list_series[0]
    else:
        list_series = np.concatenate(list_series, axis=1)

    list_series, _ = fit_compute_MA_derivate(
        list_series, derivs=derivs, lags=lags, windows=windows
    )

    return (list_series, params_)
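
# Illustrative usage sketch: features around a focus column 0, keeping its two
# most-correlated neighbours plus a 2-component PCA synthesis.
#
#   >>> X = np.random.randn(300, 5)
#   >>> feats, params_ = compute_MV_features(
#   ...     X, None, ind_data=[0, 1, 2, 3, 4], focus=0,
#   ...     n_components=2, n_neighboor=2)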


def fit_ctx_features(
    data, context, ind_data=None, ind_context=None, n_components=3, lags=[0], **kwargs
):
    """Produce contextual information by applying a PCA to the selected
    contextual measurements.

    Args:
        data: data
        context: contextual sources to synthesize, 2D (times, features) array
        ind_data / ind_context (list, optional): selected columns
        n_components (int, optional): number of PCA components. Defaults to 3.

    Returns:
        (scaler, estimator): fitted scaler and PCA
    """

    selected_data = select_data(data, context, ind_data=ind_data, ind_context=ind_context)

    scaler = StandardScaler()
    selected_data = scaler.fit_transform(selected_data)
    estimator = PCA(n_components)
    estimator.fit(selected_data)
    return scaler, estimator


def compute_ctx_features(
    data,
    context,
    ind_data=None,
    ind_context=None,
    n_components=3,
    lag=0,
    params_=None,
    **kwargs
):
    """Produce contextual information by applying a PCA to the selected
    contextual measurements, then lagging the result.

    Args:
        data: data
        context: contextual sources to synthesize, 2D (times, features) array
        ind_data / ind_context (list, optional): selected columns
        n_components (int, optional): number of PCA components. Defaults to 3.
        lag (int, optional): lag applied to the PCA features. Defaults to 0.
        params_ (tuple, optional): (scaler, estimator) from fit_ctx_features.
            Defaults to None: fitted on the fly.

    Returns:
        (X_ctx, params_)
    """
    if params_ is None:
        params_ = fit_ctx_features(
            data,
            context,
            ind_data=ind_data,
            ind_context=ind_context,
            n_components=n_components,
        )

    scaler, estimator = params_

    selected_data = select_data(data, context, ind_data=ind_data, ind_context=ind_context)

    selected_data = scaler.transform(selected_data)
    selected_data = estimator.transform(selected_data)

    selected_data, _ = fit_compute_lag(selected_data, lag=[lag])

    return selected_data, params_
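
# Illustrative usage sketch: compress four context channels into three PCA
# components, lagged by one step.
#
#   >>> C = np.random.randn(200, 4)
#   >>> X_ctx, params_ = compute_ctx_features(
#   ...     C, C, ind_context=[0, 1, 2, 3], lag=1)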