Coverage for uqmodels/data_generation/Gen_multivariate_normal_data.py: 83%


import math

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from scipy.signal import convolve2d, savgol_filter


def base_cos_freq(array, freq):
    list_ = []
    for i in freq:
        list_.append(np.cos(i * math.pi * array))
        list_.append(np.sin(i * math.pi * array))
    return np.concatenate(list_).reshape(len(freq) * 2, -1).T
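

# Illustrative usage sketch added for documentation (the `_demo_` helper is
# hypothetical, not part of the original module): the Fourier basis has one
# (cos, sin) column pair per frequency and one row per sample.
def _demo_base_cos_freq():
    t = np.linspace(0, 1, 24)
    basis = base_cos_freq(t, freq=[1, 2])
    assert basis.shape == (24, 4)  # len(freq) * 2 columns
    return basis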



def trunc(x):
    """Round function for value visualisation."""
    return np.round(x, 5)



# Gaussian constant linked to a percentile: obtained below via scipy.stats.norm.ppf.



# Motif generation
def gen_motif(verbose=0, dim=(400, 20, 3), seed=0):
    dim_n, dim_t, dim_g = dim
    Pattern_day_mean = []
    Pattern_day_var = []
    Day_Motif = np.ones(dim_t)
    n_melange = 12

    for n, i in enumerate([0.25, 0.5, 0.70, 0.90]):
        Day_Motif[n] = i
        Day_Motif[-n] = i

    for g in range(dim_g):
        dist = []
        for i in range(n_melange):
            rng0 = np.random.RandomState(seed + g * n_melange * 4 + i * 4 + 0)
            rng1 = np.random.RandomState(seed + g * n_melange * 4 + i * 4 + 1)
            rng2 = np.random.RandomState(seed + g * n_melange * 4 + i * 4 + 2)
            rng3 = np.random.RandomState(seed + g * n_melange * 4 + i * 4 + 3)

            dist.append(
                rng0.normal(
                    rng1.uniform(0, 1),
                    rng2.uniform(0.02, 0.3),
                    np.maximum(10000, rng3.normal(2000, 1000)).astype(int),
                )
            )
        a = np.roll(np.histogram(dist, bins=dim_t)[0], rng0.randint(dim_t))
        Pattern_day_mean.append(0.03 + Day_Motif * savgol_filter(a / a.sum(), 9, 4))
        dist = []
        for i in range(n_melange):
            rng0 = np.random.RandomState(
                seed + g * n_melange * 4 + i * 4 + 0 + dim_g * n_melange * 4
            )
            rng1 = np.random.RandomState(
                seed + g * n_melange * 4 + i * 4 + 1 + dim_g * n_melange * 4
            )
            rng2 = np.random.RandomState(
                seed + g * n_melange * 4 + i * 4 + 2 + dim_g * n_melange * 4
            )
            rng3 = np.random.RandomState(
                seed + g * n_melange * 4 + i * 4 + 3 + dim_g * n_melange * 4
            )

            dist.append(
                rng0.normal(
                    rng1.uniform(0, 1),
                    rng2.uniform(0.02, 0.3),
                    np.maximum(10000, rng3.normal(2000, 1000)).astype(int),
                )
            )
        a = np.roll(np.histogram(dist, bins=dim_t)[0], rng0.randint(dim_t))
        var_aux = np.maximum(
            np.zeros(dim_t) + 0.0001, savgol_filter(dim_t * (a / a.sum()), 9, 4)
        )
        Pattern_day_var.append(var_aux)
    if verbose != 0:
        plt.figure(figsize=(14, 4))
        plt.subplot(2, 1, 1)
        [plt.plot(i) for i in Pattern_day_mean]
        plt.subplot(2, 1, 2)
        [plt.plot(i) for i in Pattern_day_var]
        plt.show()
    return (Pattern_day_mean, Pattern_day_var)
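

# Usage sketch (hypothetical `_demo_` helper added for documentation): each
# returned list holds one daily pattern per channel, with one value per timestep.
def _demo_gen_motif():
    means, variances = gen_motif(dim=(400, 20, 3), seed=0)
    assert len(means) == 3 and means[0].shape == (20,)
    assert len(variances) == 3 and variances[0].shape == (20,)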



# Seasonal pattern generation
def gen_seas_contexte(seas=140, pond=3, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(int(seas + seed))
    dim_n, dim_t, dim_g = dim
    seas = (
        np.cos(2 * math.pi * (rng.rand() + np.arange(dim_t * dim_n) / seas))
    ) * pond  # pond: seasonal magnitude weighting
    return seas
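

# Usage sketch (hypothetical `_demo_` helper): a random-phase cosine of period
# `seas`, scaled by `pond`, over the full flattened timeline.
def _demo_gen_seas_contexte():
    s = gen_seas_contexte(seas=140, pond=3, dim=(400, 20, 3), seed=0)
    assert s.shape == (400 * 20,)
    assert np.abs(s).max() <= 3  # bounded by the `pond` weighting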



def gen_permuts_contexte(seas=140, pond=3, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(seed)
    dim_n, dim_t, dim_g = dim
    seas = (
        np.cos(2 * math.pi * (rng.rand() + np.arange(dim_t * dim_n) / seas))
    ) * pond  # pond: seasonal magnitude weighting
    perm = rng.permutation(np.arange(dim_n * dim_t))
    seas = seas[perm]
    return seas



# Random categorical pattern.
def gen_cat_contexte(size=5, effect=0.2, per=0.1, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(seed)
    dim_n, dim_t, dim_g = dim
    n_event = int((per * (dim_n * dim_t)) / size)
    impact_ctx_mean = np.zeros((dim_t * dim_n, dim_g))
    for i in rng.choice(np.arange(dim_n * dim_t - size), n_event, False):
        impact_ctx_mean[i: (i + size), :] += effect
    return impact_ctx_mean
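

# Usage sketch (hypothetical `_demo_` helper): about `per` of the timeline
# carries the flat categorical effect (slightly less when events overlap).
def _demo_gen_cat_contexte():
    ctx = gen_cat_contexte(size=5, effect=0.2, per=0.1, dim=(400, 20, 3), seed=0)
    assert ctx.shape == (400 * 20, 3)
    assert (ctx[:, 0] != 0).mean() <= 0.1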



# Anomaly pattern generation:
# experiments were essentially carried out with point anomalies.
# Detection of non-point anomalies requires a specific methodology
# not defined in this implementation.
def gen_anom_motif(anom_point=True, verbose=False, dim=(400, 20, 3), seed=0):
    dim_t, dim_n, dim_g = dim
    rng = np.random.RandomState(seed)
    rng1 = np.random.RandomState(seed + 1)
    if anom_point:
        p2 = np.array([1, 0, 0, 0, 0])
        P_list = [p2]
        motif1 = np.stack([P_list[int(i)] for i in np.zeros(dim_g)])

        motif = [motif1, -motif1]
    # Non-point anomaly case.
    else:
        # Poisson-law motifs.
        # p1 = np.histogram(np.random.gamma(np.random.normal(11, 2, 1),
        #                                   scale=np.random.normal(0.8, 0, 1),
        #                                   size=10000),
        #                   bins=5)[0] / 10000
        # p2 = np.histogram(np.random.gamma(np.random.normal(11, 2, 1),
        #                                   scale=np.random.normal(0.8, 0, 1),
        #                                   size=10000),
        #                   bins=5)[0] / 10000

        # Manual motifs.
        # p0 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])
        p0 = np.array([1.7, 1.7, 1.4, 1.4, 1, 1, 1, 0.2, 0.1])
        p1 = np.array([1.4, 1.1, 0.7, 0.4, 0.3, 0, 0, 0, 0])
        p2 = np.array([1, 1.5, 1.3, 0.8, -0.8, -1, -0.8, 0, 0])
        p3 = np.array([0.5, 1.1, -0.6, -0.7, -0.3, 0, 0, 0, 0])

        P_list = [p0, p1, p2, p3]

        motif1 = np.stack([P_list[int(i)] for i in np.ones(dim_g)])
        motif2 = np.stack(
            [P_list[int(i)] for i in rng.choice(np.arange(len(P_list)), dim_g)]
        )
        motif3 = np.stack(
            [P_list[int(i)] for i in rng1.choice(np.arange(len(P_list)), dim_g)]
        )

        motif = [motif1, -motif1, motif2, motif3]
    txt_anom = [
        "Weak positive anomaly",
        "Strong negative anomaly",
        "Weak negative anomaly",
        "Strong negative anomaly",
    ]
    if verbose:
        fig = plt.figure(figsize=(6, 4))
        for n, i in enumerate(motif[0:4]):
            plt.subplot(2, 2, n + 1)
            plt.title(txt_anom[n])
            im = plt.imshow(i * (1 + (n // 2) * 0.4), vmin=-2, vmax=2, cmap="seismic")
            fig.colorbar(im, orientation="vertical", shrink=1)
            plt.xticks(np.arange(5), np.arange(5) - 1)
        plt.show()
    return motif
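

# Usage sketch (hypothetical `_demo_` helper): in the point-anomaly case the
# function returns a positive/negative pair of per-channel point motifs.
def _demo_gen_anom_motif():
    motifs = gen_anom_motif(anom_point=True, dim=(400, 20, 3), seed=0)
    assert len(motifs) == 2
    assert motifs[0].shape == (3, 5)  # (dim_g, motif length)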



# Anomaly generation
def gen_fake_anom(n_anom, motif, dim=(400, 20, 3)):
    dim_n, dim_t, dim_g = dim
    len_motif = motif[0].shape[1]
    anom = np.zeros((dim_n * dim_t))
    impact_anom = np.zeros((dim_t * dim_n, dim_g))
    anom_list = []
    ttype = [1, 2, 1, 1]
    for n, i in enumerate(
        [
            dim_t + dim_t / 3,
            2 * dim_t + dim_t / 4,
            2 * dim_t + dim_t * 4 / 5,
            3 * dim_t + dim_t * 1 / 2,
        ]
    ):
        i = int(i)
        anom[i] = i
        anom_list.append((i // dim_t, i % dim_t, i, 1))
        impact_anom[i: (i + len_motif), :] += (motif[ttype[n] - 1]).T
    return (anom_list, anom, impact_anom)



# Anomaly generation
def gen_anom(n_anom, motif, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(seed)
    dim_n, dim_t, dim_g = dim
    n_motif = len(motif)
    len_motif = motif[0].shape[1]
    anom = np.zeros((dim_n * dim_t))
    impact_anom = np.zeros((dim_t * dim_n, dim_g))
    anom_list = []
    for n, i in enumerate(
        rng.choice(np.arange(dim_n * dim_t - len_motif), n_anom, False)
    ):
        rng_bis = np.random.RandomState(seed + n)
        type_ = rng_bis.randint(1, n_motif + 1)
        anom[i] = type_ * (1 + (n % 2))
        anom_list.append((i // dim_t, i % dim_t, type_, n % 2))
        # impact_anom[i:(i + len_motif), :] += (motif[type_ - 1] * (1 + (n % 2) * 0.5)).T
        impact_anom[i: (i + len_motif), :] += (motif[type_ - 1]).T
    return (anom_list, anom, impact_anom)
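

# Usage sketch (hypothetical `_demo_` helper): one entry per injected anomaly,
# plus a (timesteps, channels) impact matrix to add onto the clean signal.
def _demo_gen_anom():
    dim = (400, 20, 3)
    motifs = gen_anom_motif(anom_point=True, dim=dim, seed=0)
    anom_list, anom, impact = gen_anom(10, motifs, dim=dim, seed=0)
    assert len(anom_list) == 10
    assert impact.shape == (400 * 20, 3)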



# Noise generation
def gen_noise(var, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(seed)
    dim_n, dim_t, dim_g = dim
    noise = rng.normal(0, np.sqrt(var), (dim_t * dim_n, dim_g))
    return noise
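

# Usage sketch (hypothetical `_demo_` helper): centred Gaussian noise whose
# empirical variance matches the requested `var`.
def _demo_gen_noise():
    noise = gen_noise(0.25, dim=(400, 20, 3), seed=0)
    assert noise.shape == (400 * 20, 3)
    assert abs(noise.var() - 0.25) < 0.01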



# Convolution by dimension for g filters
def convolve_signal(filters, signal, dim):
    dim_n, dim_t, dim_g = dim
    signal_conv = []
    for g in range(dim_g):
        signal_conv.append(
            convolve2d(signal, filters[g][:, ::-1], mode="same", boundary="wrap")[
                :, g
            ].reshape((-1, 1))
        )
    signal_conv = np.concatenate(signal_conv, axis=1)
    return signal_conv
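

# Usage sketch (hypothetical `_demo_` helper): the output keeps the input
# shape, with one convolved column per channel.
def _demo_convolve_signal():
    dim = (10, 5, 3)  # (dim_n, dim_t, dim_g) -> 50 timesteps, 3 channels
    sig = np.random.RandomState(0).normal(size=(50, 3))
    filters = [np.ones((5, 3)) / 15 for _ in range(3)]  # smoothing filters
    out = convolve_signal(filters, sig, dim)
    assert out.shape == sig.shape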



# Generation of a dynamic pattern through an arbitrary autoregressive
# pattern plus a random daily magnitude.
def gen_data_dyn(
    data_stat,
    noise_stat,
    var_mult,
    var_add,
    c_dyn,
    dim=(400, 20, 3),
    verbose=True,
    seed=0,
):
    dim_n, dim_t, dim_g = dim

    autocor = int(dim_t / 4) + (int(dim_t / 4)) % 2
    # signal_dyn = [[], [], [], []]  # noisy mean, noiseless mean, noise_mult, noise_add
    filters = []
    for g in range(dim_g):
        list_ = []
        for i in range(dim_g):  # Autocorrelation pattern generation
            rng1 = np.random.RandomState(seed + 100 + g * dim_g * 4 + i * 4)
            rng2 = np.random.RandomState(seed + 100 + g * dim_g * 4 + i * 4 + 1)
            rng3 = np.random.RandomState(seed + 100 + g * dim_g * 4 + i * 4 + 2)
            rng4 = np.random.RandomState(seed + 100 + g * dim_g * 4 + i * 4 + 3)

            list_.append(
                rng1.normal(0.2, 0.5, 1)
                * np.histogram(
                    rng2.normal(
                        0 + rng3.normal(1, 3, 1), max(0, 1 + rng4.normal(1, 1, 1)), 5000
                    ),
                    bins=autocor,
                    range=(-5, 15),
                )[0]
                / 5000
            )

        filter_ = np.zeros((autocor * 2 + 1, dim_g))
        filter_[:autocor] = np.concatenate([list_], axis=1).T
        filter_ = filter_ / np.abs(filter_).sum()
        filters.append(filter_)

    data_dyn = convolve_signal(filters, np.copy(data_stat), dim)
    noise_dyn = convolve_signal(filters, np.copy(noise_stat), dim)
    # Sum of white noises: mu = sum(mu) = 0, var = sum(coef**2 * var)
    var_dyn_add = np.array(
        [np.sum(np.power(filt, 2) * var_add) for filt in filters]
    ) * np.ones((dim_n * dim_t, dim_g))

    # Sum of white noises: mu = sum(mu) = 0, var = sum(coef**2 * var)
    var_dyn_mult = convolve_signal(
        [np.power(filt, 2) for filt in filters], var_mult, dim
    )

    return (data_dyn, noise_dyn, filters, var_dyn_mult, var_dyn_add)



# def: percentile


# Noise mult + noise add + signal
def add_noise(data, noise_mult, noise_add):
    signal = (data * (1 + noise_mult)) + noise_add
    return signal
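

# Sanity-check sketch (hypothetical `_check_` helper): for independent
# zero-mean noises, Var(signal) = data**2 * var_mult + var_add, which is the
# decomposition the generator tracks below (var_*_mult + var_*_add terms).
def _check_add_noise_variance(seed=0):
    rng = np.random.RandomState(seed)
    data, var_m, var_a = 2.0, 0.04, 0.01
    signal = add_noise(
        data,
        rng.normal(0, np.sqrt(var_m), 100000),
        rng.normal(0, np.sqrt(var_a), 100000),
    )
    return signal.var(), data**2 * var_m + var_a  # both ~0.17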



def add_noise_aux(
    data_dyn,
    data_stat,
    noise_dyn_mult,
    noise_dyn_add,
    noise_stat_mult,
    noise_stat_add,
    c_dyn,
):
    signal_stat = add_noise(data_stat, noise_stat_mult, noise_stat_add)
    signal_dyn = add_noise(data_dyn, noise_dyn_mult, noise_dyn_add)
    signal = (1 - c_dyn) * signal_stat + (c_dyn) * signal_dyn
    return signal



# Core generation function.
def core_gen(
    c_dyn=0.4,  # Magnitude of the dynamic pattern.
    var_jour=0.15,  # Magnitude of the random daily amplitude.
    v_ratio=1,
    per_anom=0.05,  # Fraction of anomalies.
    var_mult=0.075,  # Multiplicative noise magnitude.
    var_add=0.01,  # Additive noise magnitude.
    f_anom=1,  # Overall anomaly magnitude.
    anom_seuil=0,  # Minimal flat impact of an anomaly.
    verbose=0,  # Display type.
    anom_point=True,  # Anomaly type.
    seasons_ctx=[],  # List of (periodicity, impact weighting).
    categorials_ctx=[],  # List of (size, effect, per).
    permutations_ctx=[],  # List of (periodicity, impact weighting).
    v_mean=200,
    dim=(400, 20, 3),  # Dimensions (dim_n, dim_t, dim_g).
    seed=None,
):

    if seed is None:
        seed = np.random.randint(2**31 - 1)

    dim_n, dim_t, dim_g = dim
    n_anom = int(per_anom * (dim_n * dim_t))
    # Generation of the basic mean and variance patterns.
    Pattern_day_mean, Pattern_day_var = gen_motif(verbose, dim=dim, seed=seed)
    layer = []
    # Generation of the seasonal influence.
    seas_ctx = [
        gen_seas_contexte(seas=seas, pond=pond, dim=dim, seed=seed + n)
        for n, (seas, pond) in enumerate(seasons_ctx)
    ]
    seas_ctx.append(np.ones(dim_n * dim_t))
    seas_ctx = np.array(seas_ctx).sum(axis=0)
    influence_ctx = np.tile(seas_ctx, dim_g).reshape(dim_g, -1).T
    layer.append(seas_ctx)

    permuts_ctx = [
        gen_permuts_contexte(seas=seas, pond=pond, dim=dim, seed=seed + n)
        for n, (seas, pond) in enumerate(permutations_ctx)
    ]
    permuts_ctx.append(np.zeros((dim_n * dim_t)))
    permuts_ctx = np.array(permuts_ctx).sum(axis=0)
    influence_ctx += np.tile(permuts_ctx, dim_g).reshape(dim_g, -1).T
    layer.append(permuts_ctx)

    cats_ctx = [
        gen_cat_contexte(size=size, effect=effect, per=per, dim=dim, seed=seed + n)
        for n, (size, effect, per) in enumerate(categorials_ctx)
    ]
    cats_ctx.append(np.zeros((dim_n * dim_t, dim_g)))
    influence_ctx += np.array(cats_ctx).sum(axis=0)
    layer.append(cats_ctx)

    influence_ctx = np.sqrt(np.abs(1 + (influence_ctx - influence_ctx.mean())))
    influence_ctx[influence_ctx < 0.05] = 0.05
    inf_max = np.percentile(influence_ctx, 99.5)
    influence_ctx[influence_ctx > inf_max] = inf_max


    # Concatenation of the mean pattern of each spatial dimension.
    Mean_pattern = np.concatenate(
        [np.tile(pattern, dim_n)[:, None] for pattern in Pattern_day_mean], axis=1
    )

    # Concatenation of the variance pattern of each spatial dimension.
    variance = np.concatenate(
        [np.tile(pattern, dim_n)[:, None] for pattern in Pattern_day_var], axis=1
    ) * np.sqrt(influence_ctx)
    variance = np.array([savgol_filter(i, 5, 3) for i in variance.T]).T

    # Random daily magnitude.
    rng = np.random.RandomState(seed)
    Ampleur_day = np.repeat(
        rng.normal(1, np.sqrt(var_jour), (dim_n, dim_g)), dim_t, axis=0
    )

    # Signal composition.
    data_stat = Mean_pattern / Mean_pattern.mean(axis=0) * influence_ctx * Ampleur_day
    data_stat = v_mean * data_stat
    layer.append(Ampleur_day)
    layer.append(Mean_pattern / Mean_pattern.mean(axis=0))

    # Generation of noises.
    var_add_apply = var_add * v_mean
    data_bruit_add = gen_noise(var_add_apply, dim, seed=seed - 1)

    # Injection of variance (combination of the variance pattern and the
    # multiplicative noise) and of the additive noise.
    var_mult_apply = np.power(np.abs(data_stat), v_ratio)
    var_mult_apply = (
        var_mult_apply / var_mult_apply.mean() * var_mult * variance
    )  # !!! reference change sigma -> var
    var_mult_apply[var_mult_apply <= 0] = 0

    data_bruit_mult = gen_noise(var_mult_apply, dim, seed=seed - 2)
    noise_stat = data_bruit_mult + data_bruit_add

    # Generation of the dynamic pattern by crossing the mean motifs with the
    # seasonal influences.
    data_dyn, noise_dyn, filters, var_dyn_mult, var_dyn_add = gen_data_dyn(
        data_stat, noise_stat, var_mult_apply, var_add_apply, c_dyn, dim, seed
    )

    data_full = add_noise_aux(data_dyn, data_stat, 0, 0, 0, 0, c_dyn)
    data_full_dyn_noised = add_noise_aux(
        data_dyn + noise_dyn, data_stat, 0, 0, 0, 0, c_dyn
    )

    data_full_stat_noised = add_noise_aux(
        data_dyn, data_stat + noise_stat, 0, 0, 0, 0, c_dyn
    )
    data_full_noised = add_noise_aux(
        data_dyn + noise_dyn, data_stat + noise_stat, 0, 0, 0, 0, c_dyn
    )


    # Injection of anomalies.
    # The anomaly magnitude takes the variance and the noise into account so
    # that anomalies remain discernible.

    # Variance referential -> coef**2
    var_dyn_mult = np.power(c_dyn, 2) * var_dyn_mult
    var_dyn_add = np.power(c_dyn, 2) * var_dyn_add  # Variance referential -> coef**2

    var_stat_mult = np.power((1 - c_dyn), 2) * var_mult_apply
    var_stat_add = np.power((1 - c_dyn), 2) * var_add_apply

    var_stat = (
        var_stat_mult + var_stat_add
    )  # Static variance (non-temporal explanatory factors)

    var_dyn = (
        var_dyn_mult + var_dyn_add
    )  # Dynamic variance (autocorrelation of the variance)

    var_full = var_dyn + var_stat

    print("test", var_stat.mean(), var_full.mean())

    # Generation of anomalies.
    motif = gen_anom_motif(anom_point, verbose=0, dim=dim, seed=seed)
    anom_list, anom, impact_anom = gen_anom(n_anom, motif, dim, seed=seed)
    # anom_list, anom, impact_anom = gen_fake_anom(n_anom, motif, dim)
    var_sum_noise = 0
    if per_anom != 0:
        var_sum_noise = scipy.stats.norm.ppf(1 - per_anom / 2, 0, np.sqrt(var_full))

    # Injection of anomalies, with magnitude scaled by the local noise level
    # so that they remain discernible.
    Y = data_full_noised + (2 * var_sum_noise * f_anom * impact_anom)

    print(
        "Var_noise_dyn_emp:",
        trunc((noise_dyn * (c_dyn)).var()),
        "var_noise_stat_emp:",
        trunc((noise_stat * (1 - c_dyn)).var()),
        "Var_emp",
        trunc((Y - data_full).var()),
    )

    per_list = [0.01, 1, 2.5, 10, 25, 75, 90, 97.5, 99, 99.99]
    per = []
    for n, i in enumerate(per_list):
        coverage = []
        per.append(
            add_noise(data_full, 0, scipy.stats.norm.ppf(i / 100, 0, np.sqrt(var_full)))
        )
        for g in range(dim_g):
            coverage.append(100 * ((Y[:, g] < per[n][:, g]).sum() / len(Y)))
        print(i, np.round(coverage, 2))


    # Features
    Features_name = []
    Features_list = []
    Features_list.append((np.arange(dim_t * dim_n) % dim_t)[:, None])
    Features_name.append("main_period")

    if len(seasons_ctx) > 0:
        for n, (i, _) in enumerate(seasons_ctx):
            Features_list.append((np.arange(dim_t * dim_n) % i)[:, None])
            Features_name.append("period_" + str(i))

    if len(permutations_ctx) > 0:
        for n, (i, _) in enumerate(permutations_ctx):
            permuts = np.random.RandomState(seed + n).permutation(
                np.arange(dim_n * dim_t)
            )
            Features_list.append((np.arange(dim_t * dim_n) % i)[permuts][:, None])
            Features_name.append("period_cat_" + str(i))

    if len(categorials_ctx) > 0:
        for n, (size, effect, per) in enumerate(categorials_ctx):
            # Flattened context impact: to modify if the impact can differ by dimension.
            X_cat = (
                gen_cat_contexte(
                    size=size, effect=effect, per=per, dim=dim, seed=seed + n
                )
                != 0
            ).astype(int)[:, 0][:, None]

            Features_list.append(X_cat)
            Features_name.append("cat_" + str(n) + "_" + str(np.max(X_cat)))

    X = np.concatenate(Features_list, axis=1)
    X_name = Features_name

    train_rf = np.arange(dim_t * dim_n) < (dim_t * dim_n * 2 / 3)
    test_rf = np.arange(dim_t * dim_n) >= (dim_t * dim_n * 2 / 3)

    print(X.shape, X_name)

    return (
        Y,
        data_full,
        data_full_stat_noised,
        data_full_dyn_noised,
        impact_anom,
        anom,
        anom_list,
        var_full,
        var_stat,
        var_dyn,
        X,
        X_name,
        train_rf,
        test_rf,
        seed,
    )



def moving_average(a, n):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return np.concatenate([ret[: n - 1] / (np.arange(n - 1) + 1), ret[n - 1:] / n])
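

# Worked example (hypothetical `_demo_` helper): the first n-1 entries are
# partial-window means, the rest are plain n-point moving averages.
def _demo_moving_average():
    out = moving_average(np.array([1.0, 2.0, 3.0, 4.0, 5.0]), n=2)
    assert np.allclose(out, [1.0, 1.5, 2.5, 3.5, 4.5])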



def factory(
    X_row, Y, Features_name, train, test, freq, dim=(400, 20, 3), lags=0, lags_mean=[]
):
    dim_n, dim_t, dim_g = dim
    Y = Y.reshape(dim_n * dim_t, dim_g)
    new_features_list = []
    new_features_name = []
    for i, name in enumerate(Features_name):
        if "period" in name:
            X_cur = X_row[:, i]
            seas = np.argmax(X_cur)
            new_features_list.append(base_cos_freq((X_cur % seas) / seas, freq))
            new_features_name.append("cos" + name)
            new_features_name.append("sin" + name)
        else:
            X_cur = X_row[:, i]
            new_features_list.append(X_cur[:, None])
            new_features_name.append(name)

    for g in range(dim_g):
        for i in lags:
            new_features_list.append(np.roll(Y[:, g], i)[:, None])
            new_features_name.append("dim:" + str(g) + " lag:" + str(i))
        for i, j in lags_mean:
            new_features_list.append(
                moving_average(np.roll(Y[:, g], i), j - i)[:, None]
            )
            new_features_name.append(
                "dim:" + str(g) + " lag_mean:" + str(i) + " " + str(j)
            )

    X = np.concatenate(new_features_list, axis=1)
    name = np.array(new_features_name)
    return (X, Y, name, train, test)



# Display of the data curve with mean and variance.
def plot_var(
    Y,
    data_full,
    variance,
    impact_anom=None,
    anom=None,
    f_obs=None,
    dim=(400, 20, 3),
    g=0,
    res_flag=False,
    fig_s=(20, 3),
    title=None,
    ylim=None,
):
    dim_n, dim_t, dim_g = dim
    anom_pred = (np.abs(impact_anom).sum(axis=-1) > 0).astype(int) - (anom > 0).astype(
        int
    )


    if anom_pred.sum() < 1:
        anom_pred[0] = 1
        anom_pred[-1] = 1

    step = g
    plt.figure(figsize=fig_s)
    plt.title(title)
    # norm = Y.mean(axis=0)

    ni = [100, 98, 95, 80, 50]
    color_full = [
        (0.5, 0.0, 0.5),
        (0.8, 0, 0),
        (0.8, 0.6, 0),
        (0, 0.8, 0),
        (0, 0.4, 0),
        (0, 0.8, 0),
        (0.8, 0.6, 0),
        (0.8, 0, 0),
        (0.5, 0.0, 0.5),
    ]
    color_full2 = [
        (0.5, 0.0, 0.5),
        (0.8, 0, 0),
        (0.8, 0.6, 0),
        (0, 0.8, 0),
        (0, 0.4, 0),
        (0, 0.4, 0),
        (0, 0.8, 0),
        (0.8, 0.6, 0),
        (0.8, 0, 0),
        (0.5, 0.0, 0.5),
    ]

    per_list = [0.01, 1, 2.5, 10, 25, 75, 90, 97.5, 99, 99.99]
    per = []

    res = data_full * 0
    if res_flag:
        res = data_full

    for i in per_list:
        per.append(
            add_noise(
                data_full - res,
                0,
                scipy.stats.norm.ppf((i / 100), 0, np.sqrt(variance)),
            ),
        )

    for i in range(len(per) - 1):
        plt.fill_between(
            np.arange(len(f_obs)),
            per[i][f_obs, step],
            per[i + 1][f_obs, step],
            color=color_full[i],
            alpha=0.20,
        )
    for i in range(len(per)):
        plt.plot(
            np.arange(len(f_obs)),
            per[i][f_obs, step],
            color=color_full2[i],
            linewidth=0.5,
            alpha=0.40,
        )
        if i > 4:
            plt.fill_between(
                [],
                [],
                [],
                color=color_full2[i],
                label=str(ni[9 - i]) + "% Coverage",
                alpha=0.20,
            )

    plt.plot(
        Y[f_obs, step] - res[f_obs, step],
        label="Series",
        color="black",
        linewidth=1.5,
        marker="o",
        ms=3,
    )
    flag = impact_anom[f_obs, step] != 0
    print(flag.sum())
    plt.plot(
        np.arange(len(f_obs))[flag],
        Y[f_obs, step][flag] - res[f_obs, step][flag],
        label="Anom",
        color="red",
        ls="",
        marker="X",
        ms=10,
        alpha=0.8,
    )

    if False:  # Disabled anomaly-shading overlay.
        f_anom = np.repeat(np.abs(impact_anom)[f_obs, step] > 0, 7)
        for i in range(8):
            f_anom += np.roll(f_anom, i - 4)
        f_anom = f_anom > 0
        plt.fill_between(
            np.arange(len(f_obs) * 7) / 7 - (3 / 7),
            -1000,
            Y[f_obs, step].max() * 1.2,
            where=f_anom,
            facecolor="blue",
            alpha=0.2,
            label="Anomaly",
            zorder=-10,
        )

    if ylim is None:
        plt.ylim(per[0][f_obs, step].min() * 1.05, per[-1][f_obs, step].max() * 1.05)
    else:
        plt.ylim(ylim[0], ylim[1])
    plt.legend(ncol=7, fontsize=14)
    plt.xlim(0, len(f_obs))
    plt.tight_layout()
    plt.show()
    return (per, per_list)



def generate_default(
    Name_data="data_test", storing="", dict_param=dict(), seed=7885000
):
    # Multivariate time series generation of size (dim_n, dim_t, dim_g): (day, hour, station).
    n_year = 10
    n_day = 3
    dim_t = 20
    dim_n = n_day * dim_t * n_year
    dim_g = 3
    dim = (dim_n, dim_t, dim_g)

    # Parameter generation
    dict_param_default = {
        "c_dyn": 0.6,  # 0.7
        "n_year": n_year,
        "n_day": n_day,
        "var_jour": 0.01,  # 0.08
        "per_anom": 0.01,
        "var_mult": 0.3,  # Multiplicative noise percentage. 0.30
        "var_add": 0.03,  # Additive noise percentage. 0.12
        "f_anom": 0.8,
        "anom_seuil": 0.01,
        "verbose": 0,
        # [(dim_t*n_day, 1.0), ((dim_n*dim_t)/n_year, 0.8)],
        "seas": [(dim_t * n_day, 2.0), ((dim_n * dim_t) / n_year, 1.5)],
        # [(dim_t*n_day, 0.5), (255, 0.2)],
        "permuts": [],
        # [(dim_t*n_day, 0.35), (255, 0.15)],
        # [(7, 0.3, 0.4), (3, 0.8, 0.2)],  # [(7, 0.4, 0.3), (1, 0.2, 0.5), (3, 0.8, 0.2)],
        "cats": [],
        # [(7, 0.3, 0.4)],
        "v_mean": 2.5,
        "anom_point": True,
    }


    for key in dict_param_default.keys():
        if key not in dict_param.keys():
            dict_param[key] = dict_param_default[key]

    # seed = 7885912

    # Generate a dataset.
    gen_variable = core_gen(
        c_dyn=dict_param["c_dyn"],
        var_jour=dict_param["var_jour"],
        v_ratio=0.3,
        per_anom=dict_param["per_anom"],
        var_mult=dict_param["var_mult"],
        var_add=dict_param["var_add"],
        f_anom=dict_param["f_anom"],
        anom_seuil=dict_param["anom_seuil"],
        verbose=dict_param["verbose"],
        anom_point=dict_param["anom_point"],
        seasons_ctx=dict_param["seas"],
        categorials_ctx=dict_param["cats"],
        permutations_ctx=dict_param["permuts"],
        v_mean=dict_param["v_mean"],
        dim=dim,
        seed=seed,
    )

    (
        Y,
        data_mean,
        data_mean_stat,
        data_mean_dyn,
        impact_anom,
        anom,
        anom_list,
        var,
        var_stat,
        var_dyn,
        X_init,
        X_name_init,
        train,
        test,
        seed,
    ) = gen_variable

    # Remove the other multivariate dimensions.
    X, Y, X_name, train, test = factory(
        X_init,
        Y,
        X_name_init,
        train,
        test,
        freq=[2],
        lags=[1, 2],
        lags_mean=[(1, int(dim_t / 2)), (1, n_day * dim_t)],
        dim=(dim_n, dim_t, dim_g),
    )

    Cat_flag = np.array(["cat" in i for i in X_name_init])
    X_cat = (
        (X_init[:, Cat_flag])
        * np.array([np.power(2, i) for i in range(0, Cat_flag.sum())])
    ).sum(axis=1)

    context = np.concatenate(
        [X_init[:, np.invert(Cat_flag)], X_cat[:, None], Y, var], axis=1
    )
    context_name = np.concatenate(
        [
            np.array(X_name_init)[np.invert(Cat_flag)].tolist(),
            ["Cat_contexts"],
            ["lag_Y_Numerical", "lag_dim1_Numerical", "lag_dim2_Numerical"],
            ["Var_Y_Numerical", "var_dim1_Numerical", "var_dim2_Numerical"],
        ]
    )

    context[:, 0] = np.floor(
        (context[:, 0] / (context[:, 0].max() / (dim_t - 0.000001)))
    ).astype(int)
    context[:, 1] = np.floor(
        (context[:, 1] / (context[:, 1].max() / (n_day - 0.000001)))
    ).astype(int)
    context[:, 2] = np.floor((context[:, 2] / (context[:, 2].max() / (5.99)))).astype(
        int
    )

    anom_pred = (np.abs(impact_anom).sum(axis=-1) > 0).astype(int) - (anom > 0).astype(
        int
    )
    if anom_pred.sum() < 1:
        anom_pred[0] = 1
        anom_pred[-1] = 1

    f_obs = np.arange(len(Y))[test][0:200]

    # Variance envelope per channel.
    for g in range(dim_g):
        per, per_info = plot_var(
            Y, data_mean, var, impact_anom, anom, f_obs=f_obs, dim=dim, g=g
        )

    f_obs = np.arange(len(Y))
    per, per_info = plot_var(
        Y, data_mean, var, impact_anom, anom, f_obs=f_obs, dim=dim, g=g
    )

    # Full static envelope:
    # per_bis, per_info = Gen.plot_var(Y, data_mean_dyn, var_stat, impact_anom, anom, f_obs=f_obs, dim=dim, g=0)

    # Dynamic variance envelope:
    # per_bis, per_info = Gen.plot_var(Y, data_mean_stat, var_dyn, impact_anom, anom, f_obs=f_obs, dim=dim, g=0)

    plt.figure(figsize=(14, 2))
    plt.subplot(1, 2, 1)
    plt.scatter(
        data_mean[:, 0],
        np.abs(Y[:, 0] - data_mean[:, 0]),
        s=1,
        c=np.arange(dim_t * dim_n) % dim_t,
        cmap="jet",
    )
    plt.subplot(1, 2, 2)
    plt.scatter(
        data_mean[:, 0], var[:, 0], s=1, c=np.arange(dim_t * dim_n) % dim_t, cmap="jet"
    )
    plt.show()

    dict_data = {}
    dict_data["X"] = X
    dict_data["Y"] = Y
    dict_data["context"] = context
    dict_data["X_name"] = X_name
    dict_data["train"] = train
    dict_data["test"] = test
    dict_data["X_split"] = np.arange(len(X)) // (len(X) / 5)
    dict_data["context_name"] = context_name
    dict_data["aux"] = {
        "data_mean": data_mean,
        "data_mean_dyn": data_mean_dyn,
        "impact_anom": impact_anom,
        "anom": anom,
        "var": var,
        "var_dyn": var_dyn,
        "var_stat": var_stat,
        "f_obs": f_obs,
        "dim": dim,
    }

    # write(storing, [Name_data], dict_data)
    return dict_data