# Coverage for uqmodels/data_generation/Gen_multivariate_normal_data.py: 83% (386 statements),
# coverage.py v7.13.0, created at 2025-12-09 08:15 +0000

import math

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from scipy.signal import convolve2d, savgol_filter


def base_cos_freq(array, freq):
    """Build a cos/sin Fourier feature matrix: one (cos, sin) pair per frequency."""
    list_ = []
    for i in freq:
        list_.append(np.cos(i * math.pi * array))
        list_.append(np.sin(i * math.pi * array))
    return np.concatenate(list_).reshape(len(freq) * 2, -1).T
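
# Example usage (illustrative sketch, not part of the original module; `t` is a
# hypothetical normalized time index):
#   t = np.arange(20) / 20
#   feats = base_cos_freq(t, freq=[2, 4])  # shape (20, 4): cos/sin columns per frequency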


def trunc(x):
    """Round values for display."""
    return np.round(x, 5)


# Gaussian constant associated with a given percentile.


# Motif generation
def gen_motif(verbose=0, dim=(400, 20, 3), seed=0):
    """Generate, for each of the dim_g dimensions, a daily mean profile and a daily
    variance profile of length dim_t, built from smoothed random-mixture histograms."""
    dim_n, dim_t, dim_g = dim
    Pattern_day_mean = []
    Pattern_day_var = []
    Day_Motif = np.ones(dim_t)
    n_melange = 12

    # Attenuate the edges of the daily motif.
    for n, i in enumerate([0.25, 0.5, 0.70, 0.90]):
        Day_Motif[n] = i
        Day_Motif[-n] = i

    for g in range(dim_g):
        # Mean profile: histogram of a random Gaussian mixture, smoothed.
        dist = []
        for i in range(n_melange):
            rng0 = np.random.RandomState(seed + g * n_melange * 4 + i * 4 + 0)
            rng1 = np.random.RandomState(seed + g * n_melange * 4 + i * 4 + 1)
            rng2 = np.random.RandomState(seed + g * n_melange * 4 + i * 4 + 2)
            rng3 = np.random.RandomState(seed + g * n_melange * 4 + i * 4 + 3)

            dist.append(
                rng0.normal(
                    rng1.uniform(0, 1),
                    rng2.uniform(0.02, 0.3),
                    np.maximum(10000, rng3.normal(2000, 1000)).astype(int),
                )
            )
        a = np.roll(np.histogram(dist, bins=dim_t)[0], rng0.randint(dim_t))
        Pattern_day_mean.append(0.03 + Day_Motif * savgol_filter((a) / (a).sum(), 9, 4))

        # Variance profile: same construction with shifted seeds.
        dist = []
        for i in range(n_melange):
            rng0 = np.random.RandomState(
                seed + g * n_melange * 4 + i * 4 + 0 + dim_g * n_melange * 4
            )
            rng1 = np.random.RandomState(
                seed + g * n_melange * 4 + i * 4 + 1 + dim_g * n_melange * 4
            )
            rng2 = np.random.RandomState(
                seed + g * n_melange * 4 + i * 4 + 2 + dim_g * n_melange * 4
            )
            rng3 = np.random.RandomState(
                seed + g * n_melange * 4 + i * 4 + 3 + dim_g * n_melange * 4
            )

            dist.append(
                rng0.normal(
                    rng1.uniform(0, 1),
                    rng2.uniform(0.02, 0.3),
                    np.maximum(10000, rng3.normal(2000, 1000)).astype(int),
                )
            )
        a = np.roll(np.histogram(dist, bins=dim_t)[0], rng0.randint(dim_t))
        var_aux = np.maximum(
            np.zeros(dim_t) + 0.0001, savgol_filter(dim_t * ((a) / (a).sum()), 9, 4)
        )
        Pattern_day_var.append(var_aux)
    if verbose != 0:
        plt.figure(figsize=(14, 4))
        plt.subplot(2, 1, 1)
        [plt.plot(i) for i in Pattern_day_mean]
        plt.subplot(2, 1, 2)
        [plt.plot(i) for i in Pattern_day_var]
        plt.show()
    return (Pattern_day_mean, Pattern_day_var)
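
# Example usage (illustrative sketch, not part of the original module):
#   means, variances = gen_motif(verbose=0, dim=(400, 20, 3), seed=0)
#   # len(means) == len(variances) == 3; each entry is a length-20 daily profile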


# Seasonal pattern generation
def gen_seas_contexte(seas=140, pond=3, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(int(seas + seed))
    dim_n, dim_t, dim_g = dim
    seas = (
        np.cos(2 * math.pi * (rng.rand() + np.arange(dim_t * dim_n) / seas))
    ) * pond  # pond: seasonal magnitude weighting
    return seas
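
# Example usage (illustrative sketch, not part of the original module):
#   seas_signal = gen_seas_contexte(seas=140, pond=3, dim=(400, 20, 3), seed=0)
#   # 1-D array of length 400 * 20 = 8000: a cosine with a 140-step period, scaled by pond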


# Randomly permuted seasonal pattern (seasonal signal shuffled in time)
def gen_permuts_contexte(seas=140, pond=3, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(seed)
    dim_n, dim_t, dim_g = dim
    seas = (
        np.cos(2 * math.pi * (rng.rand() + np.arange(dim_t * dim_n) / seas))
    ) * pond  # pond: seasonal magnitude weighting
    perm = rng.permutation(np.arange(dim_n * dim_t))
    seas = seas[perm]
    return seas


# Random categorical pattern: n_event windows of length `size`, each shifting
# the mean by `effect`.
def gen_cat_contexte(size=5, effect=0.2, per=0.1, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(seed)
    dim_n, dim_t, dim_g = dim
    n_event = int((per * (dim_n * dim_t)) / size)
    impact_ctx_mean = np.zeros((dim_t * dim_n, dim_g))
    for n, i in enumerate(rng.choice(np.arange(dim_n * dim_t - size), n_event, False)):
        impact_ctx_mean[i: (i + size), :] += effect
    return impact_ctx_mean
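
# Example usage (illustrative sketch, not part of the original module):
#   impact = gen_cat_contexte(size=5, effect=0.2, per=0.1, dim=(400, 20, 3), seed=0)
#   # shape (8000, 3); about 10% of the timesteps fall inside length-5 events with a +0.2 offset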


# Anomaly pattern generation:
# experiments have mainly been carried out with point anomalies.
# Detection of non-point anomalies requires a specific methodology that is not
# defined in this implementation.
def gen_anom_motif(anom_point=True, verbose=False, dim=(400, 20, 3), seed=0):
    dim_n, dim_t, dim_g = dim
    rng = np.random.RandomState(seed)
    rng1 = np.random.RandomState(seed + 1)
    if anom_point:
        p2 = np.array([1, 0, 0, 0, 0])
        P_list = [p2]
        motif1 = np.stack([P_list[int(i)] for i in np.zeros(dim_g)])

        motif = [motif1, -motif1]
    # Non-point anomaly case.
    else:
        # Poisson-law motifs.
        # p1 = np.histogram(np.random.gamma(np.random.normal(11, 2, 1),
        #                                   scale=np.random.normal(0.8, 0, 1),
        #                                   size=10000),
        #                   bins=5)[0] / 10000
        # p2 = np.histogram(np.random.gamma(np.random.normal(11, 2, 1),
        #                                   scale=np.random.normal(0.8, 0, 1),
        #                                   size=10000),
        #                   bins=5)[0] / 10000

        # Manual motifs.
        # p0 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0])
        p0 = np.array([1.7, 1.7, 1.4, 1.4, 1, 1, 1, 0.2, 0.1])
        p1 = np.array([1.4, 1.1, 0.7, 0.4, 0.3, 0, 0, 0, 0])
        p2 = np.array([1, 1.5, 1.3, 0.8, -0.8, -1, -0.8, 0, 0])
        p3 = np.array([0.5, 1.1, -0.6, -0.7, -0.3, 0, 0, 0, 0])

        P_list = [p0, p1, p2, p3]

        motif1 = np.stack([P_list[int(i)] for i in np.ones(dim_g)])
        motif2 = np.stack(
            [P_list[int(i)] for i in rng.choice(np.arange(len(P_list)), dim_g)]
        )
        motif3 = np.stack(
            [P_list[int(i)] for i in rng1.choice(np.arange(len(P_list)), dim_g)]
        )

        motif = [motif1, -motif1, motif2, motif3]
    txt_anom = [
        "Weak positive anomaly",
        "Strong negative anomaly",
        "Weak negative anomaly",
        "Strong negative anomaly",
    ]
    if verbose:
        fig = plt.figure(figsize=(6, 4))
        for n, i in enumerate(motif[0:4]):
            plt.subplot(2, 2, n + 1)
            plt.title(txt_anom[n])
            im = plt.imshow(i * (1 + (n // 2) * 0.4), vmin=-2, vmax=2, cmap="seismic")
            fig.colorbar(im, orientation="vertical", shrink=1)
            plt.xticks(np.arange(5), np.arange(5) - 1)
        plt.show()
    return motif
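
# Example usage (illustrative sketch, not part of the original module):
#   motifs = gen_anom_motif(anom_point=True, dim=(400, 20, 3), seed=0)
#   # two opposite point motifs, each of shape (3, 5)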


# Anomaly generation at fixed positions
def gen_fake_anom(n_anom, motif, dim=(400, 20, 3)):
    dim_n, dim_t, dim_g = dim
    len_motif = motif[0].shape[1]
    anom = np.zeros((dim_n * dim_t))
    impact_anom = np.zeros((dim_t * dim_n, dim_g))
    anom_list = []
    ttype = [1, 2, 1, 1]
    for n, i in enumerate(
        [
            dim_t + dim_t / 3,
            2 * dim_t + dim_t / 4,
            2 * dim_t + dim_t * 4 / 5,
            3 * dim_t + dim_t * 1 / 2,
        ]
    ):
        i = int(i)
        anom[i] = i
        anom_list.append((i // dim_t, i % dim_t, i, 1))
        impact_anom[i: (i + len_motif), :] += (motif[ttype[n] - 1]).T
    return (anom_list, anom, impact_anom)


# Anomaly generation at random positions
def gen_anom(n_anom, motif, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(seed)
    dim_n, dim_t, dim_g = dim
    n_motif = len(motif)
    len_motif = motif[0].shape[1]
    anom = np.zeros((dim_n * dim_t))
    impact_anom = np.zeros((dim_t * dim_n, dim_g))
    anom_list = []
    for n, i in enumerate(
        rng.choice(np.arange(dim_n * dim_t - len_motif), n_anom, False)
    ):
        rng_bis = np.random.RandomState(seed + n)
        type_ = rng_bis.randint(1, n_motif + 1)
        anom[i] = type_ * (1 + (n % 2))
        anom_list.append((i // dim_t, i % dim_t, type_, n % 2))
        # impact_anom[i:(i + len_motif), :] += (motif[type_ - 1] * (1 + (n % 2) * 0.5)).T
        impact_anom[i: (i + len_motif), :] += (motif[type_ - 1]).T
    return (anom_list, anom, impact_anom)
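
# Example usage (illustrative sketch, not part of the original module):
#   motifs = gen_anom_motif(anom_point=True, dim=(400, 20, 3), seed=0)
#   anom_list, anom, impact = gen_anom(n_anom=40, motif=motifs, dim=(400, 20, 3), seed=0)
#   # anom: label vector of length 8000; impact: additive anomaly pattern of shape (8000, 3)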


# Noise generation
def gen_noise(var, dim=(400, 20, 3), seed=0):
    rng = np.random.RandomState(seed)
    dim_n, dim_t, dim_g = dim
    noise = rng.normal(0, np.sqrt(var), (dim_t * dim_n, dim_g))
    return noise
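
# Example usage (illustrative sketch, not part of the original module):
#   noise = gen_noise(var=0.01, dim=(400, 20, 3), seed=0)  # shape (8000, 3), standard deviation 0.1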


# Per-dimension convolution of the signal, with one filter per output dimension
def convolve_signal(filters, signal, dim):
    dim_n, dim_t, dim_g = dim
    signal_conv = []
    for g in range(dim_g):
        signal_conv.append(
            convolve2d(signal, filters[g][:, ::-1], mode="same", boundary="wrap")[
                :, g
            ].reshape((-1, 1))
        )
    signal_conv = np.concatenate(signal_conv, axis=1)
    return signal_conv
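
# Example usage (illustrative sketch, not part of the original module): applying
# the same 3x3 averaging filter to every output dimension.
#   dim = (400, 20, 3)
#   sig = np.random.RandomState(0).normal(size=(400 * 20, 3))
#   filt = np.ones((3, 3)) / 9.0
#   smoothed = convolve_signal([filt, filt, filt], sig, dim)  # shape (8000, 3)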


# Generation of a dynamic pattern through an arbitrary autoregressive filter,
# plus propagation of the noise variances through the same filters.
def gen_data_dyn(
    data_stat,
    noise_stat,
    var_mult,
    var_add,
    c_dyn,
    dim=(400, 20, 3),
    verbose=True,
    seed=0,
):
    dim_n, dim_t, dim_g = dim

    autocor = int(dim_t / 4) + (int(dim_t / 4)) % 2
    # signal_dyn = [
    #     [],
    #     [],
    #     [],
    #     [],
    # ]  # noisy mean, noise-free mean, noise_mult, noise_add
    filters = []
    for g in range(dim_g):
        list_ = []
        for i in range(dim_g):  # Autocorrelation pattern generation
            rng1 = np.random.RandomState(seed + 100 + g * dim_g * 4 + i * 4)
            rng2 = np.random.RandomState(seed + 100 + g * dim_g * 4 + i * 4 + 1)
            rng3 = np.random.RandomState(seed + 100 + g * dim_g * 4 + i * 4 + 2)
            rng4 = np.random.RandomState(seed + 100 + g * dim_g * 4 + i * 4 + 3)

            list_.append(
                rng1.normal(0.2, 0.5, 1)
                * np.histogram(
                    rng2.normal(
                        0 + rng3.normal(1, 3, 1), max(0, 1 + rng4.normal(1, 1, 1)), 5000
                    ),
                    bins=autocor,
                    range=(-5, 15),
                )[0]
                / 5000
            )

        filter_ = np.zeros((autocor * 2 + 1, dim_g))
        filter_[:autocor] = np.concatenate([list_], axis=1).T
        filter_ = filter_ / np.abs(filter_).sum()
        filters.append(filter_)

    data_dyn = convolve_signal(filters, np.copy(data_stat), dim)
    noise_dyn = convolve_signal(filters, np.copy(noise_stat), dim)
    # Sum of white-noise terms: mean = sum(0) = 0, variance = sum(coef^2 * var).
    var_dyn_add = np.array(
        [np.sum(np.power(filt, 2) * var_add) for filt in filters]
    ) * np.ones((dim_n * dim_t, dim_g))

    # Same rule for the multiplicative part, propagated through the squared filters.
    var_dyn_mult = convolve_signal(
        [np.power(filt, 2) for filt in filters], var_mult, dim
    )

    return (data_dyn, noise_dyn, filters, var_dyn_mult, var_dyn_add)
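
# Example usage (illustrative sketch, not part of the original module; the inputs
# mimic what core_gen passes in):
#   dim = (400, 20, 3)
#   data_stat = np.random.RandomState(0).normal(1.0, 0.1, size=(8000, 3))
#   noise_stat = gen_noise(0.01, dim=dim, seed=1)
#   data_dyn, noise_dyn, filters, var_dyn_mult, var_dyn_add = gen_data_dyn(
#       data_stat, noise_stat, var_mult=np.full((8000, 3), 0.01), var_add=0.01,
#       c_dyn=0.4, dim=dim, seed=0,
#   )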


# Apply multiplicative and additive noise to a signal.
def add_noise(data, noise_mult, noise_add):
    signal = (data * (1 + noise_mult)) + noise_add
    return signal


# Mix the noised static and dynamic components with weight c_dyn.
def add_noise_aux(
    data_dyn,
    data_stat,
    noise_dyn_mult,
    noise_dyn_add,
    noise_stat_mult,
    noise_stat_add,
    c_dyn,
):
    signal_stat = add_noise(data_stat, noise_stat_mult, noise_stat_add)
    signal_dyn = add_noise(data_dyn, noise_dyn_mult, noise_dyn_add)
    signal = (1 - c_dyn) * signal_stat + (c_dyn) * signal_dyn
    return signal
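
# Example usage (illustrative sketch, not part of the original module):
#   y = add_noise(data=np.ones(5), noise_mult=0.1 * np.ones(5), noise_add=0.01 * np.ones(5))
#   # every entry equals 1 * (1 + 0.1) + 0.01 = 1.11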


# Core generation function.
def core_gen(
    c_dyn=0.4,  # Magnitude of the dynamic pattern.
    var_jour=0.15,  # Magnitude of the random daily amplitude.
    v_ratio=1,  # Exponent linking the mean level to the multiplicative variance.
    per_anom=0.05,  # Fraction of anomalies.
    var_mult=0.075,  # Multiplicative noise magnitude.
    var_add=0.01,  # Additive noise magnitude.
    f_anom=1,  # Overall anomaly magnitude.
    anom_seuil=0,  # Minimal flat impact of an anomaly.
    verbose=0,  # Display type.
    anom_point=True,  # Anomaly type.
    seasons_ctx=[],  # List of (periodicity, impact weighting).
    categorials_ctx=[],  # List of (size, effect, per).
    permutations_ctx=[],  # List of (periodicity, impact weighting), permuted in time.
    v_mean=200,
    dim=(400, 20, 3),  # Dimensions (dim_n, dim_t, dim_g).
    seed=None,
):
    """Generate a multivariate series with contextual influences, autoregressive
    dynamics, heteroscedastic noise and injected anomalies, plus ground-truth
    components, features and a train/test split."""
    if seed is None:
        seed = np.random.randint(2**31 - 1)

    dim_n, dim_t, dim_g = dim
    n_anom = int(per_anom * (dim_n * dim_t))
    # Generation of the basic mean and variance patterns.
    Pattern_day_mean, Pattern_day_var = gen_motif(verbose, dim=dim, seed=seed)
    layer = []
    # Generation of the seasonal influences.
    seas_ctx = [
        gen_seas_contexte(seas=seas, pond=pond, dim=dim, seed=seed + n)
        for n, (seas, pond) in enumerate(seasons_ctx)
    ]
    seas_ctx.append(np.ones(dim_n * dim_t))
    seas_ctx = np.array(seas_ctx).sum(axis=0)
    influence_ctx = np.tile(seas_ctx, dim_g).reshape(dim_g, -1).T
    layer.append(seas_ctx)

    permuts_ctx = [
        gen_permuts_contexte(seas=seas, pond=pond, dim=dim, seed=seed + n)
        for n, (seas, pond) in enumerate(permutations_ctx)
    ]
    permuts_ctx.append(np.zeros((dim_n * dim_t)))
    permuts_ctx = np.array(permuts_ctx).sum(axis=0)
    influence_ctx += np.tile(permuts_ctx, dim_g).reshape(dim_g, -1).T
    layer.append(permuts_ctx)

    cats_ctx = [
        gen_cat_contexte(size=size, effect=effect, per=per, dim=dim, seed=seed + n)
        for n, (size, effect, per) in enumerate(categorials_ctx)
    ]
    cats_ctx.append(np.zeros((dim_n * dim_t, dim_g)))
    influence_ctx += np.array(cats_ctx).sum(axis=0)
    layer.append(cats_ctx)

    influence_ctx = np.sqrt(np.abs(1 + (influence_ctx - influence_ctx.mean())))
    influence_ctx[influence_ctx < 0.05] = 0.05
    inf_max = np.percentile(influence_ctx, 99.5)
    influence_ctx[influence_ctx > inf_max] = inf_max

    # Concatenation of the mean pattern of each spatial dimension.
    Mean_pattern = np.concatenate(
        [np.tile((pattern), dim_n)[:, None] for pattern in Pattern_day_mean], axis=1
    )

    # Concatenation of the variance pattern of each spatial dimension.
    variance = np.concatenate(
        [np.tile((pattern), dim_n)[:, None] for pattern in Pattern_day_var], axis=1
    ) * np.sqrt(influence_ctx)
    variance = np.array([savgol_filter(i, 5, 3) for i in variance.T]).T

    # Random daily magnitude.
    rng = np.random.RandomState(seed)
    Ampleur_day = np.repeat(
        rng.normal(1, np.sqrt(var_jour), (dim_n, dim_g)), dim_t, axis=0
    )

    # Signal composition.
    data_stat = Mean_pattern / Mean_pattern.mean(axis=0) * influence_ctx * Ampleur_day
    data_stat = v_mean * data_stat
    layer.append(Ampleur_day)
    layer.append(Mean_pattern / Mean_pattern.mean(axis=0))

    # Generation of the noises.
    var_add_apply = var_add * v_mean
    data_bruit_add = gen_noise(var_add_apply, dim, seed=seed - 1)

    # Injection of variance (combination of the variance pattern and multiplicative
    # noise) and of additive noise.
    var_mult_apply = np.power(np.abs(data_stat), v_ratio)
    var_mult_apply = (
        var_mult_apply / var_mult_apply.mean() * var_mult * variance
    )  # !!! reference changed from sigma to var
    var_mult_apply[var_mult_apply <= 0] = 0

    data_bruit_mult = gen_noise(var_mult_apply, dim, seed=seed - 2)
    noise_stat = data_bruit_mult + data_bruit_add

    # Generation of the dynamic pattern by filtering the static mean and noise.
    data_dyn, noise_dyn, filters, var_dyn_mult, var_dyn_add = gen_data_dyn(
        data_stat, noise_stat, var_mult_apply, var_add_apply, c_dyn, dim, seed
    )

    data_full = add_noise_aux(data_dyn, data_stat, 0, 0, 0, 0, c_dyn)
    data_full_dyn_noised = add_noise_aux(
        data_dyn + noise_dyn, data_stat, 0, 0, 0, 0, c_dyn
    )

    data_full_stat_noised = add_noise_aux(
        data_dyn, data_stat + noise_stat, 0, 0, 0, 0, c_dyn
    )
    data_full_noised = add_noise_aux(
        data_dyn + noise_dyn, data_stat + noise_stat, 0, 0, 0, 0, c_dyn
    )

    # Variance bookkeeping (variance referential -> squared mixing coefficients).
    var_dyn_mult = np.power(c_dyn, 2) * var_dyn_mult
    var_dyn_add = np.power(c_dyn, 2) * var_dyn_add

    var_stat_mult = np.power((1 - c_dyn), 2) * var_mult_apply
    var_stat_add = np.power((1 - c_dyn), 2) * var_add_apply

    var_stat = (
        var_stat_mult + var_stat_add
    )  # Static variance (non-temporal explanatory factors)

    var_dyn = (
        var_dyn_mult + var_dyn_add
    )  # Dynamic variance (autocorrelation of the variance)

    var_full = var_dyn + var_stat

    print("test", var_stat.mean(), var_full.mean())

    # Generation of the anomalies.
    motif = gen_anom_motif(anom_point, verbose=0, dim=dim, seed=seed)
    anom_list, anom, impact_anom = gen_anom(n_anom, motif, dim, seed=seed)
    # anom_list, anom, impact_anom = gen_fake_anom(n_anom, motif, dim)
    var_sum_noise = 0
    if per_anom != 0:
        var_sum_noise = scipy.stats.norm.ppf(1 - per_anom / 2, 0, np.sqrt(var_full))

    # Injection of the anomalies.
    # The anomaly magnitude takes the variance and the noise into account so that
    # anomalies remain discernible.
    Y = data_full_noised + (2 * var_sum_noise * f_anom * impact_anom)

    print(
        "Var_noise_dyn_emp:",
        trunc((noise_dyn * (c_dyn)).var()),
        "var_noise_stat_emp:",
        trunc((noise_stat * (1 - c_dyn)).var()),
        "Var_emp",
        trunc((Y - data_full).var()),
    )

    # Empirical coverage check of the Gaussian predictive quantiles.
    per_list = [0.01, 1, 2.5, 10, 25, 75, 90, 97.5, 99, 99.99]
    per = []
    for n, i in enumerate(per_list):
        coverage = []
        per.append(
            add_noise(data_full, 0, scipy.stats.norm.ppf(i / 100, 0, np.sqrt(var_full)))
        )
        for g in range(dim_g):
            coverage.append(100 * ((Y[:, g] < per[n][:, g]).sum() / len(Y)))
        print(i, np.round(coverage, 2))

    # Features.
    Features_name = []
    Features_list = []
    Features_list.append((np.arange(dim_t * dim_n) % dim_t)[:, None])
    Features_name.append("main_period")

    if len(seasons_ctx) > 0:
        for n, (i, _) in enumerate(seasons_ctx):
            Features_list.append((np.arange(dim_t * dim_n) % i)[:, None])
            Features_name.append("period_" + str(i))

    if len(permutations_ctx) > 0:
        for n, (i, _) in enumerate(permutations_ctx):
            permuts = np.random.RandomState(seed + n).permutation(
                np.arange(dim_n * dim_t)
            )
            Features_list.append((np.arange(dim_t * dim_n) % i)[permuts][:, None])
            Features_name.append("period_cat_" + str(i))

    if len(categorials_ctx) > 0:
        for n, (size, effect, per) in enumerate(categorials_ctx):
            # Flattened context impact: to modify if the impact can differ by dimension.
            X_cat = (
                gen_cat_contexte(
                    size=size, effect=effect, per=per, dim=dim, seed=seed + n
                )
                != 0
            ).astype(int)[:, 0][:, None]

            Features_list.append(X_cat)
            Features_name.append("cat_" + str(n) + "_" + str(np.max(X_cat)))

    X = np.concatenate(Features_list, axis=1)
    X_name = Features_name

    train_rf = np.arange(dim_t * dim_n) < (dim_t * dim_n * 2 / 3)
    test_rf = np.arange(dim_t * dim_n) >= (dim_t * dim_n * 2 / 3)

    print(X.shape, X_name)

    return (
        Y,
        data_full,
        data_full_stat_noised,
        data_full_dyn_noised,
        impact_anom,
        anom,
        anom_list,
        var_full,
        var_stat,
        var_dyn,
        X,
        X_name,
        train_rf,
        test_rf,
        seed,
    )
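
# Example usage (illustrative sketch, not part of the original module; seed chosen
# so that the internal seed - 1 / seed - 2 offsets stay non-negative):
#   outputs = core_gen(per_anom=0.02, seasons_ctx=[(60, 2.0)], dim=(50, 20, 3), seed=5)
#   Y = outputs[0]                        # noisy multivariate series, shape (50 * 20, 3)
#   X, X_name = outputs[10], outputs[11]  # context features and their names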


def moving_average(a, n):
    """Causal moving average of window n, with a growing window for the first n - 1 points."""
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return np.concatenate([ret[: n - 1] / (np.arange(n - 1) + 1), ret[n - 1:] / n])
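
# Example (illustrative sketch, not part of the original module):
#   moving_average(np.arange(5, dtype=float), 3)
#   # -> array([0., 0.5, 1., 2., 3.])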


# Build the learning feature matrix: periodic indices become cos/sin pairs, and
# lagged values / lagged moving averages of each output dimension are appended.
def factory(
    X_row, Y, Features_name, train, test, freq, dim=(400, 20, 3), lags=[], lags_mean=[]
):
    dim_n, dim_t, dim_g = dim
    Y = Y.reshape(dim_n * dim_t, dim_g)
    new_features_list = []
    new_features_name = []
    for i, name in enumerate(Features_name):
        if "period" in name:
            X_cur = X_row[:, i]
            seas = np.argmax(X_cur)
            new_features_list.append(base_cos_freq((X_cur % seas) / seas, freq))
            new_features_name.append("cos" + name)
            new_features_name.append("sin" + name)
        else:
            X_cur = X_row[:, i]
            new_features_list.append(X_cur[:, None])
            new_features_name.append(name)

    for g in range(dim_g):
        for i in lags:
            new_features_list.append(np.roll(Y[:, g], i)[:, None])
            new_features_name.append("dim:" + str(g) + " lag:" + str(i))
        for i, j in lags_mean:
            new_features_list.append(
                moving_average(np.roll(Y[:, g], i), j - i)[:, None]
            )
            new_features_name.append(
                "dim:" + str(g) + " lag_mean:" + str(i) + " " + str(j)
            )

    X = np.concatenate(new_features_list, axis=1)
    name = np.array(new_features_name)
    return (X, Y, name, train, test)
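
# Example usage (illustrative sketch, not part of the original module; X_init, Y,
# X_name_init, train and test are assumed to come from core_gen, as in
# generate_default below):
#   X_new, Y2, names, train, test = factory(
#       X_init, Y, X_name_init, train, test,
#       freq=[2], dim=(400, 20, 3), lags=[1, 2], lags_mean=[(1, 10)],
#   )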


# Display of the data curve with its mean and Gaussian predictive bands.
def plot_var(
    Y,
    data_full,
    variance,
    impact_anom=None,
    anom=None,
    f_obs=None,
    dim=(400, 20, 3),
    g=0,
    res_flag=False,
    fig_s=(20, 3),
    title=None,
    ylim=None,
):
    dim_n, dim_t, dim_g = dim
    anom_pred = (np.abs(impact_anom).sum(axis=-1) > 0).astype(int) - (anom > 0).astype(
        int
    )

    if anom_pred.sum() < 1:
        anom_pred[0] = 1
        anom_pred[-1] = 1

    step = g
    plt.figure(figsize=fig_s)
    plt.title(title)
    # norm = Y.mean(axis=0)

    ni = [100, 98, 95, 80, 50]
    color_full = [
        (0.5, 0.0, 0.5),
        (0.8, 0, 0),
        (0.8, 0.6, 0),
        (0, 0.8, 0),
        (0, 0.4, 0),
        (0, 0.8, 0),
        (0.8, 0.6, 0),
        (0.8, 0, 0),
        (0.5, 0.0, 0.5),
    ]
    color_full2 = [
        (0.5, 0.0, 0.5),
        (0.8, 0, 0),
        (0.8, 0.6, 0),
        (0, 0.8, 0),
        (0, 0.4, 0),
        (0, 0.4, 0),
        (0, 0.8, 0),
        (0.8, 0.6, 0),
        (0.8, 0, 0),
        (0.5, 0.0, 0.5),
    ]

    per_list = [0.01, 1, 2.5, 10, 25, 75, 90, 97.5, 99, 99.99]
    per = []

    res = data_full * 0
    if res_flag:
        res = data_full

    for i in per_list:
        per.append(
            add_noise(
                data_full - res,
                0,
                scipy.stats.norm.ppf((i / 100), 0, np.sqrt(variance)),
            ),
        )

    for i in range(len(per) - 1):
        plt.fill_between(
            np.arange(len(f_obs)),
            per[i][f_obs, step],
            per[i + 1][f_obs, step],
            color=color_full[i],
            alpha=0.20,
        )
    for i in range(len(per)):
        plt.plot(
            np.arange(len(f_obs)),
            per[i][f_obs, step],
            color=color_full2[i],
            linewidth=0.5,
            alpha=0.40,
        )
        if i > 4:
            plt.fill_between(
                [],
                [],
                [],
                color=color_full2[i],
                label=str(ni[9 - i]) + "% Coverage",
                alpha=0.20,
            )

    plt.plot(
        Y[f_obs, step] - res[f_obs, step],
        label="Series",
        color="black",
        linewidth=1.5,
        marker="o",
        ms=3,
    )
    flag = impact_anom[f_obs, step] != 0
    print(flag.sum())
    plt.plot(
        np.arange(len(f_obs))[flag],
        Y[f_obs, step][flag] - res[f_obs, step][flag],
        label="Anom",
        color="red",
        ls="",
        marker="X",
        ms=10,
        alpha=0.8,
    )

    if False:
        f_anom = np.repeat(np.abs(impact_anom)[f_obs, step] > 0, 7)
        for i in range(8):
            f_anom += np.roll(f_anom, i - 4)
        f_anom = f_anom > 0
        plt.fill_between(
            np.arange(len(f_obs) * 7) / 7 - (3 / 7),
            -1000,
            Y[f_obs, step].max() * 1.2,
            where=f_anom,
            facecolor="blue",
            alpha=0.2,
            label="Anomaly",
            zorder=-10,
        )

    if ylim is None:
        plt.ylim(per[0][f_obs, step].min() * 1.05, per[-1][f_obs, step].max() * 1.05)
    else:
        plt.ylim(ylim[0], ylim[1])
    plt.legend(ncol=7, fontsize=14)
    plt.xlim(0, len(f_obs))
    plt.tight_layout()
    plt.show()
    return (per, per_list)
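
# Example usage (illustrative sketch, not part of the original module; Y, data_full,
# var_full, impact_anom and anom are assumed to come from core_gen):
#   per, per_list = plot_var(Y, data_full, var_full, impact_anom, anom,
#                            f_obs=np.arange(400), dim=(400, 20, 3), g=0)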


def generate_default(
    Name_data="data_test", storing="", dict_param=dict(), seed=7885000
):
    # Multivariate time series generation of size (dim_n, dim_t, dim_g) : (day, hour, station).
    n_year = 10
    n_day = 3
    dim_t = 20
    dim_n = n_day * dim_t * n_year
    dim_g = 3
    dim = (dim_n, dim_t, dim_g)

    # Generation parameters.
    dict_param_default = {
        "c_dyn": 0.6,  # 0.7
        "n_year": n_year,
        "n_day": n_day,
        "var_jour": 0.01,  # 0.08
        "per_anom": 0.01,
        "var_mult": 0.3,  # Fraction of multiplicative noise. 0.30
        "var_add": 0.03,  # Fraction of additive noise. 0.12
        "f_anom": 0.8,
        "anom_seuil": 0.01,
        "verbose": 0,
        # [(dim_t*n_day, 1.0), ((dim_n*dim_t)/n_year, 0.8)],
        "seas": [(dim_t * n_day, 2.0), ((dim_n * dim_t) / n_year, 1.5)],
        # [(dim_t*n_day, 0.5), (255, 0.2)],
        "permuts": [],
        # [(dim_t*n_day, 0.35), (255, 0.15)],
        # [(7, 0.3, 0.4), (3, 0.8, 0.2)],  # [(7, 0.4, 0.3), (1, 0.2, 0.5), (3, 0.8, 0.2)],
        "cats": [],
        # [(7, 0.3, 0.4)],
        "v_mean": 2.5,
        "anom_point": True,
    }

    for key in dict_param_default.keys():
        if key not in dict_param.keys():
            dict_param[key] = dict_param_default[key]

    # seed = 7885912

    # Generation of a dataset.
    gen_variable = core_gen(
        c_dyn=dict_param["c_dyn"],
        var_jour=dict_param["var_jour"],
        v_ratio=0.3,
        per_anom=dict_param["per_anom"],
        var_mult=dict_param["var_mult"],
        var_add=dict_param["var_add"],
        f_anom=dict_param["f_anom"],
        anom_seuil=dict_param["anom_seuil"],
        verbose=dict_param["verbose"],
        anom_point=dict_param["anom_point"],
        seasons_ctx=dict_param["seas"],
        categorials_ctx=dict_param["cats"],
        permutations_ctx=dict_param["permuts"],
        v_mean=dict_param["v_mean"],
        dim=dim,
        seed=seed,
    )

    (
        Y,
        data_mean,
        data_mean_stat,
        data_mean_dyn,
        impact_anom,
        anom,
        anom_list,
        var,
        var_stat,
        var_dyn,
        X_init,
        X_name_init,
        train,
        test,
        seed,
    ) = gen_variable

    # Feature engineering: periodic encodings, lags and lagged moving averages.
    X, Y, X_name, train, test = factory(
        X_init,
        Y,
        X_name_init,
        train,
        test,
        freq=[2],
        lags=[1, 2],
        lags_mean=[(1, int(dim_t / 2)), (1, n_day * dim_t)],
        dim=(dim_n, dim_t, dim_g),
    )

    # Encode the categorical context columns as a single integer code.
    Cat_flag = np.array(["cat" in i for i in X_name_init])
    X_cat = (
        (X_init[:, Cat_flag])
        * np.array([np.power(2, i) for i in range(0, Cat_flag.sum())])
    ).sum(axis=1)

    context = np.concatenate(
        [X_init[:, np.invert(Cat_flag)], X_cat[:, None], Y, var], axis=1
    )
    context_name = np.concatenate(
        [
            np.array(X_name_init)[np.invert(Cat_flag)].tolist(),
            ["Cat_contexts"],
            ["lag_Y_Numerical", "lag_dim1_Numerical", "lag_dim2_Numerical"],
            ["Var_Y_Numerical", "var_dim1_Numerical", "var_dim2_Numerical"],
        ]
    )

    # Discretize the periodic context columns.
    context[:, 0] = np.floor(
        (context[:, 0] / (context[:, 0].max() / (dim_t - 0.000001)))
    ).astype(int)
    context[:, 1] = np.floor(
        (context[:, 1] / (context[:, 1].max() / (n_day - 0.000001)))
    ).astype(int)
    context[:, 2] = np.floor((context[:, 2] / (context[:, 2].max() / (5.99)))).astype(
        int
    )

    anom_pred = (np.abs(impact_anom).sum(axis=-1) > 0).astype(int) - (anom > 0).astype(
        int
    )
    if anom_pred.sum() < 1:
        anom_pred[0] = 1
        anom_pred[-1] = 1

    f_obs = np.arange(len(Y))[test][0:200]

    # Predictive envelope per dimension.
    for g in range(dim_g):
        per, per_info = plot_var(
            Y, data_mean, var, impact_anom, anom, f_obs=f_obs, dim=dim, g=g
        )

    f_obs = np.arange(len(Y))
    per, per_info = plot_var(
        Y, data_mean, var, impact_anom, anom, f_obs=f_obs, dim=dim, g=g
    )

    # Envelope of the static variance.
    # per_bis, per_info = Gen.plot_var(Y, data_mean_dyn, var_stat, impact_anom, anom, f_obs=f_obs, dim=dim, g=0)

    # Envelope of the dynamic variance.
    # per_bis, per_info = Gen.plot_var(Y, data_mean_stat, var_dyn, impact_anom, anom, f_obs=f_obs, dim=dim, g=0)

    plt.figure(figsize=(14, 2))
    plt.subplot(1, 2, 1)
    plt.scatter(
        data_mean[:, 0],
        np.abs(Y[:, 0] - data_mean[:, 0]),
        s=1,
        c=np.arange(dim_t * dim_n) % dim_t,
        cmap="jet",
    )
    plt.subplot(1, 2, 2)
    plt.scatter(
        data_mean[:, 0], var[:, 0], s=1, c=np.arange(dim_t * dim_n) % dim_t, cmap="jet"
    )
    plt.show()

    dict_data = {}
    dict_data["X"] = X
    dict_data["Y"] = Y
    dict_data["context"] = context
    dict_data["X_name"] = X_name
    dict_data["train"] = train
    dict_data["test"] = test
    dict_data["X_split"] = np.arange(len(X)) // (len(X) / 5)
    dict_data["context_name"] = context_name
    dict_data["aux"] = {
        "data_mean": data_mean,
        "data_mean_dyn": data_mean_dyn,
        "impact_anom": impact_anom,
        "anom": anom,
        "var": var,
        "var_dyn": var_dyn,
        "var_stat": var_stat,
        "f_obs": f_obs,
        "dim": dim,
    }

    # write(storing, [Name_data], dict_data)
    return dict_data
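
# Example usage (illustrative sketch, not part of the original module; note that
# generate_default displays several matplotlib figures as a side effect):
#   dict_data = generate_default(dict_param={"per_anom": 0.02}, seed=7885000)
#   # dict_data keys: "X", "Y", "context", "X_name", "train", "test", "X_split",
#   # "context_name", "aux"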