Coverage for dqm/representativeness/utils.py: 47%

114 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-05 14:00 +0000

1""" 

2This module implements two classes, DiscretisationParams and VariableAnalysis, 

3providing functionality for variable counting, countplot visualization, 

4and discretization of variables using normal or uniform distributions. 

5It also includes functions for processing data for chi-square tests, calculating 

6expected values, and generating histograms for observed and expected values. 

7 

8Authors: 

9 Faouzi ADJED 

10 Anani DJATO 

11 

12Dependencies: 

13 numpy 

14 pandas 

15 matplotlib.pyplot 

16 scipy.stats 

17 dqm.utils.twe_logger 

18 seaborn 

19 

20Functions : None 

21 

22Classes: 

23 DiscretisationParams: Class for defining discretization parameters 

24 VariableAnalysis: Class for analyzing data distribution 

25 

Example:
    import pandas as pd
    from matplotlib import pyplot as plt
    from dqm.representativeness.utils import VariableAnalysis, DiscretisationParams

    # Example of using VariableAnalysis class
    variable_analyzer = VariableAnalysis()

    # Example of using the variable_counting method
    my_variable = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
    counts = variable_analyzer.variable_counting(my_variable)
    print("Counts of unique values:")
    print(counts)

    # Example of using the countplot method
    variable_analyzer.countplot(my_variable)
    plt.show()

    # Instantiate the DiscretisationParams class: the distribution settings
    # are passed as a single dictionary, not as keyword arguments.
    discretisation_params = DiscretisationParams(
        my_variable,
        {
            'theory': 'normal',
            'empirical': [-1.0, 0.0, 1.0, 2.0],
            'mean': 0.0,
            'std': 1.0,
        },
    )

50""" 

51 

52from typing import Optional, List, Union 

53import numpy as np 

54import pandas as pd 

55from matplotlib import pyplot as plt 

56from scipy import stats 

57from dqm.utils.twe_logger import get_logger 

58import seaborn as sns 

59 

60logger = get_logger() 

61 

62 

class DiscretisationParams:
    """
    Parameters for discretization.

    Bundles the input data with the settings of the theoretical distribution
    it is compared against.

    Args:
        data: Input data.
        distribution_params: Dictionary containing distribution parameters.
            'theory': Distribution theory ('normal' or 'uniform').
            'empirical': Empirical distribution used for discretization.
            'mean': Mean parameter for the distribution theory.
            'std': Standard deviation for the distribution theory.

    Attributes:
        data: The input data, stored as given.
        distribution_theory: Value of distribution_params['theory'].
        distribution_empirical: Value of distribution_params['empirical'].
        mean: Value of distribution_params['mean'].
        std: Value of distribution_params['std'].
    """

    # Keys that must be present in ``distribution_params``.
    _REQUIRED_KEYS = ('theory', 'empirical', 'mean', 'std')

    def __init__(self, data, distribution_params: dict) -> None:
        """
        Initializes an instance of the DiscretisationParams class.

        Args:
            data (pd.Series): Input data.
            distribution_params (dict): Dictionary containing distribution parameters.
                'theory': Distribution theory ('normal' or 'uniform').
                'empirical': Empirical distribution used for discretization.
                'mean': Mean parameter for the distribution theory.
                'std': Standard deviation for the distribution theory.

        Raises:
            KeyError: If one or more of the required keys are missing. The
                message lists every missing key.
        """
        # Report all missing keys at once instead of failing on the first
        # lookup with a message that only shows a single key name.
        missing = [key for key in self._REQUIRED_KEYS if key not in distribution_params]
        if missing:
            raise KeyError(
                "distribution_params is missing required key(s): " + ", ".join(missing)
            )

        self.data = data
        self.distribution_theory = distribution_params['theory']
        self.distribution_empirical = distribution_params['empirical']
        self.mean = distribution_params['mean']
        self.std = distribution_params['std']

    def to_dict(self) -> dict:
        """
        Convert the parameters to a dictionary.

        Returns:
            dict: The stored values under the keys 'data',
                'distribution_theory', 'distribution_empirical', 'mean'
                and 'std'.
        """
        return {
            'data': self.data,
            'distribution_theory': self.distribution_theory,
            'distribution_empirical': self.distribution_empirical,
            'mean': self.mean,
            'std': self.std
        }

    def get_data(self):
        """
        Get the input data.

        Returns:
            Any: The input data.
        """
        return self.data

150 

151 

class VariableAnalysis:
    """
    This class provides functions for variable counting, countplot visualization,
    and discretization of variables using normal or uniform distributions.
    It includes functions for processing data for chi-square tests,
    calculating expected values, and generating histograms for observed values.

    Args: None

    Methods:
        variable_counting
        countplot
        discretisation
        normal_discretization
        uniform_discretization
        data_processing_for_chisqure_test
        discretisation_intervals
        delete_na
        expected
        observed_hist
    """

    @staticmethod
    def _quantile_edges(distribution, bins: int, loc: float, scale: float) -> List[float]:
        """
        Build bin edges from the quantiles of a scipy.stats distribution.

        Args:
            distribution: A scipy.stats distribution exposing ``ppf``
                (``stats.norm`` or ``stats.uniform`` here).
            bins (int): Number of bins.
            loc (float): scipy ``loc`` parameter.
            scale (float): scipy ``scale`` parameter.

        Returns:
            list: ``-inf``, the quantiles at i / bins for i = 1 .. bins - 1,
                then ``+inf``.
        """
        probabilities = np.arange(1, bins) / bins
        inner = np.atleast_1d(distribution.ppf(probabilities, loc=loc, scale=scale))
        return [-np.inf, *inner.tolist(), np.inf]

    @staticmethod
    def _interval_counts(values, lower_limits, upper_limits) -> List[float]:
        """
        Count, for each interval, the values v with lower < v <= upper.

        Counts are returned as floats, one per (lower, upper) pair.
        """
        array = np.asarray(values)
        return [
            float(np.count_nonzero((array > lower) & (array <= upper)))
            for lower, upper in zip(lower_limits, upper_limits)
        ]

    def variable_counting(self, variable: pd.Series) -> pd.DataFrame:
        """
        Counting unique values (only int values and modalities.
        It cannot be used for float values)

        Args:
            variable (pandas.Series)

        Returns:
            variable_count (DataFrame): counts of unique values in a single
                'count' column, sorted by index (the unique values).
        """
        return variable.value_counts().sort_index().to_frame(name="count")

    def countplot(self, variable: pd.Series) -> None:
        """
        This function will not be used and will be deleted in the final package (to decide)
        Show the counts of observations of every category

        Args:
            variable (pandas.Series)

        Returns:
            None. A new 10x5 figure is created and a seaborn countplot is
            drawn on it; the caller is responsible for showing it.
        """
        plt.figure(figsize=(10, 5))
        sns.countplot(x=variable)

    def discretisation(
        self,
        variable: pd.Series,
        distribution: str,
        bins: int
    ) -> List[Union[float, int]]:
        """ Discretisation of variable into bins

        The distribution parameters are estimated from the variable itself:
        mean and (population) standard deviation for 'normal', minimum and
        maximum for 'uniform'.

        Args:
            variable (Series)
            distribution (string): 'normal' or 'uniform'
            bins (int)

        Returns:
            interval (list): bin edges, starting with -inf and ending with
                +inf. For any other distribution name only [-inf, inf] is
                returned.
        """
        if distribution == 'normal':
            return self._quantile_edges(stats.norm, bins, np.mean(variable), np.std(variable))

        if distribution == 'uniform':
            min_value = variable.min()
            max_value = variable.max()
            # scipy's uniform is defined on [loc, loc + scale], so the scale
            # must be the range of the data, not its maximum.
            return self._quantile_edges(stats.uniform, bins, min_value, max_value - min_value)

        return [-np.inf, np.inf]

    def normal_discretization(
        self,
        bins: int,
        mean: float,
        std: float
    ) -> List[float]:
        """
        normal Discretisation of variable into bins

        Args:
            bins (int): Number of bins.
            mean (float): the first parameter of the gaussian distribution
            std (float): standard deviation of the gaussian distribution

        Returns:
            interval (list): bin edges at the i / bins quantiles of the
                normal distribution, with -inf first and +inf last.
        """
        return self._quantile_edges(stats.norm, bins, mean, std)

    def uniform_discretization(
        self,
        bins: int,
        min_value: float,
        max_value: float,
    ) -> List[float]:
        """
        This function discretizes a variable with a uniform distribution into specified bins.
        It uses the inverse transform method with the scipy.stats.uniform.ppf function.

        Args:
            bins (int): Number of bins.
            min_value (float): Passed to scipy as ``loc``.
            max_value (float): Passed to scipy as ``scale``.

        Returns:
            interval (list): Discretized variable into bins.
                The list includes intervals with the first element representing negative infinity
                and the last element representing positive infinity.

        Note:
            NOTE(review): max_value is forwarded unchanged as scipy's
            ``scale``, so the edges are quantiles of the uniform distribution
            on [min_value, min_value + max_value]. This is kept as-is because
            callers may rely on it -- confirm whether a true maximum was
            intended.
        """
        return self._quantile_edges(stats.uniform, bins, min_value, max_value)

    def data_processing_for_chisqure_test(
        self,
        data: Union[pd.Series, pd.DataFrame]
    ) -> Union[pd.Series, pd.DataFrame]:
        """
        This function is designed to preprocess the input data for chi-square tests.
        If the data type is object ('O'), it is assumed to be categorical,
        and the function converts it into value counts.

        Args:
            data (pd.Series or pd.DataFrame): Input data.

        Returns:
            data: ``data.value_counts()`` when a Series has object dtype, or
                when a DataFrame has at least one column and all its columns
                have object dtype; otherwise the input, unchanged.
        """
        if isinstance(data, pd.DataFrame):
            # DataFrame.dtypes is a Series: comparing it to 'O' inside an
            # ``if`` is ambiguous for several columns, so test each column.
            column_dtypes = list(data.dtypes)
            is_categorical = bool(column_dtypes) and all(
                dtype == 'O' for dtype in column_dtypes
            )
        else:
            is_categorical = data.dtypes == 'O'

        if is_categorical:
            data = data.value_counts()
        return data

    def discretisation_intervals(
        self,
        params: "DiscretisationParams"
    ) -> Optional[pd.DataFrame]:
        """
        This function discretizes a given set of data into intervals based on
        empirical distribution and calculates observed and expected frequencies
        for each interval. It supports both normal and uniform distribution theories.

        Expected frequencies are obtained by drawing a random sample of the
        same size as the data (see ``expected``), so they vary between calls
        unless numpy's global random state is seeded.

        Args:
            params (DiscretisationParams): Parameters for discretization.
                For 'uniform', ``params.mean`` is used as the lower bound and
                ``params.mean + params.std`` as the upper bound.

        Returns:
            intervals (Optional[DataFrame]): One row per pair of consecutive
                edges in ``params.distribution_empirical`` with columns
                'lower_limit', 'upper_limit', 'obs_freq' and 'exp_freq'.
                A value v belongs to an interval when lower < v <= upper.
                Returns None if an unsupported distribution theory is provided.

        Note:
            If the data contains missing values they are dropped, an info
            message is logged and ``params.data`` is replaced by the cleaned
            data.

        Example:
            interval_data = discretisation_intervals(
                DiscretisationParams(
                    data, {
                        'theory': 'normal',
                        'empirical': distribution_empirical,
                        'mean': mean, 'std': std
                    }
                )
            )
            if interval_data is not None:
                logger.info(interval_data)
        """
        processed_data = self.delete_na(params.data)
        deleted_data = len(params.data) - len(processed_data)
        if deleted_data:
            logger.info("the data is not complete, there are %s missed items", deleted_data)
            params.data = processed_data

        theory = params.distribution_theory
        if theory == "normal":
            simulated = self.expected(theory, params.data, params.mean, params.std)
        elif theory == "uniform":
            min_value = params.mean
            max_value = params.mean + params.std
            simulated = self.expected(theory, params.data, min_value, max_value)
        else:
            return None

        lower_limits = params.distribution_empirical[:-1]
        upper_limits = params.distribution_empirical[1:]
        intervals = pd.DataFrame(
            {'lower_limit': lower_limits, 'upper_limit': upper_limits}
        )
        intervals['obs_freq'] = self._interval_counts(params.data, lower_limits, upper_limits)
        intervals['exp_freq'] = self._interval_counts(simulated, lower_limits, upper_limits)
        return intervals

    def delete_na(
        self,
        data: Union[pd.Series, pd.DataFrame]
    ) -> Union[pd.Series, pd.DataFrame]:
        """
        Remove missing values (NaN) from the input data.

        Args:
            data (pd.Series or pd.DataFrame): The input data containing missing values.

        Returns:
            data: The result of ``data.dropna()``. If the input is a Series,
                the output will also be a Series. If the input is a DataFrame,
                the output will be a DataFrame.
        """
        return data.dropna()

    def expected(
        self,
        distribution: str,
        data: List[float],
        *argv: float
    ) -> np.ndarray:
        """
        Draw a random sample from the theoretical distribution.

        Args:
            distribution (str): 'normal'; any other value is treated as
                'uniform'.
            data (List[float]): Input data. Only its length is used.
            *argv : Parameters of the distribution: (mean, std) for 'normal',
                (min_value, max_value) for 'uniform'.

        Returns:
            numpy.ndarray: ``len(data)`` values drawn with numpy's global
                random generator.
        """
        first_param, second_param = argv[0], argv[1]
        sample_size = len(data)

        if distribution == 'normal':
            return np.random.normal(first_param, second_param, sample_size)

        # Assuming distribution is 'uniform'
        return np.random.uniform(first_param, second_param, sample_size)

    def observed_hist(self, variable: pd.Series) -> None:
        """
        Plot the observed values of the distribution

        Args:
            variable (pd.Series): Input variable. Only 'int64', object and
                'bool' dtypes are plotted; for any other dtype nothing is
                drawn.

        Returns:
            None (plots histogram)
        """
        dtype = variable.dtypes

        if dtype == 'int64':
            plt.figure(figsize=(10, 5))
            plt.hist(variable, bins=len(variable.unique()))
        elif dtype == 'O':
            counts = variable.value_counts()
            plt.figure(figsize=(10, 5))
            plt.bar(counts.index, counts)
        elif dtype == 'bool':
            counts = variable.value_counts()
            plt.figure(figsize=(10, 5))
            # A value that never occurs is absent from value_counts(), so
            # default its count to 0 instead of raising KeyError.
            plt.bar(['True', 'False'], [counts.get(True, 0), counts.get(False, 0)])
        else:
            return

        plt.xlabel(variable.name)