Coverage for dqm/representativeness/utils.py: 47%

1"""

2This module implements two classes, DiscretisationParams and VariableAnalysis,

3providing functionality for variable counting, countplot visualization,

4and discretization of variables using normal or uniform distributions.

5It also includes functions for processing data for chi-square tests, calculating

6expected values, and generating histograms for observed and expected values.

8Authors:

9 Faouzi ADJED

10 Anani DJATO

12Dependencies:

13 numpy

14 pandas

15 matplotlib.pyplot

16 scipy.stats

17 dqm.utils.twe_logger

18 seaborn

20Functions : None

22Classes:

23 DiscretisationParams: Class for defining discretization parameters

24 VariableAnalysis: Class for analyzing data distribution

26Example:

27from utils import VariableAnalysis, DiscretisationParams

29# Example of using VariableAnalysis class

30variable_analyzer = VariableAnalysis()

32# Example of using the variable_counting method

33my_variable = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])

34counts = variable_analyzer.variable_counting(my_variable)

35print("Counts of unique values:")

36print(counts)

38# Example of using the countplot method

39variable_analyzer.countplot(my_variable)

40plt.show()

# Instantiate the DiscretisationParams class
# (the constructor takes the data and a dictionary of distribution parameters)
discretisation_params = DiscretisationParams(
    my_variable,
    {
        'theory': 'normal',
        'empirical': [-1.0, 0.0, 1.0, 2.0],
        'mean': 0.0,
        'std': 1.0,
    }
)

50"""

52from typing import Optional, List, Union

53import numpy as np

54import pandas as pd

55from matplotlib import pyplot as plt

56from scipy import stats

57from dqm.utils.twe_logger import get_logger

58import seaborn as sns

60logger = get_logger()

class DiscretisationParams:
    """
    Bundle a data set with the distribution settings used to discretise it.

    Args:
        data: Input data.
        distribution_params: Dictionary of distribution parameters. All four
            keys below are read, so all four must be present:
            'theory': Distribution theory ('normal' or 'uniform').
            'empirical': Empirical distribution used for discretization.
            'mean': Mean parameter for the distribution theory.
            'std': Standard deviation for the distribution theory.

    Attributes:
        data: The input data, stored as given.
        distribution_theory: Value of the 'theory' entry.
        distribution_empirical: Value of the 'empirical' entry.
        mean: Value of the 'mean' entry.
        std: Value of the 'std' entry.

    Methods:
        to_dict: Return the five attributes as a dictionary.
        get_data: Return the stored input data.
    """

    def __init__(self, data, distribution_params):
        """
        Store the data and unpack the distribution parameters.

        Args:
            data (pd.Series): Input data.
            distribution_params (dict): Dictionary holding the 'theory',
                'empirical', 'mean' and 'std' entries.

        Raises:
            KeyError: If one of the four entries is missing.
        """
        settings = distribution_params
        self.data = data
        self.distribution_theory = settings['theory']
        self.distribution_empirical = settings['empirical']
        self.mean = settings['mean']
        self.std = settings['std']

    def to_dict(self):
        """
        Export the parameters as a dictionary.

        Returns:
            dict: Maps 'data', 'distribution_theory', 'distribution_empirical',
            'mean' and 'std' to the corresponding attribute values.

        Note:
            This method is not strictly required; it exists so that the class
            offers at least two public methods.
        """
        exported = (
            'data',
            'distribution_theory',
            'distribution_empirical',
            'mean',
            'std',
        )
        return {name: getattr(self, name) for name in exported}

    def get_data(self):
        """
        Return the stored input data.

        Returns:
            Any: The object that was passed as ``data`` to the constructor.
        """
        return self.data

150

151

152class VariableAnalysis:

153 """

154 This class provides functions for variable counting, countplot visualization,

155 and discretization of variables using normal or uniform distributions.

156 It includes functions for processing data for chi-square tests,

157 calculating expected values, and generating histograms for observed and expected values.

158

159 Args: None

160

161 Methods:

162 variable_counting

163 countplot

164 discretisation

165 normal_discretization

166 data_processing_for_chisqure_test

167 uniform_discretization

168 discretisation_intervals

169 delete_na

170 expected

171 expected_hist

172 observed_hist

173 """

174

def variable_counting(self, variable: pd.Series) -> pd.DataFrame:
    """
    Tally how often each distinct value occurs in ``variable``.

    Intended for integer values and categorical modalities; it is not meant
    for float values.

    Args:
        variable (pd.Series): Values to count.

    Returns:
        pd.DataFrame: One column named "count", indexed by the distinct
        values and ordered by that index.
    """
    occurrences = variable.value_counts().sort_index()
    table = occurrences.to_frame()
    table.columns = ["count"]
    return table

190

def countplot(self, variable: pd.Series) -> Optional[None]:
    """
    Draw a bar plot with the number of observations in every category.

    Candidate for removal from the final package (still to be decided).

    Args:
        variable (pd.Series): Values whose categories are counted.

    Returns:
        None. A new 10x5 matplotlib figure is opened and the seaborn count
        plot is drawn on it; the figure is not shown by this method.
    """
    figure_size = (10, 5)
    plt.figure(figsize=figure_size)
    sns.countplot(x=variable)

204

def discretisation(
    self,
    variable: pd.Series,
    distribution: str,
    bins: int
) -> List[Union[float, int]]:
    """
    Compute the edges of ``bins`` equiprobable intervals for ``variable``.

    The theoretical law is fitted on the data itself: mean and standard
    deviation (``np.mean`` / ``np.std``) for 'normal', observed minimum and
    maximum for 'uniform'. The inner edges are the i/bins quantiles of that
    law for i = 1 .. bins - 1.

    Args:
        variable (pd.Series): Data used to fit the distribution parameters.
        distribution (str): 'normal' or 'uniform'.
        bins (int): Number of intervals.

    Returns:
        list: ``bins + 1`` edges, starting with -inf and ending with +inf.
        For any other ``distribution`` value (or ``bins <= 1``) no inner edge
        is produced and the result is ``[-inf, inf]``.
    """
    if distribution == 'normal':
        mean = np.mean(variable)
        std = np.std(variable)
        cut_points = [stats.norm.ppf(i / bins, mean, std) for i in range(1, bins)]
    elif distribution == 'uniform':
        min_value = variable.min()
        # scipy parameterises the uniform law as [loc, loc + scale]: the scale
        # is the width of the observed range, not its maximum. Passing the
        # maximum would stretch the support to [min, min + max].
        spread = variable.max() - min_value
        cut_points = [
            stats.uniform.ppf(i / bins, min_value, spread) for i in range(1, bins)
        ]
    else:
        # Unknown theory: keep the historical behaviour of a single open interval.
        cut_points = []

    return [-np.inf, *cut_points, np.inf]

240

def normal_discretization(
    self,
    bins: int,
    mean: float,
    std: float
) -> List[float]:
    """
    Compute the edges of ``bins`` equiprobable intervals of a normal law.

    Args:
        bins (int): Number of intervals.
        mean (float): Mean of the gaussian distribution.
        std (float): Standard deviation of the gaussian distribution.

    Returns:
        list: The i/bins quantiles of the normal law for i = 1 .. bins - 1,
        preceded by -inf and followed by +inf.
    """
    quantile_levels = (step / bins for step in range(1, bins))
    inner_edges = [stats.norm.ppf(level, mean, std) for level in quantile_levels]
    return [-np.inf] + inner_edges + [np.inf]

265

def uniform_discretization(
    self,
    bins: int,
    min_value: float,
    max_value: float,
) -> List[float]:
    """
    Compute the edges of ``bins`` equiprobable intervals of a uniform law.

    The inner edges are the i/bins quantiles (inverse transform via
    ``scipy.stats.uniform.ppf``) of the uniform distribution on
    [min_value, max_value], for i = 1 .. bins - 1.

    Args:
        bins (int): Number of bins.
        min_value (float): Minimum value for the uniform distribution.
        max_value (float): Maximum value for the uniform distribution.

    Returns:
        list: Interval edges; the first element is negative infinity and the
        last element is positive infinity.
    """
    # scipy describes the uniform law by (loc, scale) with support
    # [loc, loc + scale], so the documented maximum has to be converted into
    # a width before it is handed to ppf.
    width = max_value - min_value
    inner_edges = [
        stats.uniform.ppf(i / bins, min_value, width) for i in range(1, bins)
    ]
    return [-np.inf, *inner_edges, np.inf]

293

def data_processing_for_chisqure_test(
    self,
    data: Union[pd.Series, pd.DataFrame]
) -> Union[pd.Series, pd.DataFrame]:
    """
    Prepare input data for a chi-square test.

    Data of object dtype ('O') is assumed to be categorical and is replaced
    by its value counts, because chi-square tests work on frequency
    distributions. Any other data is returned untouched.

    Args:
        data (pd.Series | pd.DataFrame): Input data. A DataFrame is treated
            as categorical only when it has at least one column and every
            column has object dtype.

    Returns:
        pd.Series | pd.DataFrame: The value counts of ``data`` when it is
        categorical, otherwise ``data`` itself.
    """
    dtypes = data.dtypes
    if isinstance(data, pd.DataFrame):
        # DataFrame.dtypes is a Series (one dtype per column). Using the
        # element-wise comparison directly as an `if` condition raises
        # "The truth value of a Series is ambiguous", so reduce it explicitly.
        is_categorical = len(dtypes) > 0 and bool((dtypes == 'O').all())
    else:
        is_categorical = dtypes == 'O'

    if is_categorical:
        data = data.value_counts()
    return data

314

def discretisation_intervals(
    self,
    params: DiscretisationParams
) -> Optional[pd.DataFrame]:
    """
    Count observed and expected values inside each empirical interval.

    Consecutive entries of ``params.distribution_empirical`` are used as the
    (lower_limit, upper_limit] bounds of the intervals. For each interval the
    method counts how many observed values and how many expected values fall
    inside it. Both 'normal' and 'uniform' distribution theories are supported.

    The expected values are a random sample of the same size as the cleaned
    data, drawn by ``self.expected``; ``exp_freq`` therefore varies between
    calls unless NumPy's global random state is seeded.

    Args:
        params (DiscretisationParams): Parameters for discretization.
            For the 'normal' theory ``mean`` and ``std`` are passed as such;
            for the 'uniform' theory ``mean`` is used as the lower bound and
            ``mean + std`` as the upper bound of the sampled range.

    Returns:
        Optional[pd.DataFrame]: One row per interval with the columns
        'lower_limit', 'upper_limit', 'obs_freq' and 'exp_freq' (frequencies
        stored as floats). None if an unsupported distribution theory is
        provided.

    Note:
        Missing values are removed first. When some were dropped, an info
        message is logged. In every case ``params.data`` is overwritten with
        the cleaned data, so the caller's object is modified.

    Example:
        interval_data = discretisation_intervals(
            DiscretisationParams(
                data, {
                    'theory': 'normal',
                    'empirical': distribution_empirical,
                    'mean': mean, 'std': std
                }
            )
        )
        if interval_data is not None:
            logger.info(interval_data)
    """
    cleaned = self.delete_na(params.data)
    dropped = len(params.data) - len(cleaned)
    if dropped:
        logger.info("the data is not complete, there are %s missed items", dropped)
    params.data = cleaned

    theory = params.distribution_theory
    if theory == "normal":
        exp = self.expected(theory, params.data, params.mean, params.std)
    elif theory == "uniform":
        min_value = params.mean
        max_value = params.mean + params.std
        exp = self.expected(theory, params.data, min_value, max_value)
    else:
        # Unsupported theory: nothing to compare against.
        return None

    edges = params.distribution_empirical
    intervals = pd.DataFrame(
        {'lower_limit': edges[:-1], 'upper_limit': edges[1:]}
    )

    # Column vectors, so that comparing against a 1-D sample broadcasts to an
    # (intervals x values) membership matrix.
    lower = intervals['lower_limit'].to_numpy(dtype=float)[:, np.newaxis]
    upper = intervals['upper_limit'].to_numpy(dtype=float)[:, np.newaxis]

    for column, sample in (('obs_freq', params.data), ('exp_freq', exp)):
        values = np.asarray(sample, dtype=float)
        inside = (lower < values) & (values <= upper)
        intervals[column] = inside.sum(axis=1).astype(float)

    return intervals

381

def delete_na(self, data: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``data`` without its missing values (NaN).

    Args:
        data (pd.DataFrame): The input data, possibly containing missing values.

    Returns:
        pd.DataFrame: The result of ``data.dropna()``. A Series input gives a
        Series, a DataFrame input gives a DataFrame.
    """
    return data.dropna()

397

def expected(
    self,
    distribution: str,
    data: List[float],
    *argv: float
) -> List[float]:
    """
    Draw a random sample of expected values with as many items as ``data``.

    Args:
        distribution (str): 'normal' or 'uniform'. Any value other than
            'normal' is handled as 'uniform'.
        data (List[float]): Input data; only its length is used.
        *argv: The two parameters of the distribution: (mean, std) for
            'normal', (min_value, max_value) for 'uniform'.

    Returns:
        Sample of ``len(data)`` values produced by ``np.random.normal`` or
        ``np.random.uniform``.

    Raises:
        IndexError: If fewer than two distribution parameters are given.
    """
    first_param = argv[0]
    second_param = argv[1]
    sampler = np.random.normal if distribution == 'normal' else np.random.uniform
    return sampler(first_param, second_param, len(data))

426

427 def observed_hist(self, variable: pd.Series) -> None:

428 """

429 Plot the observed values of the distribution

430

431 Args:

432 variable (pd.Series): Input variable.

433

434 Returns:

435 None (plots histogram)

436 """

437 if variable.dtypes == 'int64':

438 plt.figure(figsize=(10, 5))

439 plt.hist(variable, bins=len(variable.unique()))

440 plt.xlabel(variable.name)

441 elif variable.dtypes == 'O':

442 plt.figure(figsize=(10, 5))

443 plt.bar(variable.value_counts().index, variable.value_counts())

444 plt.xlabel(variable.name)

445 elif variable.dtypes == 'bool':

446 plt.figure(figsize=(10, 5))

447 true = variable.value_counts()[True]

448 false = variable.value_counts()[False]

449 plt.bar(['True', 'False'], [true, false])

450 plt.xlabel(variable.name)