Coverage for dqm/representativeness/utils.py: 47%
114 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-05 14:00 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-05 14:00 +0000
1"""
2This module implements two classes, DiscretisationParams and VariableAnalysis,
3providing functionality for variable counting, countplot visualization,
4and discretization of variables using normal or uniform distributions.
5It also includes functions for processing data for chi-square tests, calculating
6expected values, and generating histograms for observed and expected values.
8Authors:
9 Faouzi ADJED
10 Anani DJATO
12Dependencies:
13 numpy
14 pandas
15 matplotlib.pyplot
16 scipy.stats
17 dqm.utils.twe_logger
18 seaborn
20Functions : None
22Classes:
23 DiscretisationParams: Class for defining discretization parameters
24 VariableAnalysis: Class for analyzing data distribution
26Example:
27from utils import VariableAnalysis, DiscretisationParams
29# Example of using VariableAnalysis class
30variable_analyzer = VariableAnalysis()
32# Example of using the variable_counting method
33my_variable = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
34counts = variable_analyzer.variable_counting(my_variable)
35print("Counts of unique values:")
36print(counts)
38# Example of using the countplot method
39variable_analyzer.countplot(my_variable)
40plt.show()
# Instantiate the DiscretisationParams class: the constructor takes the data
# and a single dictionary holding the distribution parameters
discretisation_params = DiscretisationParams(
    my_variable,
    {
        'theory': 'normal',
        'empirical': [-1.0, 0.0, 1.0, 2.0],
        'mean': 0.0,
        'std': 1.0,
    },
)
50"""
52from typing import Optional, List, Union
53import numpy as np
54import pandas as pd
55from matplotlib import pyplot as plt
56from scipy import stats
57from dqm.utils.twe_logger import get_logger
58import seaborn as sns
60logger = get_logger()
class DiscretisationParams:
    """
    Bundle a data sample with the parameters used to discretise it.

    Args:
        data: Input data.
        distribution_params: Mapping holding the distribution parameters.
            'theory': Distribution theory ('normal' or 'uniform').
            'empirical': Empirical distribution used for discretization.
            'mean': Mean parameter for the distribution theory.
            'std': Standard deviation for the distribution theory.

    Attributes:
        data: The input data, stored as given.
        distribution_theory: Value found under 'theory'.
        distribution_empirical: Value found under 'empirical'.
        mean: Value found under 'mean'.
        std: Value found under 'std'.

    Methods:
        to_dict: Return all stored parameters as a dictionary.
        get_data: Return the stored input data.
    """

    # (attribute name, key in ``distribution_params``), in the order the
    # keys are read by ``__init__`` and reported by ``to_dict``.
    _PARAM_KEYS = (
        ('distribution_theory', 'theory'),
        ('distribution_empirical', 'empirical'),
        ('mean', 'mean'),
        ('std', 'std'),
    )

    def __init__(self, data, distribution_params):
        """
        Store the data and copy each distribution parameter onto the instance.

        Args:
            data (pd.Series): Input data.
            distribution_params (dict): Dictionary containing distribution parameters.
                'theory': Distribution theory ('normal' or 'uniform').
                'empirical': Empirical distribution used for discretization.
                'mean': Mean parameter for the distribution theory.
                'std': Standard deviation for the distribution theory.

        Raises:
            KeyError: If one of the four keys is missing from
                ``distribution_params``.
        """
        self.data = data
        for attribute, key in self._PARAM_KEYS:
            setattr(self, attribute, distribution_params[key])

    def to_dict(self):
        """
        Return the stored parameters as a dictionary.

        Returns:
            dict: Keys 'data', 'distribution_theory', 'distribution_empirical',
            'mean' and 'std', mapped to the corresponding attributes.
        """
        names = ('data',) + tuple(attribute for attribute, _ in self._PARAM_KEYS)
        return {name: getattr(self, name) for name in names}

    def get_data(self):
        """
        Return the input data held by this instance.

        Returns:
            Any: The input data.
        """
        return self.data
class VariableAnalysis:
    """
    Helpers for analysing the distribution of a variable.

    The class provides value counting, count/histogram plots, discretisation
    of a variable into quantile bins of a normal or uniform distribution,
    and the data preparation used for chi-square tests (observed and
    expected frequencies per interval).

    Args: None

    Methods:
        variable_counting
        countplot
        discretisation
        normal_discretization
        uniform_discretization
        data_processing_for_chisqure_test
        discretisation_intervals
        delete_na
        expected
        observed_hist
    """

    @staticmethod
    def _quantile_edges(ppf, bins, loc, scale):
        """
        Build bin edges from the ``i / bins`` quantiles of a distribution.

        Args:
            ppf: Percent-point function called as ``ppf(q, loc, scale)``.
            bins (int): Number of bins.
            loc: Location parameter forwarded to ``ppf``.
            scale: Scale parameter forwarded to ``ppf``.

        Returns:
            list: ``bins - 1`` inner quantiles, framed by ``-inf`` and ``+inf``
            so that every real value falls into some bin.
        """
        inner = [ppf(i / bins, loc, scale) for i in range(1, bins)]
        return [-np.inf, *inner, np.inf]

    @staticmethod
    def _interval_counts(values, lower_limits, upper_limits):
        """
        Count, for each interval ``(lower, upper]``, the values it contains.

        Args:
            values: Array-like of numeric values.
            lower_limits: Iterable of exclusive lower bounds.
            upper_limits: Iterable of inclusive upper bounds.

        Returns:
            np.ndarray: Float array with one count per interval.
        """
        values = np.asarray(values)
        counts = [
            np.count_nonzero((values > low) & (values <= high))
            for low, high in zip(lower_limits, upper_limits)
        ]
        # Frequencies are reported as floats.
        return np.array(counts, dtype=float)

    def variable_counting(self, variable: pd.Series) -> pd.DataFrame:
        """
        Count the occurrences of each unique value.

        Intended for integer values and modalities; it is not meaningful
        for continuous float values.

        Args:
            variable (pd.Series): Values to count.

        Returns:
            pd.DataFrame: A single column named "count", indexed by the
            unique values in ascending order.
        """
        return variable.value_counts().sort_index().to_frame(name="count")

    def countplot(self, variable: pd.Series) -> None:
        """
        Draw a bar plot with the number of observations of every category.

        Note:
            This function is flagged in the original code as a candidate for
            removal from the final package.

        Args:
            variable (pd.Series): Categorical values to plot.

        Returns:
            None. The plot is drawn on a new 10x5 matplotlib figure.
        """
        plt.figure(figsize=(10, 5))
        sns.countplot(x=variable)

    def discretisation(
        self,
        variable: pd.Series,
        distribution: str,
        bins: int
    ) -> List[Union[float, int]]:
        """
        Discretise a variable into bins using parameters estimated from it.

        For 'normal' the mean and standard deviation of ``variable`` are
        used; for 'uniform' its minimum and maximum.

        Args:
            variable (pd.Series): Data from which the parameters are estimated.
            distribution (str): 'normal' or 'uniform'.
            bins (int): Number of bins.

        Returns:
            list: Bin edges, starting with ``-inf`` and ending with ``+inf``.
            For any other ``distribution`` only ``[-inf, inf]`` is returned.
        """
        if distribution == 'normal':
            return self._quantile_edges(
                stats.norm.ppf, bins, np.mean(variable), np.std(variable)
            )

        if distribution == 'uniform':
            min_value = variable.min()
            max_value = variable.max()
            # scipy's uniform spans [loc, loc + scale]: scale is the width of
            # the support, not its upper bound.
            return self._quantile_edges(
                stats.uniform.ppf, bins, min_value, max_value - min_value
            )

        return [-np.inf, np.inf]

    def normal_discretization(
        self,
        bins: int,
        mean: float,
        std: float
    ) -> List[float]:
        """
        Discretise a normal distribution into equiprobable bins.

        Args:
            bins (int): Number of bins.
            mean (float): Mean of the gaussian distribution.
            std (float): Standard deviation of the gaussian distribution.

        Returns:
            list: Bin edges, starting with ``-inf`` and ending with ``+inf``.
        """
        return self._quantile_edges(stats.norm.ppf, bins, mean, std)

    def uniform_discretization(
        self,
        bins: int,
        min_value: float,
        max_value: float,
    ) -> List[float]:
        """
        Discretise a uniform distribution on [min_value, max_value] into bins.

        The inner edges are the ``i / bins`` quantiles obtained with
        ``scipy.stats.uniform.ppf``.

        Args:
            bins (int): Number of bins.
            min_value (float): Minimum value for the uniform distribution.
            max_value (float): Maximum value for the uniform distribution.

        Returns:
            list: Bin edges, with the first element representing negative
            infinity and the last element representing positive infinity.
        """
        # scipy parameterises the uniform law as [loc, loc + scale], so the
        # scale is the width of the interval rather than its maximum.
        return self._quantile_edges(
            stats.uniform.ppf, bins, min_value, max_value - min_value
        )

    def data_processing_for_chisqure_test(
        self,
        data: Union[pd.Series, pd.DataFrame]
    ) -> Union[pd.Series, pd.DataFrame]:
        """
        Preprocess the input data for chi-square tests.

        Data of object dtype ('O') is assumed to be categorical and is
        converted into value counts, because chi-square tests work on
        frequency distributions. Other data is returned unchanged.

        Args:
            data (pd.Series or pd.DataFrame): Input data. A DataFrame is
                treated as categorical only when it has at least one column
                and all of its columns are of object dtype.

        Returns:
            pd.Series or pd.DataFrame: The value counts for categorical
            input, otherwise the input itself.
        """
        if isinstance(data, pd.DataFrame):
            # ``data.dtypes`` is a Series here; reduce it explicitly instead
            # of using it in a boolean context, which pandas rejects.
            is_categorical = data.shape[1] > 0 and bool((data.dtypes == 'O').all())
        else:
            is_categorical = data.dtypes == 'O'

        if is_categorical:
            data = data.value_counts()
        return data

    def discretisation_intervals(
        self,
        params: "DiscretisationParams"
    ) -> Optional[pd.DataFrame]:
        """
        Compute observed and expected frequencies per interval.

        The intervals are the consecutive pairs of
        ``params.distribution_empirical``. A value belongs to an interval
        when ``lower_limit < value <= upper_limit``. Expected values are a
        random sample of the same size as the data, drawn by ``expected``.

        Args:
            params (DiscretisationParams): Parameters for discretization.
                For the 'uniform' theory, ``params.mean`` is used as the
                minimum and ``params.mean + params.std`` as the maximum.

        Returns:
            Optional[pd.DataFrame]: Columns 'lower_limit', 'upper_limit',
            'obs_freq' and 'exp_freq'. Returns None if an unsupported
            distribution theory is provided.

        Note:
            If the data contains missing values, they are dropped, the number
            of dropped items is logged, and ``params.data`` is replaced by the
            cleaned data (the caller's object is modified).

        Example:
            interval_data = discretisation_intervals(
                DiscretisationParams(
                    data, {
                        'theory': 'normal',
                        'empirical': distribution_empirical,
                        'mean': mean, 'std': std
                    }
                )
            )
            if interval_data is not None:
                logger.info(interval_data)
        """
        processed_data = self.delete_na(params.data)
        deleted_data = len(params.data) - len(processed_data)
        if deleted_data:
            logger.info("the data is not complete, there are %s missed items", deleted_data)
            params.data = processed_data

        theory = params.distribution_theory
        if theory == "normal":
            exp = self.expected(theory, params.data, params.mean, params.std)
        elif theory == "uniform":
            min_value = params.mean
            max_value = params.mean + params.std
            exp = self.expected(theory, params.data, min_value, max_value)
        else:
            return None

        edges = params.distribution_empirical
        intervals = pd.DataFrame(
            {'lower_limit': edges[:-1], 'upper_limit': edges[1:]}
        )
        lower_limits = intervals['lower_limit'].to_numpy()
        upper_limits = intervals['upper_limit'].to_numpy()

        intervals['obs_freq'] = self._interval_counts(
            params.data, lower_limits, upper_limits
        )
        intervals['exp_freq'] = self._interval_counts(
            exp, lower_limits, upper_limits
        )
        return intervals

    def delete_na(
        self,
        data: Union[pd.Series, pd.DataFrame]
    ) -> Union[pd.Series, pd.DataFrame]:
        """
        Remove missing values (NaN) from the input data.

        Args:
            data (pd.Series or pd.DataFrame): The input data, possibly
                containing missing values.

        Returns:
            pd.Series or pd.DataFrame: The result of ``data.dropna()``; a
            Series input yields a Series, a DataFrame input a DataFrame.
        """
        return data.dropna()

    def expected(
        self,
        distribution: str,
        data: List[float],
        *argv: float
    ) -> np.ndarray:
        """
        Draw a random sample representing the expected values.

        Args:
            distribution (str): 'normal' or 'uniform'. Any value other than
                'normal' is handled as 'uniform'.
            data (List[float]): Input data; only its length is used.
            *argv: Parameters of the distribution: (mean, std) for 'normal',
                (min_value, max_value) for 'uniform'.

        Returns:
            np.ndarray: ``len(data)`` values drawn with ``np.random.normal``
            or ``np.random.uniform``. The sample depends on numpy's global
            random state.
        """
        size = len(data)
        first, second = argv[0], argv[1]
        if distribution == 'normal':
            return np.random.normal(first, second, size)

        # Assuming distribution is 'uniform'
        return np.random.uniform(first, second, size)

    def observed_hist(self, variable: pd.Series) -> None:
        """
        Plot the observed values of the distribution.

        An 'int64' variable is drawn as a histogram with one bin per unique
        value, an object variable as a bar chart of its value counts, and a
        'bool' variable as two bars ('True', 'False'). Other dtypes produce
        no plot.

        Args:
            variable (pd.Series): Input variable.

        Returns:
            None (plots histogram)
        """
        dtype = variable.dtypes
        if dtype == 'int64':
            plt.figure(figsize=(10, 5))
            plt.hist(variable, bins=len(variable.unique()))
        elif dtype == 'O':
            counts = variable.value_counts()
            plt.figure(figsize=(10, 5))
            plt.bar(counts.index, counts)
        elif dtype == 'bool':
            counts = variable.value_counts()
            plt.figure(figsize=(10, 5))
            # A value that never occurs is absent from the counts; plot it
            # as a zero-height bar instead of failing.
            plt.bar(['True', 'False'], [counts.get(True, 0), counts.get(False, 0)])
        else:
            return
        plt.xlabel(variable.name)