"""
This module implements two classes, DiscretisationParams and VariableAnalysis,
providing functionality for variable counting, countplot visualization,
and discretization of variables using normal or uniform distributions.
It also includes functions for processing data for chi-square tests, calculating
expected values, and generating histograms for observed and expected values.
Authors:
Faouzi ADJED
Anani DJATO
Dependencies:
numpy
pandas
matplotlib.pyplot
scipy.stats
dqm.representativeness.twe_logger
seaborn
Functions : None
Classes:
DiscretisationParams: Class for defining discretization parameters
VariableAnalysis: Class for analyzing data distribution
Example:
from utils import VariableAnalysis, DiscretisationParams
# Example of using VariableAnalysis class
variable_analyzer = VariableAnalysis()
# Example of using the variable_counting method
my_variable = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
counts = variable_analyzer.variable_counting(my_variable)
print("Counts of unique values:")
print(counts)
# Example of using the countplot method
variable_analyzer.countplot(my_variable)
plt.show()
# Instantiate the DiscretisationParams class
discretisation_params = DiscretisationParams(
data=my_variable,
distribution_theory='normal',
distribution_empirical=[-1.0, 0.0, 1.0, 2.0],
mean=0.0,
std=1.0
)
"""
from typing import Optional, List, Union
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
from dqm.representativeness.twe_logger import get_logger
import seaborn as sns
logger = get_logger()
[docs]
class DiscretisationParams:
"""
Parameters for discretization.
Args:
data: Input data.
distribution_params: Dictionary containing distribution parameters.
'theory': Distribution theory ('normal' or 'uniform').
'empirical': Empirical distribution used for discretization.
'mean': Mean parameter for the distribution theory.
'std': Standard deviation for the distribution theory.
Methods:
__init__:
Initializes an instance of the DiscretisationParams class.
Args:
data: Input data.
distribution_params: Dictionary containing distribution parameters.
Returns:
None
to_dict:
Converts the parameters to a dictionary.
Returns:
dict: A dictionary representation of the parameters.
Note:
This method is not necessary. It was created solely to have at
least 2 methods as recommended in a class.
get_data:
Gets the input data.
Returns:
Any: The input data.
"""
[docs]
def __init__(self, data, distribution_params):
"""
Initializes an instance of the DiscretisationParams class.
Args:
data (pd.Series): Input data.
distribution_params (dict): Dictionary containing distribution parameters.
'theory': Distribution theory ('normal' or 'uniform').
'empirical': Empirical distribution used for discretization.
'mean': Mean parameter for the distribution theory.
'std': Standard deviation for the distribution theory.
Returns:
None
"""
self.data = data
self.distribution_theory = distribution_params['theory']
self.distribution_empirical = distribution_params['empirical']
self.mean = distribution_params['mean']
self.std = distribution_params['std']
[docs]
def to_dict(self):
"""
Convert the parameters to a dictionary.
Returns:
dict: A dictionary representation of the parameters.
Note:
This method is not necessary. It was created solely to have at
least 2 methods as recommended in a class.
"""
return {
'data': self.data,
'distribution_theory': self.distribution_theory,
'distribution_empirical': self.distribution_empirical,
'mean': self.mean,
'std': self.std
}
[docs]
def get_data(self):
"""
Get the input data.
Returns:
Any: The input data.
"""
return self.data
[docs]
class VariableAnalysis:
"""
This class provides functions for variable counting, countplot visualization,
and discretization of variables using normal or uniform distributions.
It includes functions for processing data for chi-square tests,
calculating expected values, and generating histograms for observed and expected values.
Args: None
Methods:
variable_counting
countplot
discretisation
normal_discretization
data_processing_for_chisqure_test
uniform_discretization
discretisation_intervals
delete_na
expected
expected_hist
observed_hist
"""
[docs]
def variable_counting(self, variable: pd.Series) -> pd.DataFrame:
"""
Counting unique values (only int values and modalities.
It cannot be used for float values)
Args:
variable (panda.Series)
Returns:
variable_count (DataFrame): counts of unique values
"""
variable_count = variable.value_counts().to_frame()
variable_count.columns = ["count"]
variable_count.sort_index(inplace=True)
return variable_count
[docs]
def countplot(self, variable: pd.Series) -> Optional[None]:
"""
This function will not be used and will be deleted in the final package (to decide)
Show the counts of observations of every category
Args:
variable (DataFrame)
Returns:
countplot (show the bar plot of counts of variable)
"""
plt.figure(figsize=(10, 5))
sns.countplot(x=variable)
[docs]
def discretisation(
self,
variable: pd.Series,
distribution: str,
bins: int
) -> List[Union[float, int]]:
""" Discretisation of variable into bins
Args:
distribution (string): 'normal' ou 'uniform'
variable (Series)
bins (int)
Returns:
interval (array): discretised variable into bins
"""
interval = []
if distribution == 'normal':
mean = np.mean(variable)
std = np.std(variable)
for i in range(1, bins):
val = stats.norm.ppf(i / bins, mean, std)
interval.append(val)
elif distribution == 'uniform':
min_value = variable.min()
max_value = variable.max()
for i in range(1, bins):
val = stats.uniform.ppf(i / bins, min_value, max_value)
interval.append(val)
interval.insert(0, -np.inf)
interval.append(np.inf)
return interval
[docs]
def normal_discretization(
self,
bins: int,
mean: float,
std: float
) -> List[float]:
"""
normal Discretisation of variable into bins
Args:
bins (int): int
mean (float): the first parameter of the gaussian distribution
std (float): standard
Returns
interval (array): discretised variable into bins
"""
interval = []
for i in range(1, bins):
val = stats.norm.ppf(i / bins, mean, std)
interval.append(val)
interval.insert(0, -np.inf)
interval.append(np.inf)
return interval
[docs]
def data_processing_for_chisqure_test(
self,
data: pd.DataFrame
) -> pd.DataFrame:
"""
This function is designed to preprocess the input data for chi-square tests.
If the data type is object ('O'), it is assumed to be categorical,
and the function converts it into value counts.
This step is crucial for chi-square tests, which require frequency distributions.
Args:
data (pd.DataFrame): Input data.
Returns:
data (pd.DataFrame):
Processed data suitable for chi-square tests.
"""
if data.dtypes == 'O':
data = data.value_counts()
return data
[docs]
def discretisation_intervals(
self,
params: DiscretisationParams
) -> Optional[pd.DataFrame]:
"""
This function discretizes a given set of data into intervals based on
empirical distribution and calculates observed and expected frequencies
for each interval. It supports both normal and uniform distribution theories.
Args:
params (DiscretisationParams): Parameters for discretization.
Returns:
intervals (Optional[DataFrame]): Intervals and counts of each interval.
Returns None if an unsupported distribution theory is provided.
Note:
The function may issue a warning if there are missing values in the data.
Example:
interval_data = discretisation_intervals(
DiscretisationParams(
data, {
'theory': 'normal',
'empirical': distribution_empirical,
'mean': mean, 'std': std
}
)
)
if interval_data is not None:
logger.info(interval_data)
"""
alpha = 1.0
processed_data = self.delete_na(params.data)
if len(processed_data) != len(params.data):
deleted_data = len(params.data) - len(processed_data)
logger.info("the data is not complete, there are %s missed items", deleted_data)
params.data = processed_data
if params.distribution_theory == "normal":
exp = self.expected(params.distribution_theory, params.data, params.mean, params.std)
elif params.distribution_theory == "uniform":
min_value = params.mean
max_value = params.mean + params.std
exp = self.expected(params.distribution_theory, params.data, min_value, max_value)
if params.distribution_theory in ('normal', 'uniform'):
intervals = pd.DataFrame(
{'lower_limit': params.distribution_empirical[:-1],
'upper_limit': params.distribution_empirical[1:]}
)
observed_values = sorted(params.data)
expected_values = sorted(exp)
intervals['obs_freq'] = intervals.apply(
lambda x: sum(x['lower_limit'] < i <= x['upper_limit'] for i\
in observed_values), axis=1) / alpha
intervals['exp_freq'] = intervals.apply(
lambda x: sum(x['lower_limit'] < i <= x['upper_limit'] for i\
in expected_values), axis=1) / alpha
return intervals
# Add a return value (can be None if needed)
return None
[docs]
def delete_na(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Remove missing values (NaN) from the input data.
Args:
data (pd.DataFrame): The input data containing missing values.
Returns:
data (pd.DataFrame):
The input data with missing values removed. If the input is a Series,
the output will also be a Series. If the input is a DataFrame,
the output will be a DataFrame.
"""
data = data.dropna()
return data
[docs]
def expected(
self,
distribution: str,
data: List[float],
*argv: float
) -> List[float]:
"""
Calculate the expected values of the distribution
Args:
distribution (str): 'normal' or 'uniform'
data (List[float]): Input data.
*argv : Parameters of the distribution.
Returns:
n or u (List[float]): Expected values for every distribution.
"""
if distribution == 'normal':
mean = argv[0]
std = argv[1]
n = np.random.normal(mean, std, len(data))
return n
# Assuming distribution is 'uniform'
min_value = argv[0]
max_value = argv[1]
u = np.random.uniform(min_value, max_value, len(data))
return u
[docs]
def observed_hist(self, variable: pd.Series) -> None:
"""
Plot the observed values of the distribution
Args:
variable (pd.Series): Input variable.
Returns:
None (plots histogram)
"""
if variable.dtypes == 'int64':
plt.figure(figsize=(10, 5))
plt.hist(variable, bins=len(variable.unique()))
plt.xlabel(variable.name)
elif variable.dtypes == 'O':
plt.figure(figsize=(10, 5))
plt.bar(variable.value_counts().index, variable.value_counts())
plt.xlabel(variable.name)
elif variable.dtypes == 'bool':
plt.figure(figsize=(10, 5))
true = variable.value_counts()[True]
false = variable.value_counts()[False]
plt.bar(['True', 'False'], [true, false])
plt.xlabel(variable.name)