# Source code for dqm.representativeness.metric

"""
This script provides functions for analyzing data distribution using chi-square tests,
goodness-of-fit tests, Kolmogorov-Smirnov tests, Shannon entropy, and confidence intervals.

Authors:
    Faouzi ADJED
    Anani DJATO

Dependencies:
    numpy
    pandas
    scipy
    dqm.representativeness.utils
    dqm.representativeness.twe_logger

Classes:
    DistributionAnalyzer: Class for analyzing data distribution

Functions: None

Usage: Import this script and use the provided functions for distribution analysis.

"""
from typing import Optional, Tuple, Any
import pandas as pd
import numpy as np
from scipy import stats
from dqm.representativeness.utils import VariableAnalysis, DiscretisationParams
from dqm.representativeness.twe_logger import get_logger

logger = get_logger()
variable_analyzer = VariableAnalysis()


class DistributionAnalyzer:
    """
    Analyze how well a data sample follows a theoretical distribution.

    Supported theoretical laws are 'normal' and 'uniform'.

    Args:
        data (pd.Series): The data to be analyzed.
        bins (int): The number of bins for analysis.
        distribution (str): The distribution type ('normal' or 'uniform').

    Methods:
        chisquare_test: chi-square goodness-of-fit test; returns the
            p-value and the observed/expected interval frequencies.
        kolmogorov: Kolmogorov-Smirnov test for the chosen distribution;
            returns the KS test p-value.
        shannon_entropy: Shannon entropy of the expected bin frequencies.
        grte: Granular Relative and Theoretical Entropy (GRTE); returns
            the GRTE value and the discretized intervals.
    """

    def __init__(self, data: pd.Series, bins: int, distribution: str):
        """
        Initialize DistributionAnalyzer with the provided data and parameters.

        Args:
            data (pd.Series): The data to be analyzed.
            bins (int): The number of bins for analysis.
            distribution (str): The distribution type ('normal' or 'uniform').
        """
        self.data = data
        self.bins = bins
        self.distribution = distribution
        self.logger = get_logger()
        self.variable_analyzer = VariableAnalysis()

    def _estimated_params(self, par_dist: Tuple[float, ...]) -> Tuple[float, float]:
        """Return the two distribution parameters, estimating from data when absent.

        For 'normal' the pair is (mean, std); for 'uniform' it is (min, max).
        """
        if len(par_dist) == 2:
            return par_dist[0], par_dist[1]
        if self.distribution == 'normal':
            return np.mean(self.data), np.std(self.data)
        return np.min(self.data), np.max(self.data)

    def _discretized_intervals(self, loc: float, scale: float) -> pd.DataFrame:
        """Discretize the theoretical law and bin the data against it.

        (loc, scale) are (mean, std) for 'normal' and (min, max) for
        'uniform'. NOTE: the 'mean'/'std' keys of the parameter dict are
        reused to carry (min, max) in the uniform case, as the downstream
        helpers expect.
        """
        if self.distribution == 'normal':
            empirical = self.variable_analyzer.normal_discretization(
                self.bins, loc, scale
            )
        else:  # 'uniform' — callers guarantee one of the two values
            empirical = self.variable_analyzer.uniform_discretization(
                self.bins, loc, scale
            )
        distribution_params = {
            'theory': self.distribution,
            'empirical': empirical,
            'mean': loc,
            'std': scale,
        }
        discretisation_params = DiscretisationParams(self.data, distribution_params)
        return self.variable_analyzer.discretisation_intervals(discretisation_params)

    def chisquare_test(self, *par_dist: float) -> Tuple[float, pd.Series]:
        """
        Perform a chi-square test for goodness of fit.

        Args:
            *par_dist (float): Optional distribution parameters — (mean, std)
                for 'normal', (min, max) for 'uniform'. Estimated from the
                data when omitted.

        Returns:
            p-value (float): chi-square test p-value (NaN when the data or
                distribution cannot be processed).
            intervals_frequencies: observed and expected frequencies per
                interval (empty Series when not computed).
        """
        if self.data.dtypes in ('O', 'bool'):
            self.logger.error("Categorical or boolean data are not processed yet.")
            return float('nan'), pd.Series(dtype='float')

        if self.distribution not in ('normal', 'uniform'):
            # Bug fix: the original fell through with `chi` unbound and
            # raised NameError for unsupported distributions.
            self.logger.error("Unsupported distribution %s ", self.distribution)
            return float('nan'), pd.Series(dtype='float')

        loc, scale = self._estimated_params(par_dist)
        intervals_frequencies = self._discretized_intervals(loc, scale)

        # Expected counts of zero make the chi-square statistic invalid.
        if (intervals_frequencies['exp_freq'] == 0).sum() != 0:
            self.logger.error(
                "Number of intervals is too large to get acceptable expected values"
            )
        chi = stats.chisquare(
            intervals_frequencies['obs_freq'],
            intervals_frequencies['exp_freq']
        )

        if chi.pvalue < 0.05:
            self.logger.info(
                "pvalue = %s < 0.05: Data is not following the %s distribution",
                chi.pvalue, self.distribution
            )
        else:
            self.logger.info(
                "pvalue = %s >= 0.05: Data is following the %s distribution",
                chi.pvalue, self.distribution
            )
        return chi.pvalue, intervals_frequencies

    def kolmogorov(self, *par_dist: float) -> float:
        """
        Calculation of the Kolmogorov-Smirnov test for the chosen distribution.

        Args:
            *par_dist (float): (mean, std) for 'normal', (min, max) for
                'uniform'. Both values are required.

        Returns:
            p-value (float): KS test p-value, NaN when the data or the
                parameters cannot be processed.
        """
        if any(isinstance(value, (str, bool)) for value in self.data):
            self.logger.error("Categorical or boolean variables are not treated yet.")
            return float('nan')

        if self.distribution == 'normal':
            if len(par_dist) != 2:
                self.logger.error("Error: Provide mean and std for normal distribution.")
                return float('nan')
            mean, std = par_dist
            ks = stats.kstest(self.data, stats.norm.cdf, args=(mean, std))
        elif self.distribution == 'uniform':
            if len(par_dist) != 2:
                self.logger.error("Error: Provide min and max for uniform distribution.")
                return float('nan')
            min_value, max_value = par_dist
            # Bug fix: scipy's uniform cdf takes (loc, scale) = (min, max - min);
            # the original passed (min, min + max) and tested the wrong law.
            ks = stats.kstest(
                self.data, stats.uniform.cdf, args=(min_value, max_value - min_value)
            )
        else:
            self.logger.error("Unsupported distribution %s ", self.distribution)
            return float('nan')

        self.logger.info(ks)
        if ks.pvalue < 0.05:
            self.logger.info(
                "p-value = %s < 0.05 : The data is not following "
                "the %s distribution", ks.pvalue, self.distribution
            )
        else:
            # Bug fix: the original logged "is not following" on this
            # accept branch (and both messages lacked a word separator).
            self.logger.info(
                "p-value = %s >= 0.05 : The data is following "
                "the %s distribution", ks.pvalue, self.distribution
            )
        return ks.pvalue

    def shannon_entropy(self) -> float:
        """
        Calculation of Shannon entropy of the expected bin frequencies.

        Returns:
            Shannon entropy (float): entropy of the normalized expected
                frequencies; NaN when the distribution is unsupported or
                the expected frequencies sum to zero.
        """
        if self.distribution not in ('uniform', 'normal'):
            # Bug fix: the original left `intervals` unbound and raised
            # NameError for unsupported distributions.
            self.logger.error("Unsupported distribution %s ", self.distribution)
            return float('nan')

        if self.distribution == 'uniform':
            loc, scale = np.min(self.data), np.max(self.data)
        else:
            loc, scale = np.mean(self.data), np.std(self.data)
        intervals = self._discretized_intervals(loc, scale)

        total = intervals['exp_freq'].sum()
        if total == 0:
            # Bug fix: the original logged the problem and then divided by
            # zero anyway; return NaN explicitly instead.
            self.logger.info("Leading division by zero")
            return float('nan')
        prob_exp = intervals['exp_freq'] / total
        return stats.entropy(prob_exp)

    def grte(self, *args: float) -> Tuple[float, Any]:
        """
        Calculates the Granular Relative and Theoretical Entropy (GRTE).

        GRTE = exp(-2 * |H(expected) - H(observed)|), where H is the
        Shannon entropy of the binned frequencies.

        Args:
            *args (float): Optional arguments. For 'uniform', provide start
                and end; for 'normal', provide mean and std. Estimated from
                the data when omitted.

        Returns:
            grte_res (float): The calculated GRTE value (None for an
                unsupported distribution).
            intervals_discretized (pd.Series): The intervals discretized
                data (None for an unsupported distribution).
        """
        if self.distribution == 'uniform':
            loc, scale = (args[0], args[1]) if len(args) == 2 \
                else (np.min(self.data), np.max(self.data))
            self.logger.info("debut %s", loc)
            self.logger.info("la fin %s", scale)
        elif self.distribution == 'normal':
            loc, scale = (args[0], args[1]) if len(args) == 2 \
                else (np.mean(self.data), np.std(self.data))
        else:
            self.logger.error('Expecting only uniform or normal distribution')
            return None, None

        intervals_discretized = self._discretized_intervals(loc, scale)

        # GRTE compares theoretical and observed entropies; identical
        # distributions give GRTE = 1, divergence drives it toward 0.
        grte_res = np.exp(
            -2 * abs(
                stats.entropy(intervals_discretized['exp_freq'])
                - stats.entropy(intervals_discretized['obs_freq'])
            )
        )
        return grte_res, intervals_discretized