Coverage for dqm/diversity/metric.py: 94%

18 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-05 14:00 +0000

1""" 

2Diversity Index Calculator 

3 

4This module defines the DiversityIndexCalculator class, which 

5offers methods to calculate various diversity indices for categorical data. 

6These indices are useful in statistical analysis and data science to 

7understand the distribution and diversity of categorical data. 

8 

9Authors: 

10 Faouzi ADJED 

11 Anani DJATO 

12 

13Dependencies: 

14 pandas 

15 

16Classes: 

17 DiversityIndexCalculator: Provides methods for calculating diversity indices in a dataset. 

18 

19Functions: None 

20 

21Usage: 

22 from metric import DiversityIndexCalculator 

23 calculator = DiversityIndexCalculator() 

24 dataset = pandas.Series([...]) # Replace with your data 

25 simpson_index = calculator.simpson(dataset) 

26 gini_index = calculator.gini(dataset) 

27 

28These methods are useful for ecological, sociological, and 

29various other types of categorical data analysis. 

30""" 

31 

32import pandas as pd 

33 

34 

class DiversityIndexCalculator:
    """
    Compute diversity indices for a categorical data series.

    Missing values are ignored consistently: category counts come from
    ``Series.value_counts()``, which drops NaN by default, and every index
    is normalised by the number of *counted* observations rather than by
    the raw length of the series.

    Methods:
        num: Counts the number of each category in a dataset.
        simpson: Calculates the Simpson diversity index.
        prob: Calculates the frequencies of each category in a dataset.
        gini: Calculates the Gini-Simpson index.
        RD: Placeholder; prints its argument and returns None.
    """

    def num(self, variable: pd.Series) -> pd.Series:
        """
        Calculate the number of each category of a variable.

        Args:
            variable (Series): The data series for which to count categories.

        Returns:
            n (Series): The count of each category (NaN entries are not counted).
        """
        return variable.value_counts()

    def simpson(self, variable: pd.Series) -> float:
        """
        Calculate Simpson's index, which is a measure of diversity.

        The value is ``1 - sum(n_i * (n_i - 1)) / (N * (N - 1))`` where
        ``n_i`` is the count of category ``i`` and ``N`` is the total number
        of counted (non-missing) observations.

        Args:
            variable (Series): The data series for which to calculate the Simpson index.

        Returns:
            s (float): The Simpson diversity index, or NaN when fewer than
            two observations are available (the index is undefined there).
        """
        counts = self.num(variable)
        # Use the sum of the counts, not len(variable): value_counts() drops
        # NaN, so len() would overstate the number of observations.
        total = counts.sum()
        if total < 2:
            # N * (N - 1) would be zero; report "undefined" instead of
            # raising ZeroDivisionError or emitting a numpy warning.
            return float("nan")
        same_category_pairs = (counts * (counts - 1)).sum()
        return float(1 - same_category_pairs / (total * (total - 1)))

    def prob(self, variable: pd.Series) -> pd.Series:
        """
        Calculate the frequencies of each category in a variable.

        Args:
            variable (Series): The data series for which to calculate frequencies.

        Returns:
            p (Series): The frequency of each category. Frequencies are
            relative to the counted (non-missing) observations, so they sum
            to 1 whenever at least one observation is present; the result is
            empty when there are none.
        """
        counts = self.num(variable)
        # Dividing by counts.sum() keeps numerator and denominator consistent
        # when the series contains NaN.
        return counts / counts.sum()

    def gini(self, variable: pd.Series) -> float:
        """
        Compute the Gini-Simpson index, a metric for assessing diversity that
        takes into consideration both the quantity of distinct categories
        and the uniformity of their distribution.

        The value is ``1 - sum(p_i ** 2)`` over the category frequencies.

        Args:
            variable (Series): The data series for which to calculate the Gini-Simpson index.

        Returns:
            g (float): The Gini-Simpson index, or NaN when the series has no
            counted observations.
        """
        freqs = self.prob(variable)
        if freqs.empty:
            # Without observations there is no distribution to measure;
            # avoid reporting a misleading maximal diversity of 1.
            return float("nan")
        return float(1 - (freqs ** 2).sum())

    def RD(self, variable: pd.Series) -> None:
        """
        Placeholder method: print the given series.

        NOTE(review): no metric is computed here; the method only prints its
        argument and implicitly returns None. The intended index is not
        visible in this file - confirm before relying on it.

        Args:
            variable (Series): The data series to print.
        """
        print(variable)