Coverage for dqm/diversity/metric.py: 94%

"""
Diversity Index Calculator

This module defines the DiversityIndexCalculator class, which
offers methods to calculate various diversity indices for categorical data.
These indices are useful in statistical analysis and data science to
understand the distribution and diversity of categorical data.

Authors:
    Faouzi ADJED
    Anani DJATO

Dependencies:
    pandas

Classes:
    DiversityIndexCalculator: Provides methods for calculating diversity indices in a dataset.

Functions: None

Usage:
    from metric import DiversityIndexCalculator
    calculator = DiversityIndexCalculator()
    dataset = pandas.Series([...])  # Replace with your data
    simpson_index = calculator.simpson(dataset)
    gini_index = calculator.gini(dataset)

These methods are useful for ecological, sociological, and
various other types of categorical data analysis.
"""

32import pandas as pd

class DiversityIndexCalculator:
    """
    Compute diversity indices for a categorical data series.

    Every method takes a pandas Series of category labels. Missing values
    (NaN/None) are ignored consistently: ``value_counts`` drops them, and the
    totals used as denominators are derived from those same counts rather
    than from ``len(variable)``, so frequencies always sum to 1.

    Methods:
        num: Counts the number of each category in a dataset.
        simpson: Calculates the Simpson diversity index.
        prob: Calculates the frequencies of each category in a dataset.
        gini: Calculates the Gini-Simpson index.
        RD: Placeholder that only prints its input.
    """

    def num(self, variable: pd.Series) -> pd.Series:
        """
        Calculate the number of each category of a variable.

        Args:
            variable (Series): The data series for which to count categories.

        Returns:
            n (Series): The count of each category, indexed by category
                label. Missing values are not counted.
        """
        return variable.value_counts()

    def simpson(self, variable: pd.Series) -> float:
        """
        Calculate Simpson's index, which is a measure of diversity.

        The value is ``1 - sum(n_i * (n_i - 1)) / (N * (N - 1))`` where
        ``n_i`` is the count of category ``i`` and ``N`` is the number of
        non-missing observations.

        Args:
            variable (Series): The data series for which to calculate the Simpson index.

        Returns:
            s (float): The Simpson diversity index, or NaN when fewer than
                two non-missing observations are available (the index is
                undefined because the denominator would be zero).
        """
        counts = self.num(variable)
        # Use the counted observations as N so that missing values, which
        # value_counts() drops, do not inflate the denominator.
        total = int(counts.sum())
        if total < 2:
            return float("nan")
        same_category_pairs = (counts * (counts - 1)).sum()
        return float(1 - same_category_pairs / (total * (total - 1)))

    def prob(self, variable: pd.Series) -> pd.Series:
        """
        Calculate the frequencies of each category in a variable.

        Args:
            variable (Series): The data series for which to calculate frequencies.

        Returns:
            p (Series): The frequency of each category among the non-missing
                observations; the values sum to 1 whenever at least one
                observation is present.
        """
        counts = self.num(variable)
        # Divide by the total of the counts (not len(variable)) so that
        # dropped missing values do not shrink every frequency.
        return counts / counts.sum()

    def gini(self, variable: pd.Series) -> float:
        """
        Compute the Gini-Simpson index, a metric for assessing diversity that
        takes into consideration both the quantity of distinct categories
        and the uniformity of their distribution.

        The value is ``1 - sum(p_i ** 2)`` where ``p_i`` is the frequency of
        category ``i``.

        Args:
            variable (Series): The data series for which to calculate the Gini-Simpson index.

        Returns:
            g (float): The Gini-Simpson index, or NaN when the series holds
                no non-missing observations.
        """
        # Without observations there are no frequencies to square; report
        # "undefined" instead of a misleading maximum diversity of 1.
        if variable.count() == 0:
            return float("nan")
        frequencies = self.prob(variable)
        return float(1 - (frequencies ** 2).sum())

    def RD(self, variable: pd.Series) -> float:
        """
        Print the given series to standard output.

        NOTE(review): this looks like an unfinished placeholder. Despite the
        ``float`` return annotation, the body only prints ``variable`` and
        implicitly returns None. Confirm the intended index before use.

        Args:
            variable (Series): The data series to print.
        """
        print(variable)