Coverage for dqm/diversity/metric.py: 94%
18 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-05 14:00 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-05 14:00 +0000
"""
Diversity Index Calculator

This module defines the DiversityIndexCalculator class, which
offers methods to calculate various diversity indices for categorical data.
These indices are useful in statistical analysis and data science to
understand the distribution and diversity of categorical data.

Authors:
    Faouzi ADJED
    Anani DJATO

Dependencies:
    pandas

Classes:
    DiversityIndexCalculator: Provides methods for calculating diversity indices in a dataset.

Functions: None

Usage:
    from metric import DiversityIndexCalculator
    calculator = DiversityIndexCalculator()
    dataset = pandas.Series([...])  # Replace with your data
    simpson_index = calculator.simpson(dataset)
    gini_index = calculator.gini(dataset)

These methods are useful for ecological, sociological, and
various other types of categorical data analysis.
"""
32import pandas as pd
class DiversityIndexCalculator:
    """
    This class provides methods to calculate various diversity
    indices for a given dataset.

    Missing values are ignored consistently: category counts come from
    ``Series.value_counts()``, which drops NaN/None by default, and every
    total used as a denominator is derived from those same counts.

    Methods:
        num: Counts the number of each category in a dataset.
        simpson: Calculates the Simpson diversity index.
        prob: Calculates the frequencies of each category in a dataset.
        gini: Calculates the Gini-Simpson index.
        RD: Placeholder that currently only prints its argument.
    """

    def num(self, variable: pd.Series) -> pd.Series:
        """
        Calculate the number of each category of a variable.

        Args:
            variable (Series): The data series for which to count categories.

        Returns:
            n (Series): The count of each category (missing values excluded).
        """
        return variable.value_counts()

    def simpson(self, variable: pd.Series) -> float:
        """
        Calculate Simpson's index, which is a measure of diversity.

        Computed as ``1 - sum(n_i * (n_i - 1)) / (N * (N - 1))`` where
        ``n_i`` is the count of category ``i`` and ``N`` is the number of
        non-missing observations.

        Args:
            variable (Series): The data series for which to calculate the Simpson index.

        Returns:
            s (float): The Simpson diversity index, or NaN when fewer than
            two non-missing observations are available (the formula would
            divide by zero).
        """
        counts = self.num(variable)
        # Take N from the counts so numerator and denominator describe the
        # same observations even when the series contains missing values.
        total = int(counts.sum())
        if total < 2:
            # N * (N - 1) == 0: the index is undefined for such inputs.
            return float("nan")
        same_category_pairs = int((counts * (counts - 1)).sum())
        return 1 - same_category_pairs / (total * (total - 1))

    def prob(self, variable: pd.Series) -> pd.Series:
        """
        Calculate the frequencies of each category in a variable.

        Args:
            variable (Series): The data series for which to calculate frequencies.

        Returns:
            p (Series): The frequency of each category, relative to the
            number of non-missing observations. Empty when there are no
            non-missing observations.
        """
        counts = self.num(variable)
        # Dividing by the sum of the counts (rather than len(variable))
        # keeps the frequencies summing to 1 when missing values exist.
        return counts / counts.sum()

    def gini(self, variable: pd.Series) -> float:
        """
        Compute the Gini-Simpson index, a metric for assessing diversity that
        takes into consideration both the quantity of distinct categories
        and the uniformity of their distribution.

        Computed as ``1 - sum(p_i ** 2)`` over the category frequencies
        returned by ``prob``.

        Args:
            variable (Series): The data series for which to calculate the Gini-Simpson index.

        Returns:
            g (float): The Gini-Simpson index. With no non-missing
            observations the sum of squares is 0, so the result is 1.0.
        """
        frequencies = self.prob(variable)
        return float(1 - (frequencies ** 2).sum())

    def RD(self, variable: pd.Series) -> float:
        """
        Print the given series to standard output.

        NOTE(review): this looks like an unfinished placeholder. It is
        annotated as returning a float but has no return statement, so it
        returns None. The intended metric is not visible here; confirm
        before relying on it.

        Args:
            variable (Series): The data series to print.
        """
        print(variable)