Coverage for dqm/diversity/diversity.py: 25%
32 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-05 14:00 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-05 14:00 +0000
"""
This module provides the DiversityCalculator class, which calculates various types
of diversity in datasets. It focuses on both lexical and visual diversities, employing
statistical indices for different metrics such as richness, variety, color, and shape.
Useful in linguistics, image processing, and data analysis, it helps understand the
diversity of elements in a dataset.

Authors:
    Faouzi ADJED
    Anani DJATO

Dependencies:
    numpy
    collections.Counter

Classes:
    DiversityCalculator: A class that provides methods for calculating
    different types of diversity in datasets.

Functions: None

Usage:
    To use this module, create an instance of the DiversityCalculator class and call its
    compute_diversity method with appropriate arguments.
    Example:
        calculator = DiversityCalculator()
        diversity_score = calculator.compute_diversity(data, 'lexical', 'richness')
"""
30from collections import Counter
31from typing import Iterable
32import numpy as np
34from dqm.utils.twe_logger import get_logger
36logger = get_logger()
class DiversityCalculator:
    """
    Compute lexical and visual diversity indices over a dataset.

    Supported (diversity_type, need) combinations, where p denotes the
    relative frequency of each distinct value found in the data:

        ("lexical", "richness"): Shannon index, -sum(p * ln(p)).
        ("lexical", "variety"):  1 - sum(p ** 2).
        ("visual", "color"):     number of distinct values.
        ("visual", "shape"):     Gini-Simpson index, 1 - sum(p ** 2).

    Methods:
        compute_diversity: Calculates diversity based on specified type and need.
        validate_inputs: Logs an error for an unknown diversity type or need.
    """

    def compute_diversity(
        self, data: Iterable, diversity_type: str, need: str
    ) -> float:
        """
        Compute diversity of given data based on type and need.

        Args:
            data (Iterable): Dataset for diversity computation. One-shot
                iterables (generators, iterators) are accepted; they are
                consumed and copied into a list.
            diversity_type (str): Type of diversity ('lexical' or 'visual').
            need (str): Specific need for calculation: 'richness' or 'variety'
                with 'lexical'; 'color' or 'shape' with 'visual'.

        Returns:
            diversity (float): Calculated diversity value.

        Raises:
            ValueError: If the (diversity_type, need) pair is not one of the
                supported combinations. The error is also logged.
        """
        # Unsized iterables cannot be counted with len() and are treated by
        # numpy as a single opaque object, so materialise them once up front.
        # Sized inputs (lists, tuples, arrays, ...) are passed through untouched.
        if not hasattr(data, "__len__"):
            data = list(data)

        if diversity_type == "lexical" and need == "richness":
            # Compute lexical richness using Shannon Index
            _, counts = np.unique(data, return_counts=True)
            norm_counts = counts / counts.sum()
            diversity = -(norm_counts * np.log(norm_counts)).sum()
        elif diversity_type == "lexical" and need == "variety":
            # Compute lexical variety using Simpson Index
            _, counts = np.unique(data, return_counts=True)
            norm_counts = counts / counts.sum()
            diversity = 1 - (np.square(norm_counts)).sum()
        elif diversity_type == "visual" and need == "color":
            # Compute color diversity using Richness Index:
            # the number of distinct values present in the data.
            diversity = len(Counter(data))
        elif diversity_type == "visual" and need == "shape":
            # Compute shape diversity using Gini-Simpson Index
            counter = Counter(data)
            freqs = np.array(list(counter.values())) / len(data)
            diversity = 1 - np.sum(freqs**2)
        else:
            logger.error("Invalid diversity type or need.")
            # Fail explicitly: without this, the return below would hit an
            # unbound local variable and raise a confusing UnboundLocalError.
            raise ValueError(
                "Invalid diversity type or need: "
                f"diversity_type={diversity_type!r}, need={need!r}."
            )

        return float(diversity)  # To be homogenous on output data type

    def validate_inputs(self, diversity_type: str, need: str) -> None:
        """
        Log an error for each argument that is not a recognised value.

        The two arguments are checked independently against the known
        diversity types and needs. Nothing is raised and nothing is returned;
        problems are only reported through the module logger. The method does
        not check that the two values form a combination supported by
        compute_diversity (e.g. 'lexical' with 'color' passes here).

        Args:
            diversity_type (str): Type of diversity to be computed.
            need (str): Specific need for diversity calculation.
        """
        valid_diversity_types = ["lexical", "visual"]
        valid_needs = ["richness", "variety", "color", "shape"]

        if diversity_type not in valid_diversity_types:
            logger.error(
                "Invalid diversity type : %s. Must be one of %s.",
                diversity_type,
                valid_diversity_types,
            )
        if need not in valid_needs:
            logger.error("Invalid need: %s. Must be one of %s.", need, valid_needs)