Coverage for dqm/diversity/diversity.py: 25%

32 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-05 14:00 +0000

1""" 

2This module provides the DiversityCalculator class, which calculates various types of diversity in datasets. 

3It focuses on both lexical and visual diversities, employing statistical indices for 

4different metrics such as richness, variety, color, and shape. Useful in linguistics, 

5image processing, and data analysis, it helps understand the diversity of elements 

6in a dataset. 

7 

8Authors: 

9 Faouzi ADJED 

10 Anani DJATO 

11 

12Dependencies: 

13 numpy 

14 collections.Counter 

15 

16Classes: 

17 DiversityCalculator: A class that provides methods for calculating 

18 different types of diversity in datasets. 

19 

20Functions: None 

21 

22Usage: 

23 To use this module, create an instance of the DiversityCalculator class and call its 

24 compute_diversity method with appropriate arguments. 

25 Example: 

26 calculator = DiversityCalculator() 

27 diversity_score = calculator.compute_diversity(data, 'lexical', 'richness') 

28""" 

29 

30from collections import Counter 

31from typing import Iterable 

32import numpy as np 

33 

34from dqm.utils.twe_logger import get_logger 

35 

36logger = get_logger() 

37 

38 

class DiversityCalculator:
    """
    Compute lexical and visual diversity scores for a dataset.

    Every supported (diversity_type, need) pair maps to one index computed
    from the counts of the distinct items found in the data, where ``p`` is
    the relative frequency of each distinct item:

        ("lexical", "richness") -> Shannon index, ``-sum(p * ln(p))``
        ("lexical", "variety")  -> Simpson index, computed as ``1 - sum(p**2)``
        ("visual", "color")     -> richness, the number of distinct items
        ("visual", "shape")     -> Gini-Simpson index, ``1 - sum(p**2)``

    Methods:
        compute_diversity: Calculates diversity based on specified type and need.
        validate_inputs: Logs an error for unsupported type / need values.
    """

    # The only (diversity_type, need) pairs compute_diversity knows how to handle.
    _SUPPORTED_PAIRS = (
        ("lexical", "richness"),
        ("lexical", "variety"),
        ("visual", "color"),
        ("visual", "shape"),
    )

    def compute_diversity(
        self, data: Iterable, diversity_type: str, need: str
    ) -> float:
        """
        Compute diversity of given data based on type and need.

        Args:
            data (Iterable): Dataset for diversity computation. One-shot
                iterables such as generators are accepted; they are
                materialised into a list before being counted.
            diversity_type (str): Type of diversity ('lexical' or 'visual').
            need (str): Specific need for calculation. 'richness' and 'variety'
                go with 'lexical'; 'color' and 'shape' go with 'visual'.

        Returns:
            diversity (float): Calculated diversity value.

        Raises:
            ValueError: If the (diversity_type, need) pair is not supported.
        """
        # np.unique cannot consume a generator and len() is undefined for it,
        # so turn any unsized iterable into a list exactly once.
        if not hasattr(data, "__len__"):
            data = list(data)

        if diversity_type == "lexical" and need in ("richness", "variety"):
            _, counts = np.unique(data, return_counts=True)
            norm_counts = counts / counts.sum()
            if need == "richness":
                # Shannon index. Adding 0.0 turns the IEEE negative zero
                # produced for homogeneous data (-(1 * ln 1)) into plain 0.0.
                diversity = -(norm_counts * np.log(norm_counts)).sum() + 0.0
            else:
                # Simpson index, expressed as one minus the sum of squares.
                diversity = 1 - np.square(norm_counts).sum()
        elif diversity_type == "visual" and need in ("color", "shape"):
            counter = Counter(data)
            if need == "color":
                # Richness index: simply the number of distinct items.
                diversity = len(counter)
            else:
                # Gini-Simpson index.
                freqs = np.array(list(counter.values())) / len(data)
                diversity = 1 - np.sum(freqs**2)
        else:
            # Previously this branch only logged and then crashed with an
            # UnboundLocalError on the return statement below.
            logger.error("Invalid diversity type or need.")
            raise ValueError(
                f"Unsupported diversity request: diversity_type={diversity_type!r}, "
                f"need={need!r}. Supported pairs: {list(self._SUPPORTED_PAIRS)}."
            )

        return float(diversity)  # To be homogeneous on output data type

    def validate_inputs(self, diversity_type: str, need: str) -> None:
        """
        Check the arguments intended for compute_diversity and log any problem.

        This method never raises and always returns None; problems are only
        reported through the module logger.

        Args:
            diversity_type (str): Type of diversity to be computed.
            need (str): Specific need for diversity calculation.
        """
        valid_diversity_types = ["lexical", "visual"]
        valid_needs = ["richness", "variety", "color", "shape"]

        type_ok = diversity_type in valid_diversity_types
        need_ok = need in valid_needs

        if not type_ok:
            logger.error(
                "Invalid diversity type : %s. Must be one of %s.",
                diversity_type,
                valid_diversity_types,
            )
        if not need_ok:
            logger.error("Invalid need: %s. Must be one of %s.", need, valid_needs)

        # Both values can be valid individually and still form a pair that
        # compute_diversity rejects, e.g. ("lexical", "color").
        if type_ok and need_ok and (diversity_type, need) not in self._SUPPORTED_PAIRS:
            logger.error(
                "Invalid combination: need %s is not available for diversity type %s.",
                need,
                diversity_type,
            )