Coverage for dqm/diversity/diversity.py: 25%

1"""

2This module, DiversityCalculator, calculates various types of diversity in datasets.

3It focuses on both lexical and visual diversities, employing statistical indices for

4different metrics such as richness, variety, color, and shape. Useful in linguistics,

5image processing, and data analysis, it helps understand the diversity of elements

6in a dataset.

8Authors:

9 Faouzi ADJED

10 Anani DJATO

12Dependencies:

13 numpy

14 collections.Counter

16Classes:

17 DiversityCalculator: A class that provides methods for calculating

18 different types of diversity in datasets.

20Functions: None

22Usage:

23 To use this module, create an instance of the DiversityCalculator class and call its

24 compute_diversity method with appropriate arguments.

25 Example:

26 calculator = DiversityCalculator()

27 diversity_score = calculator.compute_diversity(data, 'lexical', 'richness')

28"""

30from collections import Counter

31from typing import Iterable

32import numpy as np

34from dqm.utils.twe_logger import get_logger

36logger = get_logger()

class DiversityCalculator:
    """
    Compute lexical and visual diversity indices over a dataset.

    Supported (diversity_type, need) combinations:
        ("lexical", "richness"): Shannon index, -sum(p * ln(p)).
        ("lexical", "variety"):  Simpson index, 1 - sum(p ** 2).
        ("visual", "color"):     Richness index, number of distinct values.
        ("visual", "shape"):     Gini-Simpson index, 1 - sum(p ** 2).

    where p is the relative frequency of each distinct value in the data.

    Methods:
        compute_diversity: Calculates diversity based on specified type and need.
        validate_inputs: Logs an error for unsupported type / need values.
    """

    def compute_diversity(
        self, data: Iterable, diversity_type: str, need: str
    ) -> float:
        """
        Compute diversity of given data based on type and need.

        Args:
            data (Iterable): Dataset for diversity computation. One-shot
                iterables (generators, iterators) are materialised into a
                list before use.
            diversity_type (str): Type of diversity ('lexical' or 'visual').
            need (str): Specific need for calculation. 'richness' or 'variety'
                with 'lexical'; 'color' or 'shape' with 'visual'.

        Returns:
            diversity (float): Calculated diversity value.

        Raises:
            ValueError: If the (diversity_type, need) pair is not one of the
                supported combinations. The error is logged first.
        """
        # Generators / iterators can only be walked once and have no length;
        # turn them into a list so every branch sees the actual elements.
        if not hasattr(data, "__len__"):
            data = list(data)

        if diversity_type == "lexical" and need == "richness":
            # Compute lexical richness using Shannon Index
            _, counts = np.unique(data, return_counts=True)
            norm_counts = counts / counts.sum()
            diversity = -(norm_counts * np.log(norm_counts)).sum()
        elif diversity_type == "lexical" and need == "variety":
            # Compute lexical variety using Simpson Index
            _, counts = np.unique(data, return_counts=True)
            norm_counts = counts / counts.sum()
            diversity = 1 - (np.square(norm_counts)).sum()
        elif diversity_type == "visual" and need == "color":
            # Compute color diversity using Richness Index: the number of
            # distinct values, so no frequencies are needed.
            diversity = len(Counter(data))
        elif diversity_type == "visual" and need == "shape":
            # Compute shape diversity using Gini-Simpson Index
            counts = np.array(list(Counter(data).values()))
            freqs = counts / counts.sum()
            diversity = 1 - np.sum(freqs**2)
        else:
            logger.error("Invalid diversity type or need.")
            # Fail explicitly instead of reaching the return statement with
            # no computed value.
            raise ValueError(
                "Invalid diversity type or need: "
                f"diversity_type={diversity_type!r}, need={need!r}."
            )

        return float(diversity)  # To be homogenous on output data type

    def validate_inputs(self, diversity_type: str, need: str) -> None:
        """
        Log an error for each argument that is not a supported value.

        The two arguments are checked independently of each other; the
        method does not check that they form a supported combination
        (for example 'lexical' with 'color' passes both checks). Nothing is
        raised and nothing is returned: problems are only reported through
        the module logger.

        Args:
            diversity_type (str): Type of diversity to be computed.
            need (str): Specific need for diversity calculation.
        """
        valid_diversity_types = ["lexical", "visual"]
        valid_needs = ["richness", "variety", "color", "shape"]

        if diversity_type not in valid_diversity_types:
            logger.error(
                "Invalid diversity type : %s. Must be one of %s.",
                diversity_type,
                valid_diversity_types,
            )
        if need not in valid_needs:
            logger.error("Invalid need: %s. Must be one of %s.", need, valid_needs)