added lexical diversity analyzer

b2fc8dbd · Arturo Montejo Ráez · db893efe · b2fc8dbd · b2fc8dbd · b2fc8dbd
Commit b2fc8dbd authored Jun 30, 2022 by Arturo Montejo Ráez
Showing with 78 additions and 4 deletions
README.md
requirements.txt
textflow/LexicalDiversityAnalyzer.py
--- a/README.md
+++ b/README.md
@@ -14,10 +14,17 @@ In this library we have sequences and analyzers.
    + **Children:** This is a dictionary where we store the lists of sequences that come from the current sequence. For example, if we have a text, we can split it into phrases. "Phrases" will be the key in the dictionary, and each phrase of the text will be a sequence inside the list stored under that key. Each phrase can be split into words too, which we store in the children dictionary of the phrase sequences. So, inside the original sequence (text) we have sequences of phrases, and inside them sequences of words. This forms the different levels of a sequence.
-        + The level in a sequence is used like a path in a directory, to access the different subsequences in analyzers or filter funtions. In our example we have:
+        <ul> 
-            - Text
+            <li> The level in a sequence is used like a path in a directory, to access the different subsequences in analyzers or filter functions. In our example we have:
-                - Phrases
+            <ul>
-                    - Words 
+                <li> Text
+                <ul> 
+                    <li> Phrases
+                        <ul> 
+                            <li> Words 
+                        </ul>
+                </ul>
+            </ul>
        So, to access the children at the Words level we can use "Phrases/Words" in filter or analyze. In the same way, we can use "Phrases/Words/text" to access a text (a value of the metadata dictionary) at the Words level in functions like filterMetadata or analyze.

--- a/requirements.txt
+++ b/requirements.txt
@@ -37,3 +37,4 @@ typer==0.4.1; python_version >= "3.6"
 typing-extensions==4.2.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" and python_full_version >= "3.7.0"
 urllib3==1.26.9; python_full_version >= "3.7.0" and python_version < "4" and python_version >= "3.6"
 wasabi==0.9.1; python_version >= "3.6"
+lexical-diversity>=0.1.1
--- a/textflow/LexicalDiversityAnalyzer.py
+++ b/textflow/LexicalDiversityAnalyzer.py
+import string
+from typing import Optional
+from nltk.text import Text
+from nltk.tokenize import WhitespaceTokenizer
+from lexical_diversity import lex_div as ld
+import math
+from textflow.Analyzer import Analyzer
+class LexicalDiversityAnalyzer(Analyzer):
+    """
+    A class that provides methods to analyze the lexical diversity of the text of a sequence.
+
+    Attributes:
+        lemmatizer: a callable that receives a text and returns its tokens as
+            lemmas (lemmas are preferred over the original word forms).
+    """
+
+    def __init__(self, lemmatizer):
+        """
+        Create a lexical diversity analyzer.
+
+        Args:
+            lemmatizer: a function to tokenize the text in lemmas
+        """
+        self.lemmatizer = lemmatizer
+
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str]= ""):
+        """
+        Analyze a sequence with the lexical diversity function.
+
+        The work is delegated to Analyzer.analyze, which receives
+        self.lexicalDiversity as the function to apply.
+
+        Args:
+            sequence: the Sequence we want to analyze.
+            tag: the label to store the analysis result.
+            levelOfAnalyzer: the path of the sequence level to analyze inside of the result.
+            levelOfResult: the path of the sequence level to store the result.
+        """
+        super().analyze(self.lexicalDiversity, sequence, tag, levelOfAnalyzer, levelOfResult, True)
+
+    def lexicalDiversity(self, arrayText):
+        """
+        Compute the lexical diversity measures of a list of texts.
+
+        Each text is lower-cased and passed to self.lemmatizer; the resulting
+        lemmas are fed to every measure of the lexical_diversity package.
+
+        Args:
+            arrayText: list that contains the texts that we want to analyze
+
+        Returns:
+            A list with one dictionary per input text, in the same order.
+            Each dictionary maps the measure name ("SimpleTTR", "RootTTR",
+            "LogTTR", "MaasTTR", "MSTTR", "MATTR", "HDD", "MTLD",
+            "MTLDMAWrap", "MTLDMABi") to its value. A text that yields no
+            lemmas gets 0.0 for every measure.
+        """
+        # Result key -> measure function. Declared once so the key set and
+        # its order are the same for every text.
+        measures = {
+            "SimpleTTR": ld.ttr,
+            "RootTTR": ld.root_ttr,
+            "LogTTR": ld.log_ttr,
+            "MaasTTR": ld.maas_ttr,
+            "MSTTR": ld.msttr,
+            "MATTR": ld.mattr,
+            "HDD": ld.hdd,
+            "MTLD": ld.mtld,
+            "MTLDMAWrap": ld.mtld_ma_wrap,
+            "MTLDMABi": ld.mtld_ma_bid,
+        }
+        resultsList = []
+        for t in arrayText:
+            lemmas = self.lemmatizer(t.lower())
+            if len(lemmas) == 0:
+                # NOTE(review): the log-based measures are expected to fail
+                # on an empty token list (logarithm of zero) -- confirm
+                # against the library. Lexical diversity is undefined
+                # without tokens, so 0.0 is reported instead of letting one
+                # empty text abort the whole batch.
+                result = {name: 0.0 for name in measures}
+            else:
+                result = {name: measure(lemmas) for name, measure in measures.items()}
+            resultsList.append(result)
+        return resultsList
\ No newline at end of file