SeqToDF and starting with Test class

c22942f3 · Estrella Vallecillo · de1b1540 · c22942f3 · c22942f3 · c22942f3
Commit c22942f3 authored Oct 06, 2023 by Estrella Vallecillo
Showing with 212 additions and 6 deletions
Examples/ExampleDirectory/Documento sin título copy.txt
Examples/df_trans_merged_textflow.csv
poetry.lock
pyproject.toml
textflow/Sequence.py
textflow/SequenceDirectory.py
textflow/SequenceFile.py
textflow/SequenceString.py
textflow/SequenceToken.py
textflow/Test.py
--- a/Examples/ExampleDirectory/Documento sin título copy.txt
+++ b/Examples/ExampleDirectory/Documento sin título copy.txt
+Hola, ¿Como estas?
+Hola, ¿Como estas?
\ No newline at end of file
--- a/Examples/df_trans_merged_textflow.csv
+++ b/Examples/df_trans_merged_textflow.csv
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,10 @@ transformers = "^4.19.0"
 torch = {version = "^1.11.0", python = "^3.7", platform = "linux"}
 lexical-diversity = "^0.1.1"
 emoji = "^1.7.0"
+pandas = {version = "^2.1.1", python = ">=3.9,<4.0"}
+numpy = {version = "^1.26.0", python = ">=3.9,<3.13"}
+scipy = {version = "^1.11.3", python = ">=3.9,<3.13"}
+statsmodels = {version = "^0.14.0", python = ">=3.8,<4.0"}

 [tool.poetry.dev-dependencies]
 pytest = "^5.2"

--- a/textflow/Sequence.py
+++ b/textflow/Sequence.py
 from typing import Optional
 from abc import ABC, abstractmethod
+import pandas as pd
+from collections import defaultdict

 class SequenceIterator:
    """
@@ -253,4 +255,78 @@ class Sequence(ABC):
        for r in gen:
            yield gen[cont]
            cont+=1
-        
\ No newline at end of file
+    
+    @abstractmethod
+    def toDF(self, level="metadata"):
+        '''
+        Convert a Sequence to a pandas DataFrame
+
+        Args:
+            level: the route of the level as string, separating each level with "/".
+                Each step is either "metadata" (read the metadata of this
+                Sequence) or "children" (collect the metadata of the Sequences
+                one level further down). Any other step is ignored.
+
+        Returns:
+            A pandas DataFrame with the sequence information. Column names are
+            built from the metadata/children key followed by the position of
+            the step in the route and, for dictionary values, the inner key.
+        '''
+        # "metadata" steps always read this Sequence's own metadata: only the
+        # children list is advanced while walking the route.
+        metadata = self.metadata
+        children = [self.children]
+        values = []
+        for idx, step in enumerate(level.split("/")):
+            if step == "metadata":
+                for key in metadata:
+                    values.append({str(key)+str(idx): metadata[key]})
+            elif step == "children":
+                nextChildren = []
+                for child in children:  # each entry is the children dictionary of one Sequence
+                    for ch in child:  # child[ch] holds the Sequences stored under the key ch
+                        grouped = {}
+                        for seq in child[ch]:
+                            nextChildren.append(seq.children)
+                            # Sequences of the same level are expected to share
+                            # the same metadata keys, so values are grouped by key.
+                            for metadataKey in seq.metadata:
+                                grouped.setdefault(metadataKey, []).append(seq.metadata[metadataKey])
+                        values.append({str(ch)+str(idx): grouped})
+                children = nextChildren
+
+        finalRows = {}
+        # Columns whose flat list has already been turned into a list of groups.
+        nested = set()
+        for value in values:
+            for v, content in value.items():
+                if isinstance(content, dict):
+                    for keyValue, cell in content.items():
+                        column = v + keyValue
+                        if column not in finalRows:
+                            # Copy lists so later appends never touch the source metadata.
+                            finalRows[column] = list(cell) if isinstance(cell, list) else [cell]
+                        elif isinstance(cell, list) and column not in nested:
+                            # Second group for this column: switch once from a
+                            # flat list of values to a list with one entry per group.
+                            finalRows[column] = [finalRows[column], cell]
+                            nested.add(column)
+                        else:
+                            finalRows[column].append(cell)
+                else:
+                    try:
+                        isEmpty = len(content) == 0
+                    except TypeError:
+                        # Numbers, None, ... have no length and are never "empty".
+                        isEmpty = False
+                    if not isEmpty and v not in finalRows:
+                        finalRows[v] = content
+
+        if finalRows and not any(pd.api.types.is_list_like(col) for col in finalRows.values()):
+            # pandas refuses a dict made only of scalars unless an index is given.
+            return pd.DataFrame(finalRows, index=[0])
+        return pd.DataFrame(finalRows)
--- a/textflow/SequenceDirectory.py
+++ b/textflow/SequenceDirectory.py
@@ -163,4 +163,17 @@ class SequenceDirectory(Sequence):
        Returns:
            A generator with the result of the filter
        '''
-        return super().filterMetadata(level,criteria)
\ No newline at end of file
+        return super().filterMetadata(level,criteria)
+
+    def toDF(self, level="metadata"):
+        '''
+        Convert the Sequence to a pandas DataFrame
+
+        Args:
+            level: the route of the level as string, separating each level with "/"
+
+        Returns:
+            A pandas DataFrame with the sequence information
+        '''
+        return super().toDF(level)
\ No newline at end of file
--- a/textflow/SequenceFile.py
+++ b/textflow/SequenceFile.py
@@ -147,4 +147,17 @@ class SequenceFile (Sequence):
        Returns:
            A generator with the result of the filter
        '''
-        return super().filterMetadata(level,criteria)
\ No newline at end of file
+        return super().filterMetadata(level,criteria)
+    
+    def toDF(self, level="metadata"):
+        '''
+        Convert the Sequence to a pandas DataFrame
+
+        Args:
+            level: the route of the level as string, separating each level with "/"
+
+        Returns:
+            A pandas DataFrame with the sequence information
+        '''
+        return super().toDF(level)
\ No newline at end of file
--- a/textflow/SequenceString.py
+++ b/textflow/SequenceString.py
@@ -147,4 +147,17 @@ class SequenceString (Sequence):
        '''
        return super().filterMetadata(level,criteria)

+    def toDF(self, level="metadata"):
+        '''
+        Convert the Sequence to a pandas DataFrame
+
+        Args:
+            level: the route of the level as string, separating each level with "/"
+
+        Returns:
+            A pandas DataFrame with the sequence information
+        '''
+        return super().toDF(level)
+
    
\ No newline at end of file
--- a/textflow/SequenceToken.py
+++ b/textflow/SequenceToken.py
@@ -137,3 +137,16 @@ class SequenceToken (Sequence):
            A generator with the result of the filter
        '''
        return super().filterMetadata(level,criteria)
+
+    def toDF(self, level="metadata"):
+        '''
+        Convert the Sequence to a pandas DataFrame
+
+        Args:
+            level: the route of the level as string, separating each level with "/"
+
+        Returns:
+            A pandas DataFrame with the sequence information
+        '''
+        return super().toDF(level)
--- a/textflow/Test.py
+++ b/textflow/Test.py
+from typing import Optional
+from abc import ABC, abstractmethod
+import pandas as pd
+from collections import defaultdict
+from scipy.stats import shapiro, normaltest, kstest, anderson, chisquare, jarque_bera
+from statsmodels.stats.diagnostic import lilliefors
+
+class Test():
+    '''
+    Run statistical tests over the columns of a pandas DataFrame.
+
+    Attributes are the names of the tests to run: parametricTest lists the
+    normality tests used by applyParametric, nonParametricTest is stored for
+    later use (no method reads it yet).
+    '''
+    #https://towardsdatascience.com/normality-tests-in-python-31e04aa4f411
+    def __init__(self,parametricTest=None,nonParametricTest=None):
+        '''
+        Create the Test object.
+
+        Args:
+            parametricTest: names of the normality tests to run. Defaults to
+                every supported test.
+            nonParametricTest: names of the non parametric tests.
+        '''
+        # Defaults are built per instance so that they are never shared between objects.
+        if parametricTest is None:
+            parametricTest = ["Shapiro","D'Agostino","Anderson-Darling","Chi-Square","Lilliefors","Jarque–Bera","Kolmogorov-Smirnov"]
+        if nonParametricTest is None:
+            nonParametricTest = ["mannwhitneyu","wilcoxon","kruskal"]
+        self.parametricTest = parametricTest
+        self.nonParametricTest = nonParametricTest
+
+    def apply(self,df1,df2):
+        '''
+        Placeholder: not implemented yet, returns None.
+        '''
+        # TODO: add plots:
+        #   quantile-quantile plot (presumably a Q-Q plot; confirm with the author)
+        #   box plot
+        #   histogram
+        pass
+
+    @staticmethod
+    def _runNormalityTest(name, data):
+        '''
+        Run one of the tests that return a (statistic, p-value) pair.
+
+        Raises:
+            ValueError: if name is not a supported test.
+        '''
+        # Functions are looked up lazily, so only the requested ones are needed.
+        if name == "Shapiro":
+            return shapiro(data)
+        if name == "D'Agostino":
+            return normaltest(data)
+        if name == "Chi-Square":
+            return chisquare(data)
+        if name == "Lilliefors":
+            return lilliefors(data)
+        if name == "Jarque–Bera":
+            return jarque_bera(data)
+        if name == "Kolmogorov-Smirnov":
+            # Compared against the standard normal distribution, as scipy's 'norm' does.
+            return kstest(data, 'norm')
+        raise ValueError("Unknown parametric test: " + str(name))
+
+    def applyParametric(self,df,alpha=0.05):
+        '''
+        Run every test of self.parametricTest over each numeric column of df
+        and print, for each test, the columns that pass it.
+
+        Args:
+            df: the pandas DataFrame to test. Non numeric columns are ignored.
+            alpha: a column passes a test when its p-value is greater than alpha.
+
+        Returns:
+            A pandas DataFrame with one row per numeric column of df and the
+            columns "<test> stat" and "<test> p-value" for each test. For
+            Anderson-Darling the columns are "<test> stat", "<test> crit_val"
+            and "<test> sig_level", the last two holding one list per row.
+
+        Raises:
+            ValueError: if self.parametricTest has an unsupported test name.
+        '''
+        numericDf = df.select_dtypes(include="number")
+        rows = []
+        for _, data in numericDf.items():
+            row = {}
+            for name in self.parametricTest:
+                if name == "Anderson-Darling":
+                    # No p-value here: the statistic is compared with one
+                    # critical value per significance level.
+                    result = anderson(data)
+                    row[name+' stat'] = result.statistic
+                    row[name+' crit_val'] = list(result.critical_values)
+                    row[name+' sig_level'] = list(result.significance_level)
+                else:
+                    result = self._runNormalityTest(name, data)
+                    row[name+' stat'] = result[0]
+                    row[name+' p-value'] = result[1]
+            rows.append(row)
+        testFinal = pd.DataFrame(rows, index=numericDf.columns)
+
+        names = list(numericDf.columns)
+        for name in self.parametricTest:
+            if name != "Anderson-Darling":
+                print("Pass the test of "+name)
+                print([col for col, row in zip(names, rows) if row[name+' p-value'] > alpha])
+            elif rows:
+                # The levels come from the first column; each column is checked
+                # against its own critical value at the same position.
+                for i, sig_level in enumerate(rows[0][name+' sig_level']):
+                    print("Pass the test of "+name)
+                    print([col for col, row in zip(names, rows) if row[name+' stat'] < row[name+' crit_val'][i]],f"at {sig_level} level of significance")
+        return testFinal
+                
+                
+                
\ No newline at end of file