SeqToDF and starting with Test class

parent de1b1540
Hola, ¿Como estas?
Hola, ¿Como estas?
\ No newline at end of file
This diff could not be displayed because it is too large.
......@@ -12,6 +12,10 @@ transformers = "^4.19.0"
torch = {version = "^1.11.0", python = "^3.7", platform = "linux"}
lexical-diversity = "^0.1.1"
emoji = "^1.7.0"
pandas = {version = "^2.1.1", python = ">=3.9,<4.0"}
numpy = {version = "^1.26.0", python = ">=3.9,<3.13"}
scipy = {version = "^1.11.3", python = ">=3.9,<3.13"}
statsmodels = {version = "^0.14.0", python = ">=3.8,<4.0"}
[tool.poetry.dev-dependencies]
pytest = "^5.2"
......
from typing import Optional
from abc import ABC, abstractmethod
import pandas as pd
from collections import defaultdict
class SequenceIterator:
"""
......@@ -253,4 +255,78 @@ class Sequence(ABC):
for r in gen:
yield gen[cont]
cont+=1
\ No newline at end of file
@abstractmethod
def toDF(self, level="metadata"):
    '''
    Convert a Sequence to a pandas DataFrame.

    The route is walked step by step. A "metadata" step contributes one
    column per metadata key of this sequence. A "children" step descends
    one level and contributes, for every group of child sequences, one
    column per metadata key holding the values of that key across the group.
    Column names are "<key><step index>" for metadata steps and
    "<group name><step index><metadata key>" for children steps.

    Args:
        level: the route of the level as string, separating each level with "/".
            Each step is either "metadata" or "children"; any other step is ignored.
    Returns:
        A pandas DataFrame with the sequence information
    '''
    childrenLevel = [self.children]
    # NOTE(review): this list is never replaced while descending, so a
    # "metadata" step always reads this sequence's own metadata, whatever
    # its position in the route -- confirm that this is intended.
    metadataLevel = [self.metadata]
    values = []
    for idx, step in enumerate(level.split("/")):
        if step == "metadata":
            for metadataDic in metadataLevel:
                for key in metadataDic:
                    values.append({str(key) + str(idx): metadataDic[key]})
        elif step == "children":
            nextChildren = []
            for child in childrenLevel:  # each entry maps a group name to its child sequences
                for groupName in child:
                    grouped = {}
                    for seq in child[groupName]:
                        nextChildren.append(seq.children)
                        # Sequences of one group share a level, so they are
                        # expected to expose the same metadata keys.
                        for metadataKey in seq.metadata:
                            grouped.setdefault(metadataKey, []).append(seq.metadata[metadataKey])
                    values.append({str(groupName) + str(idx): grouped})
            childrenLevel = nextChildren

    rows = {}
    # Columns whose content has already been turned into "one list per group".
    wrapped = set()
    for value in values:
        for prefix, cell in value.items():
            # Exact type checks on purpose: dict/list subclasses stored as
            # metadata values keep being treated as opaque cells.
            if type(cell) is dict:
                for metadataKey, items in cell.items():
                    column = prefix + metadataKey
                    if column not in rows:
                        rows[column] = items if type(items) is list else [items]
                    elif type(items) is list and column not in wrapped:
                        # Second group for this column: switch from a flat list
                        # to one list per group. This must happen only once;
                        # every later group is appended as a new row.
                        rows[column] = [list(rows[column]), items]
                        wrapped.add(column)
                    else:
                        rows[column].append(items)
            else:
                try:
                    isEmpty = len(cell) == 0
                except TypeError:
                    # Scalars (numbers, None, ...) have no length: keep them.
                    isEmpty = False
                if not isEmpty and prefix not in rows:
                    rows[prefix] = cell
    return pd.DataFrame(rows)
......@@ -163,4 +163,17 @@ class SequenceDirectory(Sequence):
Returns:
A generator with the result of the filter
'''
return super().filterMetadata(level,criteria)
\ No newline at end of file
return super().filterMetadata(level,criteria)
def toDF(self, level="metadata"):
    '''
    Convert this SequenceDirectory to a pandas DataFrame.

    Delegates to the parent class implementation of toDF.

    Args:
        level: the route of the level as string, separating each level with "/"
    Returns:
        The pandas DataFrame returned by the parent implementation
    '''
    return super().toDF(level)
\ No newline at end of file
......@@ -147,4 +147,17 @@ class SequenceFile (Sequence):
Returns:
A generator with the result of the filter
'''
return super().filterMetadata(level,criteria)
\ No newline at end of file
return super().filterMetadata(level,criteria)
def toDF(self, level="metadata"):
    '''
    Convert this SequenceFile to a pandas DataFrame.

    Delegates to the parent class implementation of toDF.

    Args:
        level: the route of the level as string, separating each level with "/"
    Returns:
        The pandas DataFrame returned by the parent implementation
    '''
    return super().toDF(level)
\ No newline at end of file
......@@ -147,4 +147,17 @@ class SequenceString (Sequence):
'''
return super().filterMetadata(level,criteria)
def toDF(self, level="metadata"):
    '''
    Convert this SequenceString to a pandas DataFrame.

    Delegates to the parent class implementation of toDF.

    Args:
        level: the route of the level as string, separating each level with "/"
    Returns:
        The pandas DataFrame returned by the parent implementation
    '''
    return super().toDF(level)
\ No newline at end of file
......@@ -137,3 +137,16 @@ class SequenceToken (Sequence):
A generator with the result of the filter
'''
return super().filterMetadata(level,criteria)
def toDF(self, level="metadata"):
    '''
    Convert this SequenceToken to a pandas DataFrame.

    Delegates to the parent class implementation of toDF.

    Args:
        level: the route of the level as string, separating each level with "/"
    Returns:
        The pandas DataFrame returned by the parent implementation
    '''
    return super().toDF(level)
from typing import Optional
from abc import ABC, abstractmethod
import pandas as pd
from collections import defaultdict
from scipy.stats import shapiro, normaltest, kstest, anderson, chisquare, jarque_bera
from statsmodels.stats.diagnostic import lilliefors
class Test():
#https://towardsdatascience.com/normality-tests-in-python-31e04aa4f411
def __init__(self,parametricTest=["Shapiro","D'Agostino","Anderson-Darling","Chi-Square","Lilliefors","Jarque–Bera","Kolmogorov-Smirnov"],nonParametricTest=["mannwhitneyu","wilcoxon","kruskal"]):
self.parametricTest = parametricTest
self.nonParametricTest = nonParametricTest
def apply(self,df1,df2):
#Hay que poner gráficas:
# qUARTIL QUARTIL
# Box Plot
# Histograma
pass
def applyParametric(self,df):
testFinal = pd.DataFrame()
#Numeric Cols, hay que filtrar el df
for i in self.parametricTest:
if i == "Shapiro":
test = df.apply(lambda x: shapiro(x), axis=0)
test.index = ['Shapiro stat', 'Shapiro p-value']
test = test.transpose()
elif i == "D'Agostino":
test = df.apply(lambda x: normaltest(x), axis=0)
test.index = ["D'Agostino stat", "D'Agostino p-value"]
test = test.transpose()
elif i == "Anderson-Darling":
test = df.apply(lambda x: anderson(x), axis=0)
test.index = ['Anderson-Darling stat', 'Anderson-Darling crit_val', 'Anderson-Darling sig_level']
test = test.transpose()
pass
elif i == "Chi-Square":
test = df.apply(lambda x: chisquare(x), axis=0)
test.index = ['Chi-Square stat', 'Chi-Square p-value']
test = test.transpose()
pass
elif i == "Lilliefors":
test = df.apply(lambda x: lilliefors(x), axis=0)
test.index = ['Lilliefors stat', 'Lilliefors p-value']
test = test.transpose()
pass
elif i == "Jarque–Bera":
test = df.apply(lambda x: jarque_bera(x), axis=0)
test.index = ['Shapiro stat', 'Shapiro p-value']
test = test.transpose()
pass
elif i == "Kolmogorov-Smirnov":
test = df.apply(lambda x: kstest(x, 'norm'), axis=0)
test.index = ["Kolmogorov-Smirnov stat", "Kolmogorov-Smirnov p-value"]
test = test.transpose()
for t in self.parametricTest:
if t != "Anderson-Darling":
print("Pass the test of"+t)
print(list(test[test[t+' p-value'] > 0.05].index))
else:
for i in range(len(list(test[t+' crit_val'].index))):
sig_level, crit_val = test[t+' sig_level'][i], test[t+' crit_val'][i]
print("Pass the test of"+t)
print(list(test[test[t+' stat'] < crit_val].index),"at {sig_level} level of significance")
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment