new data visualization functions added

parent 329ef996
...@@ -158,4 +158,4 @@ cython_debug/ ...@@ -158,4 +158,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear # and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
./Examples/df_trans_merged_textflow.csv Examples/df_trans_merged_textflow.csv
\ No newline at end of file \ No newline at end of file
...@@ -16,6 +16,8 @@ pandas = {version = "^2.1.1", python = ">=3.9,<4.0"} ...@@ -16,6 +16,8 @@ pandas = {version = "^2.1.1", python = ">=3.9,<4.0"}
numpy = {version = "^1.26.0", python = ">=3.9,<3.13"} numpy = {version = "^1.26.0", python = ">=3.9,<3.13"}
scipy = {version = "^1.11.3", python = ">=3.9,<3.13"} scipy = {version = "^1.11.3", python = ">=3.9,<3.13"}
statsmodels = {version = "^0.14.0", python = ">=3.8,<4.0"} statsmodels = {version = "^0.14.0", python = ">=3.8,<4.0"}
matplotlib = {version = "^3.8.0", python = ">=3.9,<4.0"}
seaborn = {version = "^0.13.0", python = ">=3.8,<4.0"}
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
pytest = "^5.2" pytest = "^5.2"
......
from typing import Optional from typing import Optional
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
import pandas as pd import pandas as pd
import numpy as np
from collections import defaultdict from collections import defaultdict
from scipy.stats import shapiro, normaltest, kstest, anderson, chisquare, jarque_bera from scipy.stats import shapiro, normaltest, kstest, anderson, chisquare, jarque_bera, mannwhitneyu, wilcoxon, kruskal, ttest_ind, ttest_rel, f_oneway
from statsmodels.stats.diagnostic import lilliefors from statsmodels.stats.diagnostic import lilliefors
class Test(): class Test():
#https://towardsdatascience.com/normality-tests-in-python-31e04aa4f411 #https://towardsdatascience.com/normality-tests-in-python-31e04aa4f411
def __init__(self,normalityTest=["Shapiro","D'Agostino","Anderson-Darling","Chi-Square","Lilliefors","Jarque–Bera","Kolmogorov-Smirnov"], parametricTest=["mannwhitneyu","wilcoxon","kruskal"], nonParametricTest=["mannwhitneyu","wilcoxon","kruskal"]): def __init__(self,normalityTest=["Shapiro","D'Agostino","Anderson-Darling","Chi-Square","Lilliefors","Jarque–Bera","Kolmogorov-Smirnov"], parametricTest=["Students t-test","Paired Students t-Test", "ANOVA"], nonParametricTest=["mannwhitneyu","wilcoxon","kruskal"],alpha=0.05):
self.normalityTest = normalityTest self.normalityTest = normalityTest
self.parametricTest = parametricTest self.parametricTest = parametricTest #https://machinelearningmastery.com/parametric-statistical-significance-tests-in-python/
self.nonParametricTest = nonParametricTest self.nonParametricTest = nonParametricTest #https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
self.alpha = alpha
def apply(self,df1,df2): def apply(self,df1,df2):
...@@ -79,8 +81,84 @@ class Test(): ...@@ -79,8 +81,84 @@ class Test():
for i in range(len(crit_val)): for i in range(len(crit_val)):
print("Pass the test of "+t) print("Pass the test of "+t)
print(list(testFinal[testFinal[t+' stat'] < crit_val[i]].index),"at "+str(sig_level[i])+" level of significance") print(list(testFinal[testFinal[t+' stat'] < crit_val[i]].index),"at "+str(sig_level[i])+" level of significance")
dicResult[t+' '+sig_level[i]+' sig_lev'] = list(testFinal[testFinal[t+' p-value'] > 0.05].index) dicResult[t+' '+str(sig_level[i])+' sig_lev'] = list(testFinal[testFinal[t+' stat'] < crit_val[i]].index)
return testFinal, dicResult return testFinal, dicResult
def applyNonParametricTest(self, df1, df2, criteriaColumn1,criteriaColumn2, contrastCriteriaColumns): #https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
\ No newline at end of file columnsDF=['Feature', 'Criteria_1', 'Criteria_2']
dicResult = {}
for npt in self.nonParametricTest:
columnsDF.extend([npt+" stat", npt+" p-value"])
dfResult = pd.DataFrame(columns=columnsDF)
dicResult[npt] = {"Reject H0":[],"Fail to Reject H0": []} #Reject == Different distribution, Fail to Reject == Same distribution
for col in contrastCriteriaColumns:
row=[col, criteriaColumn1,criteriaColumn2]
if "mannwhitneyu" in self.nonParametricTest:
stat_mw, p_value_mw = mannwhitneyu(df1[col], df2[col])
if p_value_mw > self.alpha:
dicResult["mannwhitneyu"]['Fail to Reject H0'].append(col)
else:
dicResult["mannwhitneyu"]["Reject H0"].append(col)
row.extend([stat_mw, p_value_mw])
if "wilcoxon" in self.nonParametricTest:
if len(df1) == len(df2[col]):
stat_wc, p_value_w = wilcoxon(df1[col], df2[col])
if p_value_w > self.alpha:
dicResult["wilcoxon"]['Fail to Reject H0'].append(col)
else:
dicResult["wilcoxon"]["Reject H0"].append(col)
row.extend([stat_wc, p_value_w])
else:
row.extend([np.nan, np.nan])
if "kruskal" in self.nonParametricTest:
stat_k, p_value_k = kruskal(df1[col], df2[col])
if p_value_k > self.alpha:
dicResult["kruskal"]['Fail to Reject H0'].append(col)
else:
dicResult["kruskal"]["Reject H0"].append(col)
row.extend([stat_k, p_value_k])
dfResult = dfResult._append(pd.Series(row,index=dfResult.columns), ignore_index = True)
print(dfResult)
print(dicResult)
return dfResult, dicResult
def applyParametricTest(self, df1, df2, criteriaColumn1,criteriaColumn2, contrastCriteriaColumns): #https://machinelearningmastery.com/parametric-statistical-significance-tests-in-python/
columnsDF=['Feature', 'Criteria_1', 'Criteria_2']
dicResult = {}
for pt in self.parametricTest:
columnsDF.extend([pt+" stat", pt+" p-value"])
dfResult = pd.DataFrame(columns=columnsDF)
dicResult[pt] = {"Reject H0":[],"Fail to Reject H0": []} #Reject == Different distribution, Fail to Reject == Same distribution
for col in contrastCriteriaColumns:
row=[col, criteriaColumn1,criteriaColumn2]
if "Students t-test" in self.parametricTest:
stat_ttestInd, p_value_ttestInd = mannwhitneyu(df1[col], df2[col])
if p_value_ttestInd > self.alpha:
dicResult["Students t-test"]['Fail to Reject H0'].append(col)
else:
dicResult["Students t-test"]["Reject H0"].append(col)
row.extend([stat_ttestInd, p_value_ttestInd])
if "Paired Students t-Test" in self.parametricTest:
if len(df1) == len(df2[col]):
stat_ttestRel, p_value_ttestRel = ttest_rel(df1[col], df2[col])
if p_value_ttestRel > self.alpha:
dicResult["Paired Students t-Test"]['Fail to Reject H0'].append(col)
else:
dicResult["Paired Students t-Test"]["Reject H0"].append(col)
row.extend([stat_ttestRel, p_value_ttestRel])
else:
row.extend([np.nan, np.nan])
if "ANOVA" in self.parametricTest:
stat_anova, p_value_anova = f_oneway(df1[col], df2[col])
if p_value_anova > self.alpha:
dicResult["ANOVA"]['Fail to Reject H0'].append(col)
else:
dicResult["ANOVA"]["Reject H0"].append(col)
row.extend([stat_anova, p_value_anova])
dfResult = dfResult._append(pd.Series(row,index=dfResult.columns), ignore_index = True)
print(dfResult)
print(dicResult)
return dfResult, dicResult
\ No newline at end of file
from typing import Optional
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
import seaborn as sns
import pylab
import scipy.stats as stats
import matplotlib.pyplot as plt
from numpy import linalg as LA
import math
class Visualization():
    """Draw grids of per-column plots and optionally save them to disk.

    Each ``show_*`` method lays out one subplot per requested column in a
    grid that is ``n_cols`` wide, shows the figure, and, when asked, saves
    it under ``savePath``.
    """

    def __init__(self, savePath, n_cols=6, cmap=None):
        """Store the layout settings and normalise the output directory.

        Args:
            savePath: Directory prefix for saved pictures. A trailing path
                separator is appended when missing; an empty string is kept
                as-is (pictures are then saved relative to the current
                directory).
            n_cols: Number of subplot columns in every grid.
            cmap: Colormap stored on the instance. When ``None`` a fresh
                ``sns.diverging_palette(230, 20, as_cmap=True)`` is created
                for this instance rather than one shared object built at
                class-definition time.
        """
        import os

        self.n_cols = n_cols
        self.cmap = sns.diverging_palette(230, 20, as_cmap=True) if cmap is None else cmap
        if not savePath or savePath.endswith(("/", "\\")):
            self.savePath = savePath
        elif "/" in savePath:
            self.savePath = savePath + "/"
        elif "\\" in savePath:
            # Windows-style path: keep its own separator.
            self.savePath = savePath + "\\"
        else:
            # Bare directory name: use the separator of the running platform.
            self.savePath = savePath + os.sep

    @staticmethod
    def _check_picture_name(savePicture, pictureName):
        """Fail early when saving is requested without a file name."""
        if savePicture and pictureName is None:
            raise ValueError("pictureName is required when savePicture is True")

    def _grid(self, n_plots, figsize):
        """Create a figure with enough rows of ``n_cols`` axes for ``n_plots``."""
        nrows = max(1, math.ceil(n_plots / self.n_cols))
        # squeeze=False keeps the axes array 2-D even for a single row or
        # column, so axs[row, col] indexing always works.
        return plt.subplots(nrows=nrows, ncols=self.n_cols, figsize=figsize,
                            sharey=False, squeeze=False)

    def _finish(self, savePicture, pictureName):
        """Tighten the layout, optionally save the figure, then show it."""
        plt.tight_layout()
        if savePicture:
            plt.savefig(self.savePath + pictureName)
        plt.show()

    def show_distplots(self, df, columns, savePicture = False, pictureName= None): #columns == numeric_cols
        """Plot a density histogram with a KDE overlay for each column.

        Args:
            df: Data indexed by column name.
            columns: Column names to plot, one subplot each.
            savePicture: Save the figure when true.
            pictureName: File name appended to ``savePath``; required when
                ``savePicture`` is true.
        """
        self._check_picture_name(savePicture, pictureName)
        fig, axs = self._grid(len(columns), (30, 50))
        for i, target in enumerate(columns):
            r, c = divmod(i, self.n_cols)
            ax = axs[r, c]
            sns.histplot(df[target], kde=True, stat="density", kde_kws=dict(cut=3),
                         alpha=.4, edgecolor=(1, 1, 1, .4), ax=ax)
            ax.set_title(target)
        self._finish(savePicture, pictureName)

    def show_probplots(self, df, columns, savePicture = False, pictureName= None): #columns == numeric_cols
        """Plot a normal probability plot for each column.

        Args:
            df: Data indexed by column name.
            columns: Column names to plot, one subplot each.
            savePicture: Save the figure when true.
            pictureName: File name appended to ``savePath``; required when
                ``savePicture`` is true.
        """
        self._check_picture_name(savePicture, pictureName)
        fig, axs = self._grid(len(columns), (30, 50))
        for i, target in enumerate(columns):
            r, c = divmod(i, self.n_cols)
            ax = axs[r, c]
            stats.probplot(df[target], dist="norm", plot=ax)
            ax.set_title(target)
        self._finish(savePicture, pictureName)

    def show_kde(self, df, columns, group, savePicture = False, pictureName= None):
        """Plot one KDE curve per value of ``group`` for each column.

        Args:
            df: Data indexed by column name.
            columns: Column names to plot, one subplot each.
            group: Column whose unique values define the curves.
            savePicture: Save the figure when true.
            pictureName: File name appended to ``savePath``; required when
                ``savePicture`` is true.
        """
        self._check_picture_name(savePicture, pictureName)
        group_unique_values = list(df[group].unique())
        fig, axs = self._grid(len(columns), (30, 50))
        for i, c in enumerate(columns):
            r, k = divmod(i, self.n_cols)
            ax = axs[r, k]
            for value in group_unique_values:
                try:
                    df[df[group] == value][c].plot.kde(
                        alpha=1.0/len(group_unique_values), label=value, ax=ax)
                except LA.LinAlgError:
                    # Best effort: a group whose KDE cannot be estimated is
                    # left out of the subplot instead of aborting the figure.
                    continue
            ax.set_title(c)
            ax.legend(loc='upper right')
        self._finish(savePicture, pictureName)

    def show_boxplot(self, df, columns, x, hue= None, savePicture= False, pictureName=None):
        """Plot a box plot with overlaid strip plot for each column.

        Args:
            df: Data passed to seaborn as ``data``.
            columns: Column names used as the y variable, one subplot each.
            x: Column used as the x variable.
            hue: Optional column used to split each box.
            savePicture: Save the figure when true.
            pictureName: File name appended to ``savePath``; required when
                ``savePicture`` is true.
        """
        self._check_picture_name(savePicture, pictureName)
        fig, axs = self._grid(len(columns), (30, 60))
        for i, target in enumerate(columns):
            r, c = divmod(i, self.n_cols)
            ax = axs[r, c]
            # hue=None is seaborn's default, so one call covers both cases.
            sns.stripplot(ax=ax, x=x, y=target, hue=hue, data=df, color='black', size=10, alpha=0.3)
            sns.boxplot(ax=ax, x=x, y=target, hue=hue, data=df, showmeans=True,
                        meanprops=dict(marker='D', markeredgecolor='black', markerfacecolor='firebrick'))
        self._finish(savePicture, pictureName)

    def show_wordCloud(self):
        """Placeholder; not implemented."""
        pass
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment