new data visualization functions added

parent 329ef996
......@@ -158,4 +158,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
./Examples/df_trans_merged_textflow.csv
\ No newline at end of file
Examples/df_trans_merged_textflow.csv
\ No newline at end of file
......@@ -16,6 +16,8 @@ pandas = {version = "^2.1.1", python = ">=3.9,<4.0"}
numpy = {version = "^1.26.0", python = ">=3.9,<3.13"}
scipy = {version = "^1.11.3", python = ">=3.9,<3.13"}
statsmodels = {version = "^0.14.0", python = ">=3.8,<4.0"}
matplotlib = {version = "^3.8.0", python = ">=3.9,<4.0"}
seaborn = {version = "^0.13.0", python = ">=3.8,<4.0"}
[tool.poetry.dev-dependencies]
pytest = "^5.2"
......
from typing import Optional
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.stats import shapiro, normaltest, kstest, anderson, chisquare, jarque_bera
from scipy.stats import shapiro, normaltest, kstest, anderson, chisquare, jarque_bera, mannwhitneyu, wilcoxon, kruskal, ttest_ind, ttest_rel, f_oneway
from statsmodels.stats.diagnostic import lilliefors
class Test():
#https://towardsdatascience.com/normality-tests-in-python-31e04aa4f411
def __init__(self,normalityTest=["Shapiro","D'Agostino","Anderson-Darling","Chi-Square","Lilliefors","Jarque–Bera","Kolmogorov-Smirnov"], parametricTest=["Students t-test","Paired Students t-Test", "ANOVA"], nonParametricTest=["mannwhitneyu","wilcoxon","kruskal"],alpha=0.05):
    """Store which statistical tests to run and the significance level.

    Args:
        normalityTest: Names of the normality tests to apply.
        parametricTest: Names of the parametric significance tests to apply.
        nonParametricTest: Names of the non-parametric significance tests
            to apply.
        alpha: Significance threshold that p-values are compared against.
    """
    # Copy the lists: the defaults are created once at definition time, so
    # storing them directly would let one instance's edits leak into every
    # other instance that relies on the defaults.
    self.normalityTest = list(normalityTest)
    self.parametricTest = list(parametricTest) #https://machinelearningmastery.com/parametric-statistical-significance-tests-in-python/
    self.nonParametricTest = list(nonParametricTest) #https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
    self.alpha = alpha
def apply(self,df1,df2):
......@@ -79,8 +81,84 @@ class Test():
for i in range(len(crit_val)):
print("Pass the test of "+t)
print(list(testFinal[testFinal[t+' stat'] < crit_val[i]].index),"at "+str(sig_level[i])+" level of significance")
dicResult[t+' '+sig_level[i]+' sig_lev'] = list(testFinal[testFinal[t+' p-value'] > 0.05].index)
dicResult[t+' '+str(sig_level[i])+' sig_lev'] = list(testFinal[testFinal[t+' stat'] < crit_val[i]].index)
return testFinal, dicResult
\ No newline at end of file
def applyNonParametricTest(self, df1, df2, criteriaColumn1,criteriaColumn2, contrastCriteriaColumns): #https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
columnsDF=['Feature', 'Criteria_1', 'Criteria_2']
dicResult = {}
for npt in self.nonParametricTest:
columnsDF.extend([npt+" stat", npt+" p-value"])
dfResult = pd.DataFrame(columns=columnsDF)
dicResult[npt] = {"Reject H0":[],"Fail to Reject H0": []} #Reject == Different distribution, Fail to Reject == Same distribution
for col in contrastCriteriaColumns:
row=[col, criteriaColumn1,criteriaColumn2]
if "mannwhitneyu" in self.nonParametricTest:
stat_mw, p_value_mw = mannwhitneyu(df1[col], df2[col])
if p_value_mw > self.alpha:
dicResult["mannwhitneyu"]['Fail to Reject H0'].append(col)
else:
dicResult["mannwhitneyu"]["Reject H0"].append(col)
row.extend([stat_mw, p_value_mw])
if "wilcoxon" in self.nonParametricTest:
if len(df1) == len(df2[col]):
stat_wc, p_value_w = wilcoxon(df1[col], df2[col])
if p_value_w > self.alpha:
dicResult["wilcoxon"]['Fail to Reject H0'].append(col)
else:
dicResult["wilcoxon"]["Reject H0"].append(col)
row.extend([stat_wc, p_value_w])
else:
row.extend([np.nan, np.nan])
if "kruskal" in self.nonParametricTest:
stat_k, p_value_k = kruskal(df1[col], df2[col])
if p_value_k > self.alpha:
dicResult["kruskal"]['Fail to Reject H0'].append(col)
else:
dicResult["kruskal"]["Reject H0"].append(col)
row.extend([stat_k, p_value_k])
dfResult = dfResult._append(pd.Series(row,index=dfResult.columns), ignore_index = True)
print(dfResult)
print(dicResult)
return dfResult, dicResult
def applyParametricTest(self, df1, df2, criteriaColumn1,criteriaColumn2, contrastCriteriaColumns): #https://machinelearningmastery.com/parametric-statistical-significance-tests-in-python/
columnsDF=['Feature', 'Criteria_1', 'Criteria_2']
dicResult = {}
for pt in self.parametricTest:
columnsDF.extend([pt+" stat", pt+" p-value"])
dfResult = pd.DataFrame(columns=columnsDF)
dicResult[pt] = {"Reject H0":[],"Fail to Reject H0": []} #Reject == Different distribution, Fail to Reject == Same distribution
for col in contrastCriteriaColumns:
row=[col, criteriaColumn1,criteriaColumn2]
if "Students t-test" in self.parametricTest:
stat_ttestInd, p_value_ttestInd = mannwhitneyu(df1[col], df2[col])
if p_value_ttestInd > self.alpha:
dicResult["Students t-test"]['Fail to Reject H0'].append(col)
else:
dicResult["Students t-test"]["Reject H0"].append(col)
row.extend([stat_ttestInd, p_value_ttestInd])
if "Paired Students t-Test" in self.parametricTest:
if len(df1) == len(df2[col]):
stat_ttestRel, p_value_ttestRel = ttest_rel(df1[col], df2[col])
if p_value_ttestRel > self.alpha:
dicResult["Paired Students t-Test"]['Fail to Reject H0'].append(col)
else:
dicResult["Paired Students t-Test"]["Reject H0"].append(col)
row.extend([stat_ttestRel, p_value_ttestRel])
else:
row.extend([np.nan, np.nan])
if "ANOVA" in self.parametricTest:
stat_anova, p_value_anova = f_oneway(df1[col], df2[col])
if p_value_anova > self.alpha:
dicResult["ANOVA"]['Fail to Reject H0'].append(col)
else:
dicResult["ANOVA"]["Reject H0"].append(col)
row.extend([stat_anova, p_value_anova])
dfResult = dfResult._append(pd.Series(row,index=dfResult.columns), ignore_index = True)
print(dfResult)
print(dicResult)
return dfResult, dicResult
\ No newline at end of file
from typing import Optional
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
import seaborn as sns
import pylab
import scipy.stats as stats
import matplotlib.pyplot as plt
from numpy import linalg as LA
import math
class Visualization():
    """Draw grids of exploratory plots, one subplot per DataFrame column.

    Every ``show_*`` method lays its subplots out on a grid that is
    ``n_cols`` wide, optionally saves the figure under ``savePath`` and then
    displays it with ``plt.show()``.
    """

    def __init__(self, savePath, n_cols=6, cmap=sns.diverging_palette(230, 20, as_cmap=True)):
        """Store the layout settings and normalise the output directory.

        Args:
            savePath: Directory prefix for saved figures. A trailing path
                separator is appended when it is missing.
            n_cols: Number of subplot columns in every grid.
            cmap: Colour map kept on the instance as ``self.cmap``.
        """
        import os  # local import: only needed to pick a default separator

        self.n_cols = n_cols
        self.cmap = cmap
        if not savePath or savePath.endswith(("/", "\\")):
            # Empty prefix or already terminated: use as given.
            self.savePath = savePath
        elif "/" in savePath:
            self.savePath = savePath + "/"
        elif "\\" in savePath:
            # Windows-style path: terminate with a backslash (the previous
            # code compared against and appended a double-quote character).
            self.savePath = savePath + "\\"
        else:
            # No separator present to imitate; use the platform default.
            self.savePath = savePath + os.sep

    def _make_grid(self, n_items, figsize):
        """Create a figure whose axes grid is wide enough for ``n_items`` plots."""
        n_rows = max(1, math.ceil(n_items / self.n_cols))
        # squeeze=False keeps ``axs`` two-dimensional even for a single row
        # or column, so ``axs[row, col]`` indexing always works.
        return plt.subplots(nrows=n_rows, ncols=self.n_cols, figsize=figsize,
                            sharey=False, squeeze=False)

    def _axis_for(self, axs, position):
        """Return the axes at flat ``position`` of the row-major grid."""
        return axs[position // self.n_cols, position % self.n_cols]

    def _finish(self, savePicture, pictureName):
        """Tighten the layout, optionally save the figure, then show it."""
        plt.tight_layout()
        if savePicture:
            if pictureName is None:
                raise TypeError("pictureName must be provided when savePicture is True")
            plt.savefig(self.savePath + pictureName)
        plt.show()

    def show_distplots(self, df, columns, savePicture = False, pictureName= None): #columns == numeric_cols
        """Plot a density histogram with a KDE curve for each column.

        Args:
            df: Data source; indexed by column name.
            columns: Names of the columns to plot.
            savePicture: When true, save the figure before showing it.
            pictureName: File name appended to ``self.savePath`` when saving.
        """
        fig, axs = self._make_grid(len(columns), (30, 50))
        for i, target in enumerate(columns):
            ax = self._axis_for(axs, i)
            sns.histplot(df[target], kde=True, stat="density", kde_kws=dict(cut=3), alpha=.4, edgecolor=(1, 1, 1, .4), ax=ax)
            ax.set_title(target)
        self._finish(savePicture, pictureName)

    def show_probplots(self, df, columns, savePicture = False, pictureName= None): #columns == numeric_cols
        """Plot a probability plot against the normal distribution for each column.

        Args:
            df: Data source; indexed by column name.
            columns: Names of the columns to plot.
            savePicture: When true, save the figure before showing it.
            pictureName: File name appended to ``self.savePath`` when saving.
        """
        fig, axs = self._make_grid(len(columns), (30, 50))
        for i, target in enumerate(columns):
            ax = self._axis_for(axs, i)
            stats.probplot(df[target], dist="norm", plot=ax)
            ax.set_title(target)
        self._finish(savePicture, pictureName)

    def show_kde(self, df, columns, group, savePicture = False, pictureName= None):
        """Plot one KDE curve per distinct ``group`` value for each column.

        Args:
            df: Data source; indexed by column name.
            columns: Names of the columns to plot.
            group: Name of the column whose distinct values split the data.
            savePicture: When true, save the figure before showing it.
            pictureName: File name appended to ``self.savePath`` when saving.
        """
        group_unique_values = list(df[group].unique())
        fig, axs = self._make_grid(len(columns), (30, 50))
        for i, c in enumerate(columns):
            ax = self._axis_for(axs, i)
            for value in group_unique_values:
                try:
                    df[df[group] == value][c].plot.kde(alpha=1.0/len(group_unique_values), label=value, ax=ax)
                except LA.LinAlgError:
                    # Best effort: a group whose density estimate fails is
                    # left out of the plot instead of aborting the figure.
                    continue
            ax.set_title(c)
            ax.legend(loc='upper right')
        self._finish(savePicture, pictureName)

    def show_boxplot(self, df, columns, x, hue= None, savePicture= False, pictureName=None):
        """Plot a strip plot overlaid with a box plot (means marked) for each column.

        Args:
            df: Data source passed to seaborn as ``data``.
            columns: Names of the columns used as the y variable.
            x: Name of the column used as the x variable.
            hue: Optional column name for colour grouping.
            savePicture: When true, save the figure before showing it.
            pictureName: File name appended to ``self.savePath`` when saving.
        """
        mean_marker = dict(marker='D', markeredgecolor='black', markerfacecolor='firebrick')
        fig, axs = self._make_grid(len(columns), (30, 60))
        for i, target in enumerate(columns):
            ax = self._axis_for(axs, i)
            if hue is not None:
                sns.stripplot(ax=ax, x=x, y=target, hue=hue, data=df, color='black', size=10, alpha=0.3)
                sns.boxplot(ax=ax, x=x, y=target, hue=hue, data=df, showmeans=True,
                            meanprops=mean_marker)
            else:
                sns.stripplot(ax=ax, x=x, y=target, data=df, color='black', size=10, alpha=0.3)
                sns.boxplot(ax=ax, x=x, y=target, data=df, showmeans=True,
                            meanprops=mean_marker)
        self._finish(savePicture, pictureName)

    def show_wordCloud(self):
        """Placeholder; not implemented yet."""
        pass
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment