new data visualization functions added

8b6a0f7b · Estrella Vallecillo · 329ef996 · 8b6a0f7b · 8b6a0f7b · 8b6a0f7b
Commit 8b6a0f7b authored Oct 12, 2023 by Estrella Vallecillo
Showing with 182 additions and 9 deletions
.gitignore
poetry.lock
pyproject.toml
textflow/Test.py
textflow/Visualization.py
--- a/.gitignore
+++ b/.gitignore
@@ -158,4 +158,4 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-./Examples/df_trans_merged_textflow.csv
+Examples/df_trans_merged_textflow.csv
\ No newline at end of file
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,6 +16,8 @@ pandas = {version = "^2.1.1", python = ">=3.9,<4.0"}
 numpy = {version = "^1.26.0", python = ">=3.9,<3.13"}
 scipy = {version = "^1.11.3", python = ">=3.9,<3.13"}
 statsmodels = {version = "^0.14.0", python = ">=3.8,<4.0"}
+matplotlib = {version = "^3.8.0", python = ">=3.9,<4.0"}
+seaborn = {version = "^0.13.0", python = ">=3.8,<4.0"}
 [tool.poetry.dev-dependencies]
 pytest = "^5.2"

--- a/textflow/Test.py
+++ b/textflow/Test.py
 from typing import Optional
 from abc import ABC, abstractmethod
 import pandas as pd
+import numpy as np
 from collections import defaultdict
-from scipy.stats import shapiro, normaltest, kstest, anderson, chisquare, jarque_bera
+from scipy.stats import shapiro, normaltest, kstest, anderson, chisquare, jarque_bera, mannwhitneyu, wilcoxon, kruskal, ttest_ind, ttest_rel, f_oneway 
 from statsmodels.stats.diagnostic import lilliefors
 class Test():
    #https://towardsdatascience.com/normality-tests-in-python-31e04aa4f411
-    def __init__(self,normalityTest=["Shapiro","D'Agostino","Anderson-Darling","Chi-Square","Lilliefors","Jarque–Bera","Kolmogorov-Smirnov"], parametricTest=["mannwhitneyu","wilcoxon","kruskal"], nonParametricTest=["mannwhitneyu","wilcoxon","kruskal"]):
+    def __init__(self,normalityTest=["Shapiro","D'Agostino","Anderson-Darling","Chi-Square","Lilliefors","Jarque–Bera","Kolmogorov-Smirnov"], parametricTest=["Students t-test","Paired Students t-Test", "ANOVA"], nonParametricTest=["mannwhitneyu","wilcoxon","kruskal"],alpha=0.05):
        self.normalityTest = normalityTest
-        self.parametricTest = parametricTest
+        self.parametricTest = parametricTest #https://machinelearningmastery.com/parametric-statistical-significance-tests-in-python/
-        self.nonParametricTest = nonParametricTest
+        self.nonParametricTest = nonParametricTest #https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
+        self.alpha = alpha
    def apply(self,df1,df2):
@@ -79,8 +81,84 @@ class Test():
                for i in range(len(crit_val)):
                    print("Pass the test of "+t)
                    print(list(testFinal[testFinal[t+' stat'] < crit_val[i]].index),"at "+str(sig_level[i])+" level of significance")
-                    dicResult[t+' '+sig_level[i]+' sig_lev'] = list(testFinal[testFinal[t+' p-value'] > 0.05].index)
+                    dicResult[t+' '+str(sig_level[i])+' sig_lev'] = list(testFinal[testFinal[t+' stat'] < crit_val[i]].index)
        return testFinal, dicResult        
+    def applyNonParametricTest(self, df1, df2, criteriaColumn1,criteriaColumn2, contrastCriteriaColumns): #https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
\ No newline at end of file
+        """Run the configured non-parametric tests on every contrast column.
+
+        For each name in ``contrastCriteriaColumns`` the samples ``df1[col]``
+        and ``df2[col]`` are compared with each test listed in
+        ``self.nonParametricTest`` (supported names: "mannwhitneyu",
+        "wilcoxon", "kruskal").
+
+        Args:
+            df1: First sample set, indexed by column name
+                (presumably a pandas DataFrame; confirm against callers).
+            df2: Second sample set, indexed the same way as ``df1``.
+            criteriaColumn1: Label copied verbatim into the 'Criteria_1' column.
+            criteriaColumn2: Label copied verbatim into the 'Criteria_2' column.
+            contrastCriteriaColumns: Iterable of column names to compare.
+
+        Returns:
+            A tuple ``(dfResult, dicResult)``. ``dfResult`` has one row per
+            contrast column with '<test> stat' and '<test> p-value' columns
+            for each configured test. ``dicResult`` maps each test name to
+            the columns where H0 was rejected or not at ``self.alpha``.
+
+        Raises:
+            ValueError: If ``self.nonParametricTest`` names an unsupported test.
+        """
+        testFunctions = {"mannwhitneyu": mannwhitneyu, "wilcoxon": wilcoxon, "kruskal": kruskal}
+        # Paired tests need samples of equal length; otherwise NaN is reported.
+        pairedTests = {"wilcoxon"}
+        columnsDF = ['Feature', 'Criteria_1', 'Criteria_2']
+        dicResult = {}
+        for npt in self.nonParametricTest:
+            if npt not in testFunctions:
+                raise ValueError("Unsupported non-parametric test: " + str(npt))
+            columnsDF.extend([npt+" stat", npt+" p-value"])
+            dicResult[npt] = {"Reject H0":[],"Fail to Reject H0": []} #Reject == Different distribution, Fail to Reject == Same distribution
+        rows = []
+        for col in contrastCriteriaColumns:
+            row = [col, criteriaColumn1, criteriaColumn2]
+            # Iterate in the configured order so values line up with columnsDF.
+            for npt in self.nonParametricTest:
+                if npt in pairedTests and len(df1[col]) != len(df2[col]):
+                    row.extend([np.nan, np.nan])
+                    continue
+                stat, p_value = testFunctions[npt](df1[col], df2[col])
+                if p_value > self.alpha:
+                    dicResult[npt]['Fail to Reject H0'].append(col)
+                else:
+                    dicResult[npt]["Reject H0"].append(col)
+                row.extend([stat, p_value])
+            rows.append(row)
+        # Build the frame once; this also works when no test or column is given.
+        dfResult = pd.DataFrame(rows, columns=columnsDF)
+        print(dfResult)
+        print(dicResult)
+        return dfResult, dicResult
+    def applyParametricTest(self, df1, df2, criteriaColumn1,criteriaColumn2, contrastCriteriaColumns): #https://machinelearningmastery.com/parametric-statistical-significance-tests-in-python/
+        """Run the configured parametric tests on every contrast column.
+
+        For each name in ``contrastCriteriaColumns`` the samples ``df1[col]``
+        and ``df2[col]`` are compared with each test listed in
+        ``self.parametricTest`` (supported names: "Students t-test",
+        "Paired Students t-Test", "ANOVA").
+
+        Args:
+            df1: First sample set, indexed by column name
+                (presumably a pandas DataFrame; confirm against callers).
+            df2: Second sample set, indexed the same way as ``df1``.
+            criteriaColumn1: Label copied verbatim into the 'Criteria_1' column.
+            criteriaColumn2: Label copied verbatim into the 'Criteria_2' column.
+            contrastCriteriaColumns: Iterable of column names to compare.
+
+        Returns:
+            A tuple ``(dfResult, dicResult)``. ``dfResult`` has one row per
+            contrast column with '<test> stat' and '<test> p-value' columns
+            for each configured test. ``dicResult`` maps each test name to
+            the columns where H0 was rejected or not at ``self.alpha``.
+
+        Raises:
+            ValueError: If ``self.parametricTest`` names an unsupported test.
+        """
+        testFunctions = {
+            # Independent-samples t-test (previously computed with mannwhitneyu by mistake).
+            "Students t-test": ttest_ind,
+            "Paired Students t-Test": ttest_rel,
+            "ANOVA": f_oneway,
+        }
+        # Paired tests need samples of equal length; otherwise NaN is reported.
+        pairedTests = {"Paired Students t-Test"}
+        columnsDF = ['Feature', 'Criteria_1', 'Criteria_2']
+        dicResult = {}
+        for pt in self.parametricTest:
+            if pt not in testFunctions:
+                raise ValueError("Unsupported parametric test: " + str(pt))
+            columnsDF.extend([pt+" stat", pt+" p-value"])
+            dicResult[pt] = {"Reject H0":[],"Fail to Reject H0": []} #Reject == Different distribution, Fail to Reject == Same distribution
+        rows = []
+        for col in contrastCriteriaColumns:
+            row = [col, criteriaColumn1, criteriaColumn2]
+            # Iterate in the configured order so values line up with columnsDF.
+            for pt in self.parametricTest:
+                if pt in pairedTests and len(df1[col]) != len(df2[col]):
+                    row.extend([np.nan, np.nan])
+                    continue
+                stat, p_value = testFunctions[pt](df1[col], df2[col])
+                if p_value > self.alpha:
+                    dicResult[pt]['Fail to Reject H0'].append(col)
+                else:
+                    dicResult[pt]["Reject H0"].append(col)
+                row.extend([stat, p_value])
+            rows.append(row)
+        # Build the frame once; this also works when no test or column is given.
+        dfResult = pd.DataFrame(rows, columns=columnsDF)
+        print(dfResult)
+        print(dicResult)
+        return dfResult, dicResult
\ No newline at end of file
--- a/textflow/Visualization.py
+++ b/textflow/Visualization.py
+from typing import Optional
+from abc import ABC, abstractmethod
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import pylab
+import scipy.stats as stats
+import matplotlib.pyplot as plt
+from numpy import linalg as LA
+import math
+class Visualization():
+    """Draw grids of exploratory plots, one subplot per DataFrame column.
+
+    Every ``show_*`` method lays its subplots out on a grid that is
+    ``n_cols`` wide, optionally saves the figure under ``savePath`` and then
+    displays it with ``plt.show()``.
+    """
+    def __init__(self, savePath, n_cols=6, cmap=sns.diverging_palette(230, 20, as_cmap=True)):
+        """Store the grid width, the colour map and the normalised save directory.
+
+        Args:
+            savePath: Directory prefix for saved pictures. A trailing path
+                separator is appended when missing; an empty string is kept
+                as is, so pictures are written relative to the working directory.
+            n_cols: Number of subplot columns in every grid.
+            cmap: Colour map kept on the instance as ``self.cmap``.
+        """
+        self.n_cols=n_cols
+        self.cmap = cmap
+        if savePath == "" or savePath[-1] in ('/', '\\'):
+            self.savePath = savePath
+        elif '\\' in savePath and '/' not in savePath:
+            # Windows-style path: keep using its own separator.
+            self.savePath = savePath+'\\'
+        else:
+            self.savePath = savePath+"/"
+    def _make_grid(self, n_plots, figsize):
+        """Create a figure whose axes array is always 2-D (rows x n_cols)."""
+        nrows = max(1, math.ceil(n_plots/self.n_cols))
+        # squeeze=False keeps axs two-dimensional even for a single row or column.
+        fig, axs = plt.subplots(nrows=nrows, ncols=self.n_cols, figsize=figsize, sharey=False, squeeze=False)
+        return fig, axs
+    def _finish(self, savePicture, pictureName):
+        """Tighten the layout, optionally save the current figure, then show it."""
+        plt.tight_layout()
+        if savePicture:
+            if pictureName is None:
+                raise ValueError("pictureName is required when savePicture is True")
+            plt.savefig(self.savePath+pictureName)
+        plt.show()
+    def show_distplots(self, df, columns, savePicture = False, pictureName= None): #columns == numeric_cols
+        """Plot a density histogram with a KDE overlay for each column.
+
+        Args:
+            df: Data source indexed by column name.
+            columns: Column names to plot, one subplot each.
+            savePicture: When true, save the figure as ``savePath + pictureName``.
+            pictureName: File name used when ``savePicture`` is true.
+        """
+        ncols = self.n_cols
+        fig, axs = self._make_grid(len(columns), (30, 50))
+        for i, target in enumerate(columns):
+            ax = axs[i // ncols, i % ncols]
+            sns.histplot(df[target], kde=True, stat="density", kde_kws=dict(cut=3), alpha=.4, edgecolor=(1, 1, 1, .4), ax=ax)
+            ax.set_title(target)
+        self._finish(savePicture, pictureName)
+    def show_probplots(self, df, columns, savePicture = False, pictureName= None): #columns == numeric_cols
+        """Plot a normal probability plot for each column.
+
+        Args:
+            df: Data source indexed by column name.
+            columns: Column names to plot, one subplot each.
+            savePicture: When true, save the figure as ``savePath + pictureName``.
+            pictureName: File name used when ``savePicture`` is true.
+        """
+        ncols = self.n_cols
+        fig, axs = self._make_grid(len(columns), (30, 50))
+        for i, target in enumerate(columns):
+            ax = axs[i // ncols, i % ncols]
+            stats.probplot(df[target], dist="norm", plot=ax)
+            ax.set_title(target)
+        self._finish(savePicture, pictureName)
+    def show_kde(self, df, columns, group, savePicture = False, pictureName= None):
+        """Plot, for each column, one KDE curve per distinct value of ``group``.
+
+        Args:
+            df: Data source indexed by column name.
+            columns: Column names to plot, one subplot each.
+            group: Name of the column whose unique values split the data.
+            savePicture: When true, save the figure as ``savePath + pictureName``.
+            pictureName: File name used when ``savePicture`` is true.
+        """
+        ncols = self.n_cols
+        group_unique_values = list(df[group].unique())
+        fig, axs = self._make_grid(len(columns), (30, 50))
+        for i, c in enumerate(columns):
+            ax = axs[i // ncols, i % ncols]
+            for value in group_unique_values:
+                try:
+                    df.loc[df[group] == value, c].plot.kde(alpha=1.0/len(group_unique_values), label=value, ax=ax)
+                except LA.LinAlgError:
+                    # Best effort: a group whose density cannot be estimated is left out.
+                    pass
+            ax.set_title(c)
+            ax.legend(loc='upper right')
+        self._finish(savePicture, pictureName)
+    def show_boxplot(self, df, columns, x, hue= None, savePicture= False, pictureName=None):
+        """Plot a box plot with the raw points overlaid for each column.
+
+        Args:
+            df: Data source passed to seaborn as ``data``.
+            columns: Column names plotted on the y axis, one subplot each.
+            x: Column name used for the x axis.
+            hue: Optional column name used to split each box by colour.
+            savePicture: When true, save the figure as ``savePath + pictureName``.
+            pictureName: File name used when ``savePicture`` is true.
+        """
+        ncols = self.n_cols
+        fig, axs = self._make_grid(len(columns), (30, 60))
+        # Only forward hue when it was given, as the original two branches did.
+        hue_kwargs = {} if hue is None else {"hue": hue}
+        for i, target in enumerate(columns):
+            ax = axs[i // ncols, i % ncols]
+            sns.stripplot(ax=ax, x=x, y=target, data=df, color='black', size=10, alpha=0.3, **hue_kwargs)
+            sns.boxplot(ax=ax, x=x, y=target, data=df, showmeans=True,
+                        meanprops=dict(marker='D', markeredgecolor='black', markerfacecolor='firebrick'), **hue_kwargs)
+        self._finish(savePicture, pictureName)
+    def show_wordCloud(self):
+        """Placeholder: word-cloud plotting is not implemented yet."""
+        pass
\ No newline at end of file