Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Alba Maria Mármol
/
TextAnalysisSpacy
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Settings
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
You need to sign in or sign up before continuing.
Commit
de47e349
authored
Feb 15, 2022
by
Alba Maria Mármol
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
update example
parent
0c1e24b8
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
1 additions
and
2 deletions
example.ipynb
example.ipynb
View file @
de47e349
{"cells":[{"cell_type":"markdown","metadata":{"id":"tgTuDdZIH7og"},"source":["# Librerías"]},{"cell_type":"code","execution_count":85,"metadata":{"id":"ydZHklMLF2g9","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1644412293290,"user_tz":-60,"elapsed":9817,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}},"outputId":"36f2c877-04f4-4e87-886d-3608f0757c0e"},"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: lexical-diversity in /usr/local/lib/python3.7/dist-packages (0.1.1)\n","Requirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4)\n","Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0)\n","Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.62.3)\n","Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1)\n","Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.0)\n","Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5)\n","Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0)\n","Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0)\n","Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3)\n","Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.19.5)\n","Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0)\n","Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6)\n","Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6)\n","Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.6)\n","Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.10.1)\n","Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.10.0.2)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.7.0)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n","Requirement already satisfied: syllables in /usr/local/lib/python3.7/dist-packages (1.0.3)\n"]}],"source":["!pip install lexical-diversity # Lexical Diversity\n","!pip install spacy # Natural Language Processing\n","!pip install syllables # Syllable counter for english"]},{"cell_type":"code","source":["crea_total_path = './CREA_total.txt'"],"metadata":{"id":"F8_bf3LFYobW","executionInfo":{"status":"ok","timestamp":1644412293291,"user_tz":-60,"elapsed":26,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}}},"execution_count":86,"outputs":[]},{"cell_type":"markdown","source":["# TextComplexitySpacy"],"metadata":{"id":"iyyf3urz3ljW"}},{"cell_type":"code","source":["from functools import reduce\n","import math\n","import syllables\n","class TextComplexitySpacy():\n"," \n"," def __init__(self, lang='es'):\n"," self.lang = lang\n","\n"," # create language analyzer\n"," if lang == 'es':\n"," self.nlp = es_core_news_sm.load()\n"," if lang == 'en':\n"," self.nlp = spacy.load(\"en_core_web_sm\")\n","\n"," # Para leer el texto que introducimos\n"," f = open(crea_total_path)\n"," lines = f.readlines()\n"," f.close()\n"," crea = {}\n"," for l in lines[1:1000]: # those words not in the 1000 most frequent words in CREA are low frequency words\n"," data = l.strip().split()\n"," crea[data[1]] = float(data[2].replace(',', ''))\n"," self.crea = crea\n"," pass\n","\n"," def textProcessing(self, text):\n"," # Meter todas las funciones en una patron de los tokens válidos\n"," doc = self.nlp(text)\n"," self.tokens = [w for w in doc]\n"," self.sentences = [sent for sent in doc.sents]\n"," self.nsentences = len(self.sentences)\n"," \n"," '''\n"," Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos\n"," '''\n"," pos_content_sentences = []\n"," for sentence in self.sentences:\n"," ws = self.nlp(sentence.text)\n"," pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])\n"," self.pos_content_sentences = pos_content_sentences \n"," \n"," return self.pos_content_sentences\n","\n"," \n"," def punctuationMarks(self):\n"," # Solo nos interesa contar los tokens que sean signo de puntuación.\n"," # Number of words.\n"," punctuation = []\n"," N_words = []\n"," for w in self.tokens:\n"," if re.match('PUNCT.*', w.pos_):\n"," punctuation.append(w.text)\n"," else:\n"," N_words.append(w.text)\n","\n"," aux = len(N_words) \n"," if aux == 0:\n"," aux = 1\n"," self.N_words = aux\n"," \n"," self.npunctuation = len(punctuation)\n"," self.punctuation = punctuation\n"," \n"," return self.npunctuation, self.punctuation, self.N_words\n","\n"," def lexicalComplexity(self):\n"," # Number of low frequency words \n"," count = 0\n"," for sentence in self.pos_content_sentences:\n"," for w in sentence:\n"," if w.text not in self.crea:\n"," count+=1\n"," N_lfw = count\n"," self.N_lfw = N_lfw\n","\n"," # Number of distinct content words \n"," N_dcw = len(set([w.text.lower() for s in self.pos_content_sentences for w in s]))\n"," self.N_dcw =N_dcw\n","\n"," # Number of sentences\n"," self.N_s = len(self.pos_content_sentences)\n","\n"," # Number of total content words\n"," N_cw = reduce((lambda x, y: x + y), [len(s) for s in self.pos_content_sentences])\n"," self.N_cw = N_cw\n","\n"," # Lexical Distribution Index\n"," if self.N_s == 0:\n"," self.N_s = 1\n"," LDI = N_dcw / float(self.N_s)\n"," self.LDI = LDI\n"," \n"," # Index of Low Frequency Words\n"," if N_cw == 0:\n"," N_cw = 1\n"," ILFW = N_lfw / float(N_cw)\n"," self.ILFW =ILFW\n","\n"," # Lexical Complexity\n"," LC = (LDI + ILFW) / 2\n"," self.LC = LC\n"," \n"," return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC\n","\n"," def ssReadability(self): \n"," #Number of rare words\n"," byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)\n"," byfreq = byfreq[:1500]\n"," count = 0\n"," for sentence in self.pos_content_sentences:\n"," for w in sentence:\n"," if w.text.lower() not in byfreq:\n"," count +=1\n"," N_rw = count\n"," self.N_rw = N_rw\n"," \n"," SSR = 1.609*(self.N_words / self.N_s) + 331.8* (self.N_rw /self.N_words) + 22.0 \n"," self.SSR= SSR\n"," \n"," return self.N_rw, self.SSR\n","\n"," def sentenceComplexity(self):\n"," #Number of complex sentences\n"," N_cs = 0\n"," for sentence in self.sentences:\n"," previous_is_verb = False\n"," count = 0\n"," for w in sentence:\n"," if re.match('VERB.*', w.pos_):\n"," if (previous_is_verb):\n"," count += 1\n"," previous_is_verb = False\n"," else:\n"," previous_is_verb = True\n"," else:\n"," previous_is_verb = False\n"," if count>0:\n"," N_cs += 1 \n"," self.N_cs = N_cs\n"," \n"," ASL = self.N_words / self.N_s\n"," self.ASL = ASL\n"," \n"," CS = self.N_cs / self.N_s\n"," self.CS = CS\n"," \n"," SCI = (ASL + CS)/ 2\n"," self.SCI = SCI\n"," \n"," return self.N_cs, self.ASL, self.CS, self.SCI\n","\n"," def autoReadability(self):\n"," # Number of characters\n"," count = 0\n"," listwords = []\n"," for words in self.sentences:\n"," for w in words:\n"," if re.match('\\r\\n.*', w.text):\n"," count +=1\n"," else:\n"," listwords.append(w.text)\n"," \n"," self.listwords = listwords \n"," N_charac = 0\n"," for characters in self.listwords:\n"," N_charac += len(characters)\n"," \n"," self.N_charac = N_charac\n"," \n"," ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words/ self.N_s - 21.43\n"," self.ARI = ARI\n"," \n"," return self.N_charac, self.ARI, self.listwords\n","\n"," \n"," def tree_height(self,root, cont):\n"," if not list(root.children):\n"," return 1\n"," else:\n"," cont+=1\n"," if cont == 320:\n"," return 320\n"," return 1 + max(self.tree_height(x, cont) for x in root.children)\n","\n"," def embeddingDepth(self):\n"," ## Output results\n"," roots = [sent.root for sent in self.sentences]\n"," max_list = []\n"," max_list = [self.tree_height(root,0) for root in roots]\n"," mean_max_list = sum(max_list)/(len(max_list))\n"," max_max_list = max(max_list)\n"," min_max_list = min(max_list)\n"," \n"," self.max_max_list = max_max_list\n"," self.min_max_list = min_max_list\n"," self.mean_max_list = mean_max_list\n"," \n"," return self.max_max_list, self.min_max_list, self.mean_max_list\n","\n"," def syllable_counter_spanish(self,text):\n"," if self.lang == 'es':\n"," t = re.sub(r'y([aáeéiíoóuú])', '\\\\1', text.lower())\n"," t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())\n"," t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()\n"," t = re.sub(r'[aáeéiíoóuúy]', 'A', t)\n"," return(len(t.split('A'))-1)\n","\n"," elif self.lang == 'en':\n"," return syllables.estimate(text)\n","\n"," def readability(self):\n"," # Number of syllables and Number of words with 3 or more syllables:tagger\n"," n_syllables = 0\n"," n_syllables3 = 0\n"," for words in self.listwords:\n"," syllables = self.syllable_counter_spanish(words)\n"," n_syllables += syllables\n"," if syllables>=3:\n"," n_syllables3 += 1\n"," \n"," self.n_syllables = n_syllables\n"," self.n_syllables3 = n_syllables3\n"," \n"," # Number of letters\n"," nletters= 0\n"," letters = []\n"," vecletters =[]\n"," for word in self.listwords:\n"," if re.match('[a-zA-Z]|á|ó|í|ú|é', word):\n"," letters.append(word)\n"," nletters+=len(word)\n"," vecletters.append(len(word))\n"," \n"," self.letters = letters\n"," self.nletters = nletters\n"," self.vecletters= vecletters\n"," \n"," huertareadability = 206.835 - 60 * (self.n_syllables / self.N_words) - 102 * (self.nsentences / self.N_words)\n"," self.huertareadability = huertareadability\n"," \n"," ifszreadability = 206.835 - 62.3 * (self.n_syllables / self.N_words) - (self.N_words / self.nsentences) \n"," self.ifszreadability = ifszreadability\n"," \n"," self.syll_words = self.n_syllables / self.N_words\n"," \n"," \n"," polinicompressibility = 95.2 - 9.7 * (self.nletters / self.N_words) - 0.35 * (self.N_words / self.nsentences) \n"," self.polinicompressibility = polinicompressibility\n"," \n"," self.words_sen = self.N_words / self.nsentences\n"," \n"," x=self.nletters / self.N_words\n"," varianza=np.var(self.vecletters)\n"," if varianza == 0:\n"," varianza =1\n"," aux = self.N_words-1\n"," if aux == 0:\n"," aux=1\n"," mureadability = (self.N_words /aux)*(x/varianza)*100\n"," self.mureadability = mureadability\n"," \n"," return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen\n"," \n"," def ageReadability(self):\n"," \n"," minimumage = 0.2495 *(self.N_words/self.nsentences) + 6.4763 * (self.n_syllables /self.N_words) - 7.1395\n"," self.minimumage = minimumage\n"," \n"," solreadability= -2.51+0.74*(3.1291+1.0430*math.sqrt(self.n_syllables3*(30/self.nsentences)))\n"," self.solreadability = solreadability\n"," \n"," return self.minimumage, self.solreadability\n"," \n"," def yearsCrawford(self):\n"," \n"," years = -20.5 *(self.nsentences/self.N_words) + 4.9 * (self.n_syllables /self.N_words) - 3.407\n"," self.years = years\n"," \n"," return self.years"],"metadata":{"id":"fbsCkEC8vBLg","executionInfo":{"status":"ok","timestamp":1644412294091,"user_tz":-60,"elapsed":824,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}}},"execution_count":87,"outputs":[]},{"cell_type":"markdown","source":["# TextAnalisisSpacy"],"metadata":{"id":"GQm029q73tRQ"}},{"cell_type":"code","source":["# Only required for analysis in Spanish\n","import spacy.cli \n","spacy.cli.download(\"es_core_news_sm\")\n","import es_core_news_sm\n","\n","# Imports\n","import spacy\n","import numpy as np\n","from tqdm import tqdm\n","import re\n","import pandas as pd\n","\n","import matplotlib.pyplot as plt\n","%matplotlib inline\n","import seaborn as sb\n","import nltk\n","from nltk.probability import FreqDist\n","from nltk.text import Text\n","from lexical_diversity import lex_div as ld\n","\n","class TextAnalisisSpacy():\n"," \n"," def __init__(self, lang='es'):\n","\n"," # Create language analyzer\n"," if lang == 'es':\n"," self.nlp = es_core_news_sm.load()\n"," self.textComplexitySpacy = TextComplexitySpacy()\n"," self.nlp = es_core_news_sm.load()\n"," elif lang == 'en':\n"," self.nlp = spacy.load(\"en_core_web_sm\")\n"," self.textComplexitySpacy = TextComplexitySpacy('en')\n","\n"," self.Text = Text\n"," self.FreqDist = FreqDist\n"," self.POS_LIST = [\"ADJ\", \"ADP\", \"ADV\", \"AUX\",\"X\", \"CCONJ\",\"CONJ\", \"DET\", \"INTJ\", \"NOUN\", \"NUM\", \"PART\", \"PRON\", \"PROPN\", \"PUNCT\", \"SCONJ\", \"SYM\", \"VERB\", \"SPACE\"]\n"," pass\n","\n"," # \n"," # X = samples input , y = tags\n"," #\n"," def textProcessing(self, X, y):\n"," d = {'category':y, 'text':X}\n"," self.df = pd.DataFrame(d)\n"," # Replace gaps\n"," self.df['text'].replace(np.nan,'', inplace=True)\n"," print('Shape: ', self.df.shape)\n","\n"," # Create category dictionary\n"," self.dic_categorias = {}\n"," for i in range(len(df)): \n"," if df.iloc[i,0] in self.dic_categorias:\n"," self.dic_categorias[df.iloc[i,0]] += 1\n"," else:\n"," self.dic_categorias[df.iloc[i,0]] = 1\n","\n"," self.df_category = pd.DataFrame({'category': self.dic_categorias.keys()})\n"," print('Dictionary of categories:', self.dic_categorias)\n","\n"," # Initialising variables for graphs\n"," sb.set(rc={'figure.figsize':(14,6)})\n"," all_values = self.dic_categorias.values()\n"," self.max_value = max(all_values)\n","\n"," def showGraph(self, columnas, type_g='strip',export=False):\n"," # Graph generator\n"," for columna in columnas:\n"," if (type_g == 'strip'):\n"," splot = sb.stripplot(x=columna,y='category', data=self.df)\n"," elif (type_g == 'box'):\n"," splot = sb.boxplot(x=columna,y='category', data=self.df)\n"," elif (type_g == 'heatmap'):\n"," dic={}\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," dic[cat] = df_grupo[columna].tolist()\n"," while len(dic[cat]) < self.max_value:\n"," dic[cat].append(dic[cat][len(dic[cat])-1])\n"," df_n = pd.DataFrame(dic)\n"," splot = sb.heatmap(df_n.transpose()).set_title(columna)\n"," if export == False:\n"," plt.show()\n"," else:\n"," splot.get_figure().savefig(columna+\"-\"+ type_g+\".jpg\", bbox_inches='tight')\n"," plt.clf()\n","\n"," def export(self):\n"," print('Exporting...')\n"," self.df.to_csv(\"data.csv\") \n"," self.df_category.to_csv(\"data_cat.csv\")\n"," self.showGraph(self.df.columns[2:],'strip',True)\n"," self.showGraph(self.df.columns[2:],'box',True)\n"," self.showGraph(self.df.columns[2:],'heatmap',True)\n","\n"," def volumetry(self):\n"," # Volumetrics for each text\n"," self.df['words'] = [len(text.split()) for text in self.df['text'].tolist()] # Number of words\n"," self.df['uniques'] = [len(set(text.split())) for text in self.df['text'].tolist()] # Number of unique words\n"," self.df['chars'] = self.df['text'].str.len() # Number of characters\n"," self.df['avg_words_len'] = round(self.df['chars'] / self.df['words'], 3) # Average word length\n"," self.df = self.df.replace([np.inf, -np.inf, np.nan], 0)\n"," \n"," # Volumetrics for each category\n"," volumetry = ['words','uniques','chars','avg_words_len']\n"," category_columns = ['category','docs']\n"," for col in volumetry:\n"," category_columns.append('avg_'+col)\n"," category_columns.append('std_'+col)\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in volumetry:\n"," self.df_category.loc[i,'docs'] = len(df_grupo)\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3) \n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 5)\n"," i+=1\n"," \n"," print('Volumetrics for each text:')\n"," display(self.df.head())\n"," print('Volumetrics for each category:')\n"," display(self.df_category[category_columns])\n"," \n"," self.showGraph(volumetry,'strip')\n"," self.showGraph(volumetry,'box')\n"," self.showGraph(volumetry,'heatmap')\n","\n"," return self.df, self.df_category\n","\n"," def lemmas(self):\n"," # Number and length of different lemmas per text\n"," dic_lemmas = {}\n"," for cat in self.dic_categorias:\n"," dic_lemmas[cat] = []\n","\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in tqdm(self.dic_categorias):\n"," df_grupo = groups.get_group(cat)\n"," for text in df_grupo['text'].tolist():\n"," set_ = set()\n"," suma = 0\n"," doc = self.nlp(text)\n"," for token in doc:\n"," set_.add(token.lemma_)\n"," suma += len(token.lemma_)\n"," if re.match('PUNCT.*|SYM.*|SPACE.*', token.pos_) == None:\n"," dic_lemmas[cat].append(token.lemma_)\n"," self.df.loc[i,'lemmas_uniques'] = len(set_)\n"," if(len(set_) != 0):\n"," self.df.loc[i,'avg_lemmas_len'] = round(suma / len(set_), 3)\n"," else:\n"," self.df.loc[i,'avg_lemmas_len'] = suma\n"," i+=1\n"," self.dic_lemmas = dic_lemmas\n","\n"," # Average and variance of different lemmas and length by category\n"," i = 0\n"," col_lemmas = ['lemmas_uniques','avg_lemmas_len']\n"," category_lemmas = ['category']\n"," for col in col_lemmas:\n"," category_lemmas.append('avg_'+col)\n"," category_lemmas.append('std_'+col)\n"," \n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_lemmas:\n"," self.df_category.loc[i,'docs'] = len(df_grupo)\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3) \n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 3)\n"," i+=1\n"," \n"," print('Lemmas for each text:')\n"," display(self.df.head())\n"," print('Lemmas for each category:')\n"," display(self.df_category[category_lemmas])\n","\n"," self.showGraph(col_lemmas,'strip')\n"," self.showGraph(col_lemmas,'box')\n"," self.showGraph(col_lemmas,'heatmap')\n","\n"," return self.df, self.df_category\n","\n"," def lemmas_freq(self, n = 50):\n"," # Most frequent lemmas by category\n"," dic_f_lemmas = self.dic_categorias.copy()\n"," for cat in self.dic_categorias:\n"," text = self.Text(self.dic_lemmas[cat])\n"," dic_f_lemmas[cat] = self.FreqDist(text).most_common(n)\n"," lista = []\n"," for tupla in dic_f_lemmas[cat]:\n"," lista.append((tupla[0], round(tupla[1] / len(self.dic_lemmas[cat]), 4)))\n"," while (len(lista) < n): # Rellenar huecos\n"," lista.append(np.nan)\n"," dic_f_lemmas[cat] = lista\n","\n"," df_freq_lemas = pd.DataFrame(dic_f_lemmas)\n"," df_freq_lemas_tr = df_freq_lemas.transpose()\n"," print('Most frequent lemmas by category')\n"," display(df_freq_lemas_tr)\n"," df_freq_lemas_tr.to_csv(\"lemas_freq.csv\") \n"," return df_freq_lemas.transpose()\n","\n"," def pos(self):\n"," # POS analysis for each text\n"," dic_pos_cat = {}\n"," for pos in self.POS_LIST:\n"," dic_pos_cat[pos] = {}\n"," for cat in self.dic_categorias:\n"," dic_pos_cat[pos][cat] = []\n","\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for text in tqdm(df_grupo['text'].tolist()):\n"," dic_pos = {}\n"," doc = self.nlp(text)\n"," for token in doc:\n"," if token.pos_ in dic_pos:\n"," dic_pos[token.pos_] += 1\n"," else:\n"," dic_pos[token.pos_] = 1\n"," dic_pos_cat[token.pos_][cat].append(token.text)\n"," total = len(doc)\n"," if total == 0:\n"," total = 1\n"," for pos in self.POS_LIST:\n"," if pos in dic_pos:\n"," self.df.loc[i,pos] = round(dic_pos[pos]/total,4)\n"," else:\n"," self.df.loc[i,pos] = np.nan\n"," i+=1\n"," self.dic_pos_cat = dic_pos_cat\n","\n"," # POS analysis for each category\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for pos in self.POS_LIST:\n"," if pos in df_grupo.columns.values:\n"," self.df_category.loc[i,'avg_'+pos] = round(df_grupo[pos].mean(), 3)\n"," self.df_category.loc[i,'std_'+pos] = round(df_grupo[pos].std(), 3)\n"," i+=1\n","\n"," print('POS analysis for each text')\n"," display(self.df.head())\n"," print('POS analysis for each category')\n"," display(self.df_category)\n"," self.showGraph(self.POS_LIST,'strip')\n"," self.showGraph(self.POS_LIST,'box')\n"," self.showGraph(self.POS_LIST,'heatmap')\n"," return self.df, self.df_category\n","\n"," def pos_freq(self, n = 15):\n"," # Most frequent words \n"," dic_f_palabras = self.dic_categorias.copy()\n"," for pos in self.POS_LIST:\n"," for cat in self.dic_categorias:\n"," if cat in self.dic_pos_cat[pos]:\n"," text = self.Text(self.dic_pos_cat[pos][cat])\n"," fdist = self.FreqDist(text)\n"," dic_f_palabras[cat] = fdist.most_common(n)\n"," lista = []\n"," for tupla in dic_f_palabras[cat]:\n"," lista.append((tupla[0],round(tupla[1] / len(self.dic_pos_cat[pos][cat]), 5))) \n"," dic_f_palabras[cat] = lista\n","\n"," while (len(dic_f_palabras[cat]) < n): # Rellenar huecos\n"," dic_f_palabras[cat].append(np.nan)\n","\n"," df_freq_palabras = pd.DataFrame(dic_f_palabras)\n"," print(\"---- Para \" + spacy.explain(pos) +\" las \"+ str(n)+\" palabras más frecuentes son: -------\")\n"," display(df_freq_palabras.transpose())\n"," df_freq_palabras_tr = df_freq_palabras.transpose()\n"," df_freq_palabras_tr.to_csv(\"POS_\"+ str(pos)+\"_freq.csv\") \n"," return df_freq_palabras.transpose()\n","\n"," def lexical_diversity(self):\n"," # Lexical diversity for each text\n"," i = 0\n"," for text in tqdm(self.df['text'].tolist()):\n"," flt = ld.flemmatize(text)\n"," self.df.loc[i,'simple_TTR'] = round(ld.ttr(flt), 4)\n"," self.df.loc[i,'root_TTR'] = round(ld.root_ttr(flt), 4)\n"," self.df.loc[i,'log_TTR'] = round(ld.log_ttr(flt), 4)\n"," self.df.loc[i,'maas_TTR'] = round(ld.maas_ttr(flt), 4)\n"," self.df.loc[i,'MSTTR'] = round(ld.msttr(flt), 4)\n"," self.df.loc[i,'MATTR'] = round(ld.mattr(flt), 4)\n"," self.df.loc[i,'HDD'] = round(ld.hdd(flt), 4)\n"," self.df.loc[i,'MTLD'] = round(ld.mtld(flt), 4)\n"," i+=1\n","\n"," # Lexical diversity for each category\n"," i = 0\n"," col_diversity = ['simple_TTR','root_TTR','log_TTR','maas_TTR','MSTTR','MATTR','HDD','MTLD']\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_diversity:\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(),4)\n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(),4)\n"," i+=1\n"," print('Lexical diversity for each text')\n"," display(self.df.head())\n"," print('Lexical diversity for each category')\n"," display(self.df_category)\n"," self.showGraph(col_diversity,'strip')\n"," self.showGraph(col_diversity,'box')\n"," self.showGraph(col_diversity,'heatmap')\n"," return self.df, self.df_category\n","\n"," def complexity(self):\n"," # Complexity diversity for each category\n"," i = 0\n"," for text in tqdm(self.df['text'].tolist()):\n"," if len(text) > 0:\n"," text_processed = self.textComplexitySpacy.textProcessing(text)\n"," pmarks = self.textComplexitySpacy.punctuationMarks()[0]\n"," self.df.loc[i,'lexcomplexity'] = self.textComplexitySpacy.lexicalComplexity()[6]\n"," self.df.loc[i,'ssreadability'] = self.textComplexitySpacy.ssReadability()[1]\n"," self.df.loc[i,'sencomplexity'] = self.textComplexitySpacy.sentenceComplexity()[3]\n"," self.df.loc[i,'autoreadability'] = self.textComplexitySpacy.autoReadability()[1]\n"," embeddingdepth = self.textComplexitySpacy.embeddingDepth()\n"," self.df.loc[i,'max_embeddingdepth'] = embeddingdepth[0]\n"," self.df.loc[i,'min_embeddingdepth'] = embeddingdepth[1]\n"," self.df.loc[i,'avg_embeddingdepth'] = embeddingdepth[2]\n"," readability = self.textComplexitySpacy.readability()\n"," self.df.loc[i,'huertareadability'] = round(readability[3],4)\n"," self.df.loc[i,'ifszreadability'] = round(readability[4],4)\n"," self.df.loc[i,'polinicompressibility'] = round(readability[5],4)\n"," self.df.loc[i,'mureadability'] = round(readability[6],4)\n"," self.df.loc[i,'agereadability'] = self.textComplexitySpacy.ageReadability()[0]\n"," self.df.loc[i,'yearscrawford'] = self.textComplexitySpacy.yearsCrawford()\n"," i+=1\n","\n"," # Complexity diversity for each category\n"," i = 0\n"," col_complexity = ['lexcomplexity','ssreadability','sencomplexity','autoreadability','max_embeddingdepth',\n"," 'min_embeddingdepth','avg_embeddingdepth','huertareadability','ifszreadability',\n"," 'polinicompressibility','mureadability','agereadability','yearscrawford']\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_complexity:\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 4)\n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 4)\n"," i+=1\n"," \n"," print('Complexity diversity for each text')\n"," display(self.df.head())\n"," print('Complexity diversity for each category')\n"," display(self.df_category)\n"," self.showGraph(col_complexity,'strip')\n"," self.showGraph(col_complexity,'box')\n"," self.showGraph(col_complexity,'heatmap')\n"," return self.df, self.df_category\n","\n"," def featureSelection(self):\n"," df = self.df.fillna(0)\n"," X = df.iloc[:,2:]\n"," y = df.iloc[:,0]\n","\n"," from sklearn.feature_selection import VarianceThreshold, SelectFromModel\n"," # Removing features with low variance\n"," sel = VarianceThreshold(threshold=(.8 * (1 - .8))) # No varía en más del 80% de datos\n"," arr = sel.fit_transform(X)\n"," print('Removing features with low variance...')\n"," print('Selected columns:',sel.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(arr))\n"," pd.DataFrame(arr).to_csv(\"VarianceThreshold.csv\") \n","\n"," # SelectFromModel\n"," # Selection of functions based on L1\n"," from sklearn.svm import LinearSVC\n","\n"," lsvc = LinearSVC(C=0.01, penalty=\"l1\", dual=False).fit(X, y)\n"," model = SelectFromModel(lsvc, prefit=True)\n"," X_new = model.transform(X)\n"," print('Removing features with SelectFromModel...')\n"," print('Selected columns:',model.get_feature_names_out(df.columns.values[2:]))\n"," display(pd.DataFrame(X_new))\n"," pd.DataFrame(X_new).to_csv(\"SelectFromModel.csv\") \n"," \n","\n"," def kBest(self, k = 10):\n"," df = self.df.fillna(0)\n"," X = df.iloc[:,2:]\n"," y = df.iloc[:,0]\n"," # Univariate feature selection\n"," from sklearn.feature_selection import SelectKBest\n"," from sklearn.feature_selection import f_classif, mutual_info_classif\n"," print('Highest scoring '+ str(k) +' features with f_classif...')\n"," kbest_classif = SelectKBest(f_classif, k=k) # Elimina todo menos las k características de puntuación más alta\n"," X_classif = kbest_classif.fit_transform(X, y)\n"," print('Selected columns:',kbest_classif.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(X_classif))\n"," pd.DataFrame(X_classif).to_csv(\"f_classif.csv\") \n","\n"," print('Highest scoring '+ str(k) +' features with mutual_info_classif...')\n"," kbest_mut = SelectKBest(mutual_info_classif, k=k)\n"," X_mut = kbest_mut.fit_transform(X, y)\n"," print('Selected columns:', kbest_mut.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(X_mut))\n"," pd.DataFrame(X_mut).to_csv(\"mutual_info_classif.csv\") \n"," "],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9Eo_cwABcmVf","executionInfo":{"status":"ok","timestamp":1644412305356,"user_tz":-60,"elapsed":11269,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}},"outputId":"b07b5c12-1bb0-4983-bfa5-106bdc142451"},"execution_count":88,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n","You can now load the model via spacy.load('es_core_news_sm')\n"]}]},{"cell_type":"markdown","source":["# Analisis"],"metadata":{"id":"WLTN5U8b32S6"}},{"cell_type":"code","source":["textAnalisisSpacy = TextAnalisisSpacy('es')"],"metadata":{"id":"ViskMg97pXwg","executionInfo":{"status":"ok","timestamp":1644412316691,"user_tz":-60,"elapsed":11349,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}}},"execution_count":89,"outputs":[]},{"cell_type":"markdown","source":["# Carga datos"],"metadata":{"id":"nrNo-f5p35ei"}},{"cell_type":"code","source":["df = pd.read_csv(\"./texts.csv\")\n","df.head()"],"metadata":{"id":"lVjjWxvEn5LP","colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"status":"ok","timestamp":1644412620771,"user_tz":-60,"elapsed":244,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}},"outputId":"e760b59b-ecf6-4308-ec13-8bff00a4f99c"},"execution_count":96,"outputs":[{"output_type":"execute_result","data":{"text/html":["\n"," <div id=\"df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>category</th>\n"," <th>author</th>\n"," <th>title</th>\n"," <th>text</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>Nereida</td>\n"," <td>TOC Y COMPROBACIONES</td>\n"," <td>Veo que en este foro, afortunadamente para vos...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>desesperacion15</td>\n"," <td>No se que me sucede.</td>\n"," <td>Hola a todos,Les escribo porque hace un tiempo...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>Holli</td>\n"," <td>No puedo con la Ansiedad</td>\n"," <td>Hola a todos, tengo 24 años y hace como 2 años...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Relaciones Padres e Hijos</td>\n"," <td>watermelon</td>\n"," <td>Les digo o no les digo que sufrí abusos de niñ...</td>\n"," <td>Hola, Soy nueva en esto del foro, de hecho ent...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Relaciones Padres e Hijos</td>\n"," <td>mr_rol</td>\n"," <td>Mi esposa trata mal a nuestro hijo de ocho a...</td>\n"," <td>Buenas noches, Tengo 34 años y mi esposa 25 t...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "],"text/plain":[" category ... text\n","0 Sección de Ansiedad y Fobias ... Veo que en este foro, afortunadamente para vos...\n","1 Sección de Ansiedad y Fobias ... Hola a todos,Les escribo porque hace un tiempo...\n","2 Sección de Ansiedad y Fobias ... Hola a todos, tengo 24 años y hace como 2 años...\n","3 Relaciones Padres e Hijos ... Hola, Soy nueva en esto del foro, de hecho ent...\n","4 Relaciones Padres e Hijos ... Buenas noches, Tengo 34 años y mi esposa 25 t...\n","\n","[5 rows x 4 columns]"]},"metadata":{},"execution_count":96}]},{"cell_type":"code","source":["textAnalisisSpacy.textProcessing(df['text'].tolist(),df['category'].tolist())\n","textAnalisisSpacy.volumetry()\n","textAnalisisSpacy.lemmas()\n","textAnalisisSpacy.lemmas_freq()\n","textAnalisisSpacy.pos()\n","textAnalisisSpacy.pos_freq()\n","textAnalisisSpacy.lexical_diversity()\n","textAnalisisSpacy.complexity()\n","textAnalisisSpacy.featureSelection()\n","textAnalisisSpacy.kBest()\n","textAnalisisSpacy.export()"],"metadata":{"id":"4LkSJ0COcetm","colab":{"base_uri":"https://localhost:8080/","height":1000,"output_embedded_package_id":"1vpzTTG4oQWTZufKEDJkd-zoAgVs4_JJu"},"executionInfo":{"status":"ok","timestamp":1644412718344,"user_tz":-60,"elapsed":95780,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}},"outputId":"c871b294-a937-4da4-a4c3-fde5b9f4215a"},"execution_count":97,"outputs":[{"output_type":"display_data","data":{"text/plain":"Output hidden; open in https://colab.research.google.com to view."},"metadata":{}}]}],"metadata":{"colab":{"collapsed_sections":[],"name":"Biblioteca Análisis-PLN.ipynb","provenance":[],"toc_visible":true,"mount_file_id":"1WdsL5lkOudV-Xpbrmg6f92v3ujY8QHAf","authorship_tag":"ABX9TyMv+hpn4P7K1p0PdKq3+UyM"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
\ No newline at end of file
{"cells":[{"cell_type":"markdown","metadata":{"id":"tgTuDdZIH7og"},"source":["# Librerías"]},{"cell_type":"code","execution_count":85,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":9817,"status":"ok","timestamp":1644412293290,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"ydZHklMLF2g9","outputId":"36f2c877-04f4-4e87-886d-3608f0757c0e"},"outputs":[{"name":"stdout","output_type":"stream","text":["Requirement already satisfied: lexical-diversity in /usr/local/lib/python3.7/dist-packages (0.1.1)\n","Requirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4)\n","Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0)\n","Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.62.3)\n","Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1)\n","Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.0)\n","Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5)\n","Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0)\n","Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0)\n","Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3)\n","Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.19.5)\n","Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0)\n","Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6)\n","Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6)\n","Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.6)\n","Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.10.1)\n","Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.10.0.2)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.7.0)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n","Requirement already satisfied: syllables in /usr/local/lib/python3.7/dist-packages (1.0.3)\n"]}],"source":["!pip install lexical-diversity # Lexical Diversity\n","!pip install spacy # Natural Language Processing\n","!pip install syllables # Syllable counter for english"]},{"cell_type":"code","execution_count":86,"metadata":{"executionInfo":{"elapsed":26,"status":"ok","timestamp":1644412293291,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"F8_bf3LFYobW"},"outputs":[],"source":["crea_total_path = './CREA_total.txt'"]},{"cell_type":"markdown","metadata":{"id":"iyyf3urz3ljW"},"source":["# TextComplexitySpacy"]},{"cell_type":"code","execution_count":87,"metadata":{"executionInfo":{"elapsed":824,"status":"ok","timestamp":1644412294091,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"fbsCkEC8vBLg"},"outputs":[],"source":["from functools import reduce\n","import math\n","import syllables\n","class TextComplexitySpacy():\n"," \n"," def __init__(self, lang='es'):\n"," self.lang = lang\n","\n"," # create language analyzer\n"," if lang == 'es':\n"," self.nlp = es_core_news_sm.load()\n"," if lang == 'en':\n"," self.nlp = spacy.load(\"en_core_web_sm\")\n","\n"," # Para leer el texto que introducimos\n"," f = open(crea_total_path)\n"," lines = f.readlines()\n"," f.close()\n"," crea = {}\n"," for l in lines[1:1000]: # those words not in the 1000 most frequent words in CREA are low frequency words\n"," data = l.strip().split()\n"," crea[data[1]] = float(data[2].replace(',', ''))\n"," self.crea = crea\n"," pass\n","\n"," def textProcessing(self, text):\n"," # Meter todas las funciones en una patron de los tokens válidos\n"," doc = self.nlp(text)\n"," self.tokens = [w for w in doc]\n"," self.sentences = [sent for sent in doc.sents]\n"," self.nsentences = len(self.sentences)\n"," \n"," '''\n"," Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos\n"," '''\n"," pos_content_sentences = []\n"," for sentence in self.sentences:\n"," ws = self.nlp(sentence.text)\n"," pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])\n"," self.pos_content_sentences = pos_content_sentences \n"," \n"," return self.pos_content_sentences\n","\n"," \n"," def punctuationMarks(self):\n"," # Solo nos interesa contar los tokens que sean signo de puntuación.\n"," # Number of words.\n"," punctuation = []\n"," N_words = []\n"," for w in self.tokens:\n"," if re.match('PUNCT.*', w.pos_):\n"," punctuation.append(w.text)\n"," else:\n"," N_words.append(w.text)\n","\n"," aux = len(N_words) \n"," if aux == 0:\n"," aux = 1\n"," self.N_words = aux\n"," \n"," self.npunctuation = len(punctuation)\n"," self.punctuation = punctuation\n"," \n"," return self.npunctuation, self.punctuation, self.N_words\n","\n"," def lexicalComplexity(self):\n"," # Number of low frequency words \n"," count = 0\n"," for sentence in self.pos_content_sentences:\n"," for w in sentence:\n"," if w.text not in self.crea:\n"," count+=1\n"," N_lfw = count\n"," self.N_lfw = N_lfw\n","\n"," # Number of distinct content words \n"," N_dcw = len(set([w.text.lower() for s in self.pos_content_sentences for w in s]))\n"," self.N_dcw =N_dcw\n","\n"," # Number of sentences\n"," self.N_s = len(self.pos_content_sentences)\n","\n"," # Number of total content words\n"," N_cw = reduce((lambda x, y: x + y), [len(s) for s in self.pos_content_sentences])\n"," self.N_cw = N_cw\n","\n"," # Lexical Distribution Index\n"," if self.N_s == 0:\n"," self.N_s = 1\n"," LDI = N_dcw / float(self.N_s)\n"," self.LDI = LDI\n"," \n"," # Index of Low Frequency Words\n"," if N_cw == 0:\n"," N_cw = 1\n"," ILFW = N_lfw / float(N_cw)\n"," self.ILFW =ILFW\n","\n"," # Lexical Complexity\n"," LC = (LDI + ILFW) / 2\n"," self.LC = LC\n"," \n"," return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC\n","\n"," def ssReadability(self): \n"," #Number of rare words\n"," byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)\n"," byfreq = byfreq[:1500]\n"," count = 0\n"," for sentence in self.pos_content_sentences:\n"," for w in sentence:\n"," if w.text.lower() not in byfreq:\n"," count +=1\n"," N_rw = count\n"," self.N_rw = N_rw\n"," \n"," SSR = 1.609*(self.N_words / self.N_s) + 331.8* (self.N_rw /self.N_words) + 22.0 \n"," self.SSR= SSR\n"," \n"," return self.N_rw, self.SSR\n","\n"," def sentenceComplexity(self):\n"," #Number of complex sentences\n"," N_cs = 0\n"," for sentence in self.sentences:\n"," previous_is_verb = False\n"," count = 0\n"," for w in sentence:\n"," if re.match('VERB.*', w.pos_):\n"," if (previous_is_verb):\n"," count += 1\n"," previous_is_verb = False\n"," else:\n"," previous_is_verb = True\n"," else:\n"," previous_is_verb = False\n"," if count>0:\n"," N_cs += 1 \n"," self.N_cs = N_cs\n"," \n"," ASL = self.N_words / self.N_s\n"," self.ASL = ASL\n"," \n"," CS = self.N_cs / self.N_s\n"," self.CS = CS\n"," \n"," SCI = (ASL + CS)/ 2\n"," self.SCI = SCI\n"," \n"," return self.N_cs, self.ASL, self.CS, self.SCI\n","\n"," def autoReadability(self):\n"," # Number of characters\n"," count = 0\n"," listwords = []\n"," for words in self.sentences:\n"," for w in words:\n"," if re.match('\\r\\n.*', w.text):\n"," count +=1\n"," else:\n"," listwords.append(w.text)\n"," \n"," self.listwords = listwords \n"," N_charac = 0\n"," for characters in self.listwords:\n"," N_charac += len(characters)\n"," \n"," self.N_charac = N_charac\n"," \n"," ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words/ self.N_s - 21.43\n"," self.ARI = ARI\n"," \n"," return self.N_charac, self.ARI, self.listwords\n","\n"," \n"," def tree_height(self,root, cont):\n"," if not list(root.children):\n"," return 1\n"," else:\n"," cont+=1\n"," if cont == 320:\n"," return 320\n"," return 1 + max(self.tree_height(x, cont) for x in root.children)\n","\n"," def embeddingDepth(self):\n"," ## Output results\n"," roots = [sent.root for sent in self.sentences]\n"," max_list = []\n"," max_list = [self.tree_height(root,0) for root in roots]\n"," mean_max_list = sum(max_list)/(len(max_list))\n"," max_max_list = max(max_list)\n"," min_max_list = min(max_list)\n"," \n"," self.max_max_list = max_max_list\n"," self.min_max_list = min_max_list\n"," self.mean_max_list = mean_max_list\n"," \n"," return self.max_max_list, self.min_max_list, self.mean_max_list\n","\n"," def syllable_counter_spanish(self,text):\n"," if self.lang == 'es':\n"," t = re.sub(r'y([aáeéiíoóuú])', '\\\\1', text.lower())\n"," t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())\n"," t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()\n"," t = re.sub(r'[aáeéiíoóuúy]', 'A', t)\n"," return(len(t.split('A'))-1)\n","\n"," elif self.lang == 'en':\n"," return syllables.estimate(text)\n","\n"," def readability(self):\n"," # Number of syllables and Number of words with 3 or more syllables:tagger\n"," n_syllables = 0\n"," n_syllables3 = 0\n"," for words in self.listwords:\n"," syllables = self.syllable_counter_spanish(words)\n"," n_syllables += syllables\n"," if syllables>=3:\n"," n_syllables3 += 1\n"," \n"," self.n_syllables = n_syllables\n"," self.n_syllables3 = n_syllables3\n"," \n"," # Number of letters\n"," nletters= 0\n"," letters = []\n"," vecletters =[]\n"," for word in self.listwords:\n"," if re.match('[a-zA-Z]|á|ó|í|ú|é', word):\n"," letters.append(word)\n"," nletters+=len(word)\n"," vecletters.append(len(word))\n"," \n"," self.letters = letters\n"," self.nletters = nletters\n"," self.vecletters= vecletters\n"," \n"," huertareadability = 206.835 - 60 * (self.n_syllables / self.N_words) - 102 * (self.nsentences / self.N_words)\n"," self.huertareadability = huertareadability\n"," \n"," ifszreadability = 206.835 - 62.3 * (self.n_syllables / self.N_words) - (self.N_words / self.nsentences) \n"," self.ifszreadability = ifszreadability\n"," \n"," self.syll_words = self.n_syllables / self.N_words\n"," \n"," \n"," polinicompressibility = 95.2 - 9.7 * (self.nletters / self.N_words) - 0.35 * (self.N_words / self.nsentences) \n"," self.polinicompressibility = polinicompressibility\n"," \n"," self.words_sen = self.N_words / self.nsentences\n"," \n"," x=self.nletters / self.N_words\n"," varianza=np.var(self.vecletters)\n"," if varianza == 0:\n"," varianza =1\n"," aux = self.N_words-1\n"," if aux == 0:\n"," aux=1\n"," mureadability = (self.N_words /aux)*(x/varianza)*100\n"," self.mureadability = mureadability\n"," \n"," return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen\n"," \n"," def ageReadability(self):\n"," \n"," minimumage = 0.2495 *(self.N_words/self.nsentences) + 6.4763 * (self.n_syllables /self.N_words) - 7.1395\n"," self.minimumage = minimumage\n"," \n"," solreadability= -2.51+0.74*(3.1291+1.0430*math.sqrt(self.n_syllables3*(30/self.nsentences)))\n"," self.solreadability = solreadability\n"," \n"," return self.minimumage, self.solreadability\n"," \n"," def yearsCrawford(self):\n"," \n"," years = -20.5 *(self.nsentences/self.N_words) + 4.9 * (self.n_syllables /self.N_words) - 3.407\n"," self.years = years\n"," \n"," return self.years"]},{"cell_type":"markdown","metadata":{"id":"GQm029q73tRQ"},"source":["# TextAnalysisSpacy"]},{"cell_type":"code","execution_count":88,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":11269,"status":"ok","timestamp":1644412305356,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"9Eo_cwABcmVf","outputId":"b07b5c12-1bb0-4983-bfa5-106bdc142451"},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n","You can now load the model via spacy.load('es_core_news_sm')\n"]}],"source":["# Only required for analysis in Spanish\n","import spacy.cli \n","spacy.cli.download(\"es_core_news_sm\")\n","import es_core_news_sm\n","\n","# Imports\n","import spacy\n","import numpy as np\n","from tqdm import tqdm\n","import re\n","import pandas as pd\n","\n","import matplotlib.pyplot as plt\n","%matplotlib inline\n","import seaborn as sb\n","import nltk\n","from nltk.probability import FreqDist\n","from nltk.text import Text\n","from lexical_diversity import lex_div as ld\n","\n","class TextAnalysisSpacy():\n"," \n"," def __init__(self, lang='es'):\n","\n"," # Create language analyzer\n"," if lang == 'es':\n"," self.nlp = es_core_news_sm.load()\n"," self.textComplexitySpacy = TextComplexitySpacy()\n"," self.nlp = es_core_news_sm.load()\n"," elif lang == 'en':\n"," self.nlp = spacy.load(\"en_core_web_sm\")\n"," self.textComplexitySpacy = TextComplexitySpacy('en')\n","\n"," self.Text = Text\n"," self.FreqDist = FreqDist\n"," self.POS_LIST = [\"ADJ\", \"ADP\", \"ADV\", \"AUX\",\"X\", \"CCONJ\",\"CONJ\", \"DET\", \"INTJ\", \"NOUN\", \"NUM\", \"PART\", \"PRON\", \"PROPN\", \"PUNCT\", \"SCONJ\", \"SYM\", \"VERB\", \"SPACE\"]\n"," pass\n","\n"," # \n"," # X = samples input , y = tags\n"," #\n"," def textProcessing(self, X, y):\n"," d = {'category':y, 'text':X}\n"," self.df = pd.DataFrame(d)\n"," # Replace gaps\n"," self.df['text'].replace(np.nan,'', inplace=True)\n"," print('Shape: ', self.df.shape)\n","\n"," # Create category dictionary\n"," self.dic_categorias = {}\n"," for i in range(len(df)): \n"," if df.iloc[i,0] in self.dic_categorias:\n"," self.dic_categorias[df.iloc[i,0]] += 1\n"," else:\n"," self.dic_categorias[df.iloc[i,0]] = 1\n","\n"," self.df_category = pd.DataFrame({'category': self.dic_categorias.keys()})\n"," print('Dictionary of categories:', self.dic_categorias)\n","\n"," # Initialising variables for graphs\n"," sb.set(rc={'figure.figsize':(14,6)})\n"," all_values = self.dic_categorias.values()\n"," self.max_value = max(all_values)\n","\n"," def showGraph(self, columnas, type_g='strip',export=False):\n"," # Graph generator\n"," for columna in columnas:\n"," if (type_g == 'strip'):\n"," splot = sb.stripplot(x=columna,y='category', data=self.df)\n"," elif (type_g == 'box'):\n"," splot = sb.boxplot(x=columna,y='category', data=self.df)\n"," elif (type_g == 'heatmap'):\n"," dic={}\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," dic[cat] = df_grupo[columna].tolist()\n"," while len(dic[cat]) < self.max_value:\n"," dic[cat].append(dic[cat][len(dic[cat])-1])\n"," df_n = pd.DataFrame(dic)\n"," splot = sb.heatmap(df_n.transpose()).set_title(columna)\n"," if export == False:\n"," plt.show()\n"," else:\n"," splot.get_figure().savefig(columna+\"-\"+ type_g+\".jpg\", bbox_inches='tight')\n"," plt.clf()\n","\n"," def export(self):\n"," print('Exporting...')\n"," self.df.to_csv(\"data.csv\") \n"," self.df_category.to_csv(\"data_cat.csv\")\n"," self.showGraph(self.df.columns[2:],'strip',True)\n"," self.showGraph(self.df.columns[2:],'box',True)\n"," self.showGraph(self.df.columns[2:],'heatmap',True)\n","\n"," def volumetry(self):\n"," # Volumetrics for each text\n"," self.df['words'] = [len(text.split()) for text in self.df['text'].tolist()] # Number of words\n"," self.df['uniques'] = [len(set(text.split())) for text in self.df['text'].tolist()] # Number of unique words\n"," self.df['chars'] = self.df['text'].str.len() # Number of characters\n"," self.df['avg_words_len'] = round(self.df['chars'] / self.df['words'], 3) # Average word length\n"," self.df = self.df.replace([np.inf, -np.inf, np.nan], 0)\n"," \n"," # Volumetrics for each category\n"," volumetry = ['words','uniques','chars','avg_words_len']\n"," category_columns = ['category','docs']\n"," for col in volumetry:\n"," category_columns.append('avg_'+col)\n"," category_columns.append('std_'+col)\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in volumetry:\n"," self.df_category.loc[i,'docs'] = len(df_grupo)\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3) \n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 5)\n"," i+=1\n"," \n"," print('Volumetrics for each text:')\n"," display(self.df.head())\n"," print('Volumetrics for each category:')\n"," display(self.df_category[category_columns])\n"," \n"," self.showGraph(volumetry,'strip')\n"," self.showGraph(volumetry,'box')\n"," self.showGraph(volumetry,'heatmap')\n","\n"," return self.df, self.df_category\n","\n"," def lemmas(self):\n"," # Number and length of different lemmas per text\n"," dic_lemmas = {}\n"," for cat in self.dic_categorias:\n"," dic_lemmas[cat] = []\n","\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in tqdm(self.dic_categorias):\n"," df_grupo = groups.get_group(cat)\n"," for text in df_grupo['text'].tolist():\n"," set_ = set()\n"," suma = 0\n"," doc = self.nlp(text)\n"," for token in doc:\n"," set_.add(token.lemma_)\n"," suma += len(token.lemma_)\n"," if re.match('PUNCT.*|SYM.*|SPACE.*', token.pos_) == None:\n"," dic_lemmas[cat].append(token.lemma_)\n"," self.df.loc[i,'lemmas_uniques'] = len(set_)\n"," if(len(set_) != 0):\n"," self.df.loc[i,'avg_lemmas_len'] = round(suma / len(set_), 3)\n"," else:\n"," self.df.loc[i,'avg_lemmas_len'] = suma\n"," i+=1\n"," self.dic_lemmas = dic_lemmas\n","\n"," # Average and variance of different lemmas and length by category\n"," i = 0\n"," col_lemmas = ['lemmas_uniques','avg_lemmas_len']\n"," category_lemmas = ['category']\n"," for col in col_lemmas:\n"," category_lemmas.append('avg_'+col)\n"," category_lemmas.append('std_'+col)\n"," \n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_lemmas:\n"," self.df_category.loc[i,'docs'] = len(df_grupo)\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3) \n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 3)\n"," i+=1\n"," \n"," print('Lemmas for each text:')\n"," display(self.df.head())\n"," print('Lemmas for each category:')\n"," display(self.df_category[category_lemmas])\n","\n"," self.showGraph(col_lemmas,'strip')\n"," self.showGraph(col_lemmas,'box')\n"," self.showGraph(col_lemmas,'heatmap')\n","\n"," return self.df, self.df_category\n","\n"," def lemmas_freq(self, n = 50):\n"," # Most frequent lemmas by category\n"," dic_f_lemmas = self.dic_categorias.copy()\n"," for cat in self.dic_categorias:\n"," text = self.Text(self.dic_lemmas[cat])\n"," dic_f_lemmas[cat] = self.FreqDist(text).most_common(n)\n"," lista = []\n"," for tupla in dic_f_lemmas[cat]:\n"," lista.append((tupla[0], round(tupla[1] / len(self.dic_lemmas[cat]), 4)))\n"," while (len(lista) < n): # Rellenar huecos\n"," lista.append(np.nan)\n"," dic_f_lemmas[cat] = lista\n","\n"," df_freq_lemas = pd.DataFrame(dic_f_lemmas)\n"," df_freq_lemas_tr = df_freq_lemas.transpose()\n"," print('Most frequent lemmas by category')\n"," display(df_freq_lemas_tr)\n"," df_freq_lemas_tr.to_csv(\"lemas_freq.csv\") \n"," return df_freq_lemas.transpose()\n","\n"," def pos(self):\n"," # POS analysis for each text\n"," dic_pos_cat = {}\n"," for pos in self.POS_LIST:\n"," dic_pos_cat[pos] = {}\n"," for cat in self.dic_categorias:\n"," dic_pos_cat[pos][cat] = []\n","\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for text in tqdm(df_grupo['text'].tolist()):\n"," dic_pos = {}\n"," doc = self.nlp(text)\n"," for token in doc:\n"," if token.pos_ in dic_pos:\n"," dic_pos[token.pos_] += 1\n"," else:\n"," dic_pos[token.pos_] = 1\n"," dic_pos_cat[token.pos_][cat].append(token.text)\n"," total = len(doc)\n"," if total == 0:\n"," total = 1\n"," for pos in self.POS_LIST:\n"," if pos in dic_pos:\n"," self.df.loc[i,pos] = round(dic_pos[pos]/total,4)\n"," else:\n"," self.df.loc[i,pos] = np.nan\n"," i+=1\n"," self.dic_pos_cat = dic_pos_cat\n","\n"," # POS analysis for each category\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for pos in self.POS_LIST:\n"," if pos in df_grupo.columns.values:\n"," self.df_category.loc[i,'avg_'+pos] = round(df_grupo[pos].mean(), 3)\n"," self.df_category.loc[i,'std_'+pos] = round(df_grupo[pos].std(), 3)\n"," i+=1\n","\n"," print('POS analysis for each text')\n"," display(self.df.head())\n"," print('POS analysis for each category')\n"," display(self.df_category)\n"," self.showGraph(self.POS_LIST,'strip')\n"," self.showGraph(self.POS_LIST,'box')\n"," self.showGraph(self.POS_LIST,'heatmap')\n"," return self.df, self.df_category\n","\n"," def pos_freq(self, n = 15):\n"," # Most frequent words \n"," dic_f_palabras = self.dic_categorias.copy()\n"," for pos in self.POS_LIST:\n"," for cat in self.dic_categorias:\n"," if cat in self.dic_pos_cat[pos]:\n"," text = self.Text(self.dic_pos_cat[pos][cat])\n"," fdist = self.FreqDist(text)\n"," dic_f_palabras[cat] = fdist.most_common(n)\n"," lista = []\n"," for tupla in dic_f_palabras[cat]:\n"," lista.append((tupla[0],round(tupla[1] / len(self.dic_pos_cat[pos][cat]), 5))) \n"," dic_f_palabras[cat] = lista\n","\n"," while (len(dic_f_palabras[cat]) < n): # Rellenar huecos\n"," dic_f_palabras[cat].append(np.nan)\n","\n"," df_freq_palabras = pd.DataFrame(dic_f_palabras)\n"," print(\"---- Para \" + spacy.explain(pos) +\" las \"+ str(n)+\" palabras más frecuentes son: -------\")\n"," display(df_freq_palabras.transpose())\n"," df_freq_palabras_tr = df_freq_palabras.transpose()\n"," df_freq_palabras_tr.to_csv(\"POS_\"+ str(pos)+\"_freq.csv\") \n"," return df_freq_palabras.transpose()\n","\n"," def lexical_diversity(self):\n"," # Lexical diversity for each text\n"," i = 0\n"," for text in tqdm(self.df['text'].tolist()):\n"," flt = ld.flemmatize(text)\n"," self.df.loc[i,'simple_TTR'] = round(ld.ttr(flt), 4)\n"," self.df.loc[i,'root_TTR'] = round(ld.root_ttr(flt), 4)\n"," self.df.loc[i,'log_TTR'] = round(ld.log_ttr(flt), 4)\n"," self.df.loc[i,'maas_TTR'] = round(ld.maas_ttr(flt), 4)\n"," self.df.loc[i,'MSTTR'] = round(ld.msttr(flt), 4)\n"," self.df.loc[i,'MATTR'] = round(ld.mattr(flt), 4)\n"," self.df.loc[i,'HDD'] = round(ld.hdd(flt), 4)\n"," self.df.loc[i,'MTLD'] = round(ld.mtld(flt), 4)\n"," i+=1\n","\n"," # Lexical diversity for each category\n"," i = 0\n"," col_diversity = ['simple_TTR','root_TTR','log_TTR','maas_TTR','MSTTR','MATTR','HDD','MTLD']\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_diversity:\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(),4)\n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(),4)\n"," i+=1\n"," print('Lexical diversity for each text')\n"," display(self.df.head())\n"," print('Lexical diversity for each category')\n"," display(self.df_category)\n"," self.showGraph(col_diversity,'strip')\n"," self.showGraph(col_diversity,'box')\n"," self.showGraph(col_diversity,'heatmap')\n"," return self.df, self.df_category\n","\n"," def complexity(self):\n"," # Complexity diversity for each category\n"," i = 0\n"," for text in tqdm(self.df['text'].tolist()):\n"," if len(text) > 0:\n"," text_processed = self.textComplexitySpacy.textProcessing(text)\n"," pmarks = self.textComplexitySpacy.punctuationMarks()[0]\n"," self.df.loc[i,'lexcomplexity'] = self.textComplexitySpacy.lexicalComplexity()[6]\n"," self.df.loc[i,'ssreadability'] = self.textComplexitySpacy.ssReadability()[1]\n"," self.df.loc[i,'sencomplexity'] = self.textComplexitySpacy.sentenceComplexity()[3]\n"," self.df.loc[i,'autoreadability'] = self.textComplexitySpacy.autoReadability()[1]\n"," embeddingdepth = self.textComplexitySpacy.embeddingDepth()\n"," self.df.loc[i,'max_embeddingdepth'] = embeddingdepth[0]\n"," self.df.loc[i,'min_embeddingdepth'] = embeddingdepth[1]\n"," self.df.loc[i,'avg_embeddingdepth'] = embeddingdepth[2]\n"," readability = self.textComplexitySpacy.readability()\n"," self.df.loc[i,'huertareadability'] = round(readability[3],4)\n"," self.df.loc[i,'ifszreadability'] = round(readability[4],4)\n"," self.df.loc[i,'polinicompressibility'] = round(readability[5],4)\n"," self.df.loc[i,'mureadability'] = round(readability[6],4)\n"," self.df.loc[i,'agereadability'] = self.textComplexitySpacy.ageReadability()[0]\n"," self.df.loc[i,'yearscrawford'] = self.textComplexitySpacy.yearsCrawford()\n"," i+=1\n","\n"," # Complexity diversity for each category\n"," i = 0\n"," col_complexity = ['lexcomplexity','ssreadability','sencomplexity','autoreadability','max_embeddingdepth',\n"," 'min_embeddingdepth','avg_embeddingdepth','huertareadability','ifszreadability',\n"," 'polinicompressibility','mureadability','agereadability','yearscrawford']\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_complexity:\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 4)\n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 4)\n"," i+=1\n"," \n"," print('Complexity diversity for each text')\n"," display(self.df.head())\n"," print('Complexity diversity for each category')\n"," display(self.df_category)\n"," self.showGraph(col_complexity,'strip')\n"," self.showGraph(col_complexity,'box')\n"," self.showGraph(col_complexity,'heatmap')\n"," return self.df, self.df_category\n","\n"," def featureSelection(self):\n"," df = self.df.fillna(0)\n"," X = df.iloc[:,2:]\n"," y = df.iloc[:,0]\n","\n"," from sklearn.feature_selection import VarianceThreshold, SelectFromModel\n"," # Removing features with low variance\n"," sel = VarianceThreshold(threshold=(.8 * (1 - .8))) # No varía en más del 80% de datos\n"," arr = sel.fit_transform(X)\n"," print('Removing features with low variance...')\n"," print('Selected columns:',sel.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(arr))\n"," pd.DataFrame(arr).to_csv(\"VarianceThreshold.csv\") \n","\n"," # SelectFromModel\n"," # Selection of functions based on L1\n"," from sklearn.svm import LinearSVC\n","\n"," lsvc = LinearSVC(C=0.01, penalty=\"l1\", dual=False).fit(X, y)\n"," model = SelectFromModel(lsvc, prefit=True)\n"," X_new = model.transform(X)\n"," print('Removing features with SelectFromModel...')\n"," print('Selected columns:',model.get_feature_names_out(df.columns.values[2:]))\n"," display(pd.DataFrame(X_new))\n"," pd.DataFrame(X_new).to_csv(\"SelectFromModel.csv\") \n"," \n","\n"," def kBest(self, k = 10):\n"," df = self.df.fillna(0)\n"," X = df.iloc[:,2:]\n"," y = df.iloc[:,0]\n"," # Univariate feature selection\n"," from sklearn.feature_selection import SelectKBest\n"," from sklearn.feature_selection import f_classif, mutual_info_classif\n"," print('Highest scoring '+ str(k) +' features with f_classif...')\n"," kbest_classif = SelectKBest(f_classif, k=k) # Elimina todo menos las k características de puntuación más alta\n"," X_classif = kbest_classif.fit_transform(X, y)\n"," print('Selected columns:',kbest_classif.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(X_classif))\n"," pd.DataFrame(X_classif).to_csv(\"f_classif.csv\") \n","\n"," print('Highest scoring '+ str(k) +' features with mutual_info_classif...')\n"," kbest_mut = SelectKBest(mutual_info_classif, k=k)\n"," X_mut = kbest_mut.fit_transform(X, y)\n"," print('Selected columns:', kbest_mut.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(X_mut))\n"," pd.DataFrame(X_mut).to_csv(\"mutual_info_classif.csv\") \n"," "]},{"cell_type":"markdown","metadata":{"id":"WLTN5U8b32S6"},"source":["# Analysis"]},{"cell_type":"code","execution_count":89,"metadata":{"executionInfo":{"elapsed":11349,"status":"ok","timestamp":1644412316691,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"ViskMg97pXwg"},"outputs":[],"source":["textAnalysisSpacy = TextAnalysisSpacy('es')"]},{"cell_type":"markdown","metadata":{"id":"nrNo-f5p35ei"},"source":["# Carga datos"]},{"cell_type":"code","execution_count":96,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":244,"status":"ok","timestamp":1644412620771,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"lVjjWxvEn5LP","outputId":"e760b59b-ecf6-4308-ec13-8bff00a4f99c"},"outputs":[{"data":{"text/html":["\n"," <div id=\"df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>category</th>\n"," <th>author</th>\n"," <th>title</th>\n"," <th>text</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>Nereida</td>\n"," <td>TOC Y COMPROBACIONES</td>\n"," <td>Veo que en este foro, afortunadamente para vos...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>desesperacion15</td>\n"," <td>No se que me sucede.</td>\n"," <td>Hola a todos,Les escribo porque hace un tiempo...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>Holli</td>\n"," <td>No puedo con la Ansiedad</td>\n"," <td>Hola a todos, tengo 24 años y hace como 2 años...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Relaciones Padres e Hijos</td>\n"," <td>watermelon</td>\n"," <td>Les digo o no les digo que sufrí abusos de niñ...</td>\n"," <td>Hola, Soy nueva en esto del foro, de hecho ent...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Relaciones Padres e Hijos</td>\n"," <td>mr_rol</td>\n"," <td>Mi esposa trata mal a nuestro hijo de ocho a...</td>\n"," <td>Buenas noches, Tengo 34 años y mi esposa 25 t...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "],"text/plain":[" category ... text\n","0 Sección de Ansiedad y Fobias ... Veo que en este foro, afortunadamente para vos...\n","1 Sección de Ansiedad y Fobias ... Hola a todos,Les escribo porque hace un tiempo...\n","2 Sección de Ansiedad y Fobias ... Hola a todos, tengo 24 años y hace como 2 años...\n","3 Relaciones Padres e Hijos ... Hola, Soy nueva en esto del foro, de hecho ent...\n","4 Relaciones Padres e Hijos ... Buenas noches, Tengo 34 años y mi esposa 25 t...\n","\n","[5 rows x 4 columns]"]},"execution_count":96,"metadata":{},"output_type":"execute_result"}],"source":["df = pd.read_csv(\"./texts.csv\")\n","df.head()"]},{"cell_type":"code","execution_count":97,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000,"output_embedded_package_id":"1vpzTTG4oQWTZufKEDJkd-zoAgVs4_JJu"},"executionInfo":{"elapsed":95780,"status":"ok","timestamp":1644412718344,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"4LkSJ0COcetm","outputId":"c871b294-a937-4da4-a4c3-fde5b9f4215a"},"outputs":[{"data":{"text/plain":["Output hidden; open in https://colab.research.google.com to view."]},"metadata":{},"output_type":"display_data"}],"source":["textAnalysisSpacy.textProcessing(df['text'].tolist(),df['category'].tolist())\n","textAnalysisSpacy.volumetry()\n","textAnalysisSpacy.lemmas()\n","textAnalysisSpacy.lemmas_freq()\n","textAnalysisSpacy.pos()\n","textAnalysisSpacy.pos_freq()\n","textAnalysisSpacy.lexical_diversity()\n","textAnalysisSpacy.complexity()\n","textAnalysisSpacy.featureSelection()\n","textAnalysisSpacy.kBest()\n","textAnalysisSpacy.export()"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyMv+hpn4P7K1p0PdKq3+UyM","collapsed_sections":[],"mount_file_id":"1WdsL5lkOudV-Xpbrmg6f92v3ujY8QHAf","name":"Biblioteca Análisis-PLN.ipynb","provenance":[],"toc_visible":true},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment