SINAI / texty
Commit de47e349, authored Feb 15, 2022 by Alba Maria Mármol
update example
parent 0c1e24b8
Showing 1 changed file with 1 addition and 2 deletions

example.ipynb (view file @ de47e349)
{"cells":[{"cell_type":"markdown","metadata":{"id":"tgTuDdZIH7og"},"source":["# Librerías"]},{"cell_type":"code","execution_count":85,"metadata":{"id":"ydZHklMLF2g9","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1644412293290,"user_tz":-60,"elapsed":9817,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}},"outputId":"36f2c877-04f4-4e87-886d-3608f0757c0e"},"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: lexical-diversity in /usr/local/lib/python3.7/dist-packages (0.1.1)\n","Requirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4)\n","Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0)\n","Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.62.3)\n","Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1)\n","Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.0)\n","Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5)\n","Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0)\n","Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0)\n","Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3)\n","Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.19.5)\n","Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0)\n","Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6)\n","Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6)\n","Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.6)\n","Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.10.1)\n","Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.10.0.2)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.7.0)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n","Requirement already satisfied: syllables in /usr/local/lib/python3.7/dist-packages (1.0.3)\n"]}],"source":["!pip install lexical-diversity # Lexical Diversity\n","!pip install spacy # Natural 
Language Processing\n","!pip install syllables # Syllable counter for english"]},{"cell_type":"code","source":["crea_total_path = './CREA_total.txt'"],"metadata":{"id":"F8_bf3LFYobW","executionInfo":{"status":"ok","timestamp":1644412293291,"user_tz":-60,"elapsed":26,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}}},"execution_count":86,"outputs":[]},{"cell_type":"markdown","source":["# TextComplexitySpacy"],"metadata":{"id":"iyyf3urz3ljW"}},{"cell_type":"code","source":["from functools import reduce\n","import math\n","import syllables\n","class TextComplexitySpacy():\n"," \n"," def __init__(self, lang='es'):\n"," self.lang = lang\n","\n"," # create language analyzer\n"," if lang == 'es':\n"," self.nlp = es_core_news_sm.load()\n"," if lang == 'en':\n"," self.nlp = spacy.load(\"en_core_web_sm\")\n","\n"," # Para leer el texto que introducimos\n"," f = open(crea_total_path)\n"," lines = f.readlines()\n"," f.close()\n"," crea = {}\n"," for l in lines[1:1000]: # those words not in the 1000 most frequent words in CREA are low frequency words\n"," data = l.strip().split()\n"," crea[data[1]] = float(data[2].replace(',', ''))\n"," self.crea = crea\n"," pass\n","\n"," def textProcessing(self, text):\n"," # Meter todas las funciones en una patron de los tokens válidos\n"," doc = self.nlp(text)\n"," self.tokens = [w for w in doc]\n"," self.sentences = [sent for sent in doc.sents]\n"," self.nsentences = len(self.sentences)\n"," \n"," '''\n"," Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos\n"," '''\n"," pos_content_sentences = []\n"," for sentence in self.sentences:\n"," ws = self.nlp(sentence.text)\n"," pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])\n"," self.pos_content_sentences = pos_content_sentences \n"," \n"," return self.pos_content_sentences\n","\n"," \n"," def punctuationMarks(self):\n"," # Solo nos interesa contar los tokens que sean signo de puntuación.\n"," # Number of words.\n"," punctuation = []\n"," N_words = []\n"," for w in self.tokens:\n"," if re.match('PUNCT.*', w.pos_):\n"," punctuation.append(w.text)\n"," else:\n"," N_words.append(w.text)\n","\n"," aux = len(N_words) \n"," if aux == 0:\n"," aux = 1\n"," self.N_words = aux\n"," \n"," self.npunctuation = len(punctuation)\n"," self.punctuation = punctuation\n"," \n"," return self.npunctuation, self.punctuation, self.N_words\n","\n"," def lexicalComplexity(self):\n"," # Number of low frequency words \n"," count = 0\n"," for sentence in self.pos_content_sentences:\n"," for w in sentence:\n"," if w.text not in self.crea:\n"," count+=1\n"," N_lfw = count\n"," self.N_lfw = N_lfw\n","\n"," # Number of distinct content words \n"," N_dcw = len(set([w.text.lower() for s in self.pos_content_sentences for w in s]))\n"," self.N_dcw =N_dcw\n","\n"," # Number of sentences\n"," self.N_s = len(self.pos_content_sentences)\n","\n"," # Number of total content words\n"," N_cw = reduce((lambda x, y: x + y), [len(s) for s in self.pos_content_sentences])\n"," self.N_cw = N_cw\n","\n"," # Lexical Distribution Index\n"," if self.N_s == 0:\n"," self.N_s = 1\n"," LDI = N_dcw / float(self.N_s)\n"," self.LDI = LDI\n"," \n"," # Index of Low Frequency Words\n"," if N_cw == 0:\n"," N_cw = 1\n"," ILFW = N_lfw / float(N_cw)\n"," self.ILFW =ILFW\n","\n"," # Lexical Complexity\n"," LC = (LDI + ILFW) / 2\n"," self.LC = LC\n"," \n"," return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, 
self.LC\n","\n"," def ssReadability(self): \n"," #Number of rare words\n"," byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)\n"," byfreq = byfreq[:1500]\n"," count = 0\n"," for sentence in self.pos_content_sentences:\n"," for w in sentence:\n"," if w.text.lower() not in byfreq:\n"," count +=1\n"," N_rw = count\n"," self.N_rw = N_rw\n"," \n"," SSR = 1.609*(self.N_words / self.N_s) + 331.8* (self.N_rw /self.N_words) + 22.0 \n"," self.SSR= SSR\n"," \n"," return self.N_rw, self.SSR\n","\n"," def sentenceComplexity(self):\n"," #Number of complex sentences\n"," N_cs = 0\n"," for sentence in self.sentences:\n"," previous_is_verb = False\n"," count = 0\n"," for w in sentence:\n"," if re.match('VERB.*', w.pos_):\n"," if (previous_is_verb):\n"," count += 1\n"," previous_is_verb = False\n"," else:\n"," previous_is_verb = True\n"," else:\n"," previous_is_verb = False\n"," if count>0:\n"," N_cs += 1 \n"," self.N_cs = N_cs\n"," \n"," ASL = self.N_words / self.N_s\n"," self.ASL = ASL\n"," \n"," CS = self.N_cs / self.N_s\n"," self.CS = CS\n"," \n"," SCI = (ASL + CS)/ 2\n"," self.SCI = SCI\n"," \n"," return self.N_cs, self.ASL, self.CS, self.SCI\n","\n"," def autoReadability(self):\n"," # Number of characters\n"," count = 0\n"," listwords = []\n"," for words in self.sentences:\n"," for w in words:\n"," if re.match('\\r\\n.*', w.text):\n"," count +=1\n"," else:\n"," listwords.append(w.text)\n"," \n"," self.listwords = listwords \n"," N_charac = 0\n"," for characters in self.listwords:\n"," N_charac += len(characters)\n"," \n"," self.N_charac = N_charac\n"," \n"," ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words/ self.N_s - 21.43\n"," self.ARI = ARI\n"," \n"," return self.N_charac, self.ARI, self.listwords\n","\n"," \n"," def tree_height(self,root, cont):\n"," if not list(root.children):\n"," return 1\n"," else:\n"," cont+=1\n"," if cont == 320:\n"," return 320\n"," return 1 + max(self.tree_height(x, cont) for x in root.children)\n","\n"," def embeddingDepth(self):\n"," ## Output results\n"," roots = [sent.root for sent in self.sentences]\n"," max_list = []\n"," max_list = [self.tree_height(root,0) for root in roots]\n"," mean_max_list = sum(max_list)/(len(max_list))\n"," max_max_list = max(max_list)\n"," min_max_list = min(max_list)\n"," \n"," self.max_max_list = max_max_list\n"," self.min_max_list = min_max_list\n"," self.mean_max_list = mean_max_list\n"," \n"," return self.max_max_list, self.min_max_list, self.mean_max_list\n","\n"," def syllable_counter_spanish(self,text):\n"," if self.lang == 'es':\n"," t = re.sub(r'y([aáeéiíoóuú])', '\\\\1', text.lower())\n"," t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())\n"," t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()\n"," t = re.sub(r'[aáeéiíoóuúy]', 'A', t)\n"," return(len(t.split('A'))-1)\n","\n"," elif self.lang == 'en':\n"," return syllables.estimate(text)\n","\n"," def readability(self):\n"," # Number of syllables and Number of words with 3 or more syllables:tagger\n"," n_syllables = 0\n"," n_syllables3 = 0\n"," for words in self.listwords:\n"," syllables = self.syllable_counter_spanish(words)\n"," n_syllables += syllables\n"," if syllables>=3:\n"," n_syllables3 += 1\n"," \n"," self.n_syllables = n_syllables\n"," self.n_syllables3 = n_syllables3\n"," \n"," # Number of letters\n"," nletters= 0\n"," letters = []\n"," vecletters =[]\n"," for word in self.listwords:\n"," if re.match('[a-zA-Z]|á|ó|í|ú|é', word):\n"," letters.append(word)\n"," nletters+=len(word)\n"," vecletters.append(len(word))\n"," \n"," self.letters = 
letters\n"," self.nletters = nletters\n"," self.vecletters= vecletters\n"," \n"," huertareadability = 206.835 - 60 * (self.n_syllables / self.N_words) - 102 * (self.nsentences / self.N_words)\n"," self.huertareadability = huertareadability\n"," \n"," ifszreadability = 206.835 - 62.3 * (self.n_syllables / self.N_words) - (self.N_words / self.nsentences) \n"," self.ifszreadability = ifszreadability\n"," \n"," self.syll_words = self.n_syllables / self.N_words\n"," \n"," \n"," polinicompressibility = 95.2 - 9.7 * (self.nletters / self.N_words) - 0.35 * (self.N_words / self.nsentences) \n"," self.polinicompressibility = polinicompressibility\n"," \n"," self.words_sen = self.N_words / self.nsentences\n"," \n"," x=self.nletters / self.N_words\n"," varianza=np.var(self.vecletters)\n"," if varianza == 0:\n"," varianza =1\n"," aux = self.N_words-1\n"," if aux == 0:\n"," aux=1\n"," mureadability = (self.N_words /aux)*(x/varianza)*100\n"," self.mureadability = mureadability\n"," \n"," return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen\n"," \n"," def ageReadability(self):\n"," \n"," minimumage = 0.2495 *(self.N_words/self.nsentences) + 6.4763 * (self.n_syllables /self.N_words) - 7.1395\n"," self.minimumage = minimumage\n"," \n"," solreadability= -2.51+0.74*(3.1291+1.0430*math.sqrt(self.n_syllables3*(30/self.nsentences)))\n"," self.solreadability = solreadability\n"," \n"," return self.minimumage, self.solreadability\n"," \n"," def yearsCrawford(self):\n"," \n"," years = -20.5 *(self.nsentences/self.N_words) + 4.9 * (self.n_syllables /self.N_words) - 3.407\n"," self.years = years\n"," \n"," return self.years"],"metadata":{"id":"fbsCkEC8vBLg","executionInfo":{"status":"ok","timestamp":1644412294091,"user_tz":-60,"elapsed":824,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}}},"execution_count":87,"outputs":[]},{"cell_type":"markdown","source":["# TextAnalisisSpacy"],"metadata":{"id":"GQm029q73tRQ"}},{"cell_type":"code","source":["# Only required for analysis in Spanish\n","import spacy.cli \n","spacy.cli.download(\"es_core_news_sm\")\n","import es_core_news_sm\n","\n","# Imports\n","import spacy\n","import numpy as np\n","from tqdm import tqdm\n","import re\n","import pandas as pd\n","\n","import matplotlib.pyplot as plt\n","%matplotlib inline\n","import seaborn as sb\n","import nltk\n","from nltk.probability import FreqDist\n","from nltk.text import Text\n","from lexical_diversity import lex_div as ld\n","\n","class TextAnalisisSpacy():\n"," \n"," def __init__(self, lang='es'):\n","\n"," # Create language analyzer\n"," if lang == 'es':\n"," self.nlp = es_core_news_sm.load()\n"," self.textComplexitySpacy = TextComplexitySpacy()\n"," self.nlp = es_core_news_sm.load()\n"," elif lang == 'en':\n"," self.nlp = spacy.load(\"en_core_web_sm\")\n"," self.textComplexitySpacy = TextComplexitySpacy('en')\n","\n"," self.Text = Text\n"," self.FreqDist = FreqDist\n"," self.POS_LIST = [\"ADJ\", \"ADP\", \"ADV\", \"AUX\",\"X\", \"CCONJ\",\"CONJ\", \"DET\", \"INTJ\", \"NOUN\", \"NUM\", \"PART\", \"PRON\", \"PROPN\", \"PUNCT\", \"SCONJ\", \"SYM\", \"VERB\", \"SPACE\"]\n"," pass\n","\n"," # \n"," # X = samples input , y = tags\n"," #\n"," def textProcessing(self, X, y):\n"," d = {'category':y, 'text':X}\n"," self.df = pd.DataFrame(d)\n"," # Replace gaps\n"," self.df['text'].replace(np.nan,'', 
inplace=True)\n"," print('Shape: ', self.df.shape)\n","\n"," # Create category dictionary\n"," self.dic_categorias = {}\n"," for i in range(len(df)): \n"," if df.iloc[i,0] in self.dic_categorias:\n"," self.dic_categorias[df.iloc[i,0]] += 1\n"," else:\n"," self.dic_categorias[df.iloc[i,0]] = 1\n","\n"," self.df_category = pd.DataFrame({'category': self.dic_categorias.keys()})\n"," print('Dictionary of categories:', self.dic_categorias)\n","\n"," # Initialising variables for graphs\n"," sb.set(rc={'figure.figsize':(14,6)})\n"," all_values = self.dic_categorias.values()\n"," self.max_value = max(all_values)\n","\n"," def showGraph(self, columnas, type_g='strip',export=False):\n"," # Graph generator\n"," for columna in columnas:\n"," if (type_g == 'strip'):\n"," splot = sb.stripplot(x=columna,y='category', data=self.df)\n"," elif (type_g == 'box'):\n"," splot = sb.boxplot(x=columna,y='category', data=self.df)\n"," elif (type_g == 'heatmap'):\n"," dic={}\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," dic[cat] = df_grupo[columna].tolist()\n"," while len(dic[cat]) < self.max_value:\n"," dic[cat].append(dic[cat][len(dic[cat])-1])\n"," df_n = pd.DataFrame(dic)\n"," splot = sb.heatmap(df_n.transpose()).set_title(columna)\n"," if export == False:\n"," plt.show()\n"," else:\n"," splot.get_figure().savefig(columna+\"-\"+ type_g+\".jpg\", bbox_inches='tight')\n"," plt.clf()\n","\n"," def export(self):\n"," print('Exporting...')\n"," self.df.to_csv(\"data.csv\") \n"," self.df_category.to_csv(\"data_cat.csv\")\n"," self.showGraph(self.df.columns[2:],'strip',True)\n"," self.showGraph(self.df.columns[2:],'box',True)\n"," self.showGraph(self.df.columns[2:],'heatmap',True)\n","\n"," def volumetry(self):\n"," # Volumetrics for each text\n"," self.df['words'] = [len(text.split()) for text in self.df['text'].tolist()] # Number of words\n"," self.df['uniques'] = [len(set(text.split())) for text in self.df['text'].tolist()] # Number of unique words\n"," self.df['chars'] = self.df['text'].str.len() # Number of characters\n"," self.df['avg_words_len'] = round(self.df['chars'] / self.df['words'], 3) # Average word length\n"," self.df = self.df.replace([np.inf, -np.inf, np.nan], 0)\n"," \n"," # Volumetrics for each category\n"," volumetry = ['words','uniques','chars','avg_words_len']\n"," category_columns = ['category','docs']\n"," for col in volumetry:\n"," category_columns.append('avg_'+col)\n"," category_columns.append('std_'+col)\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in volumetry:\n"," self.df_category.loc[i,'docs'] = len(df_grupo)\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3) \n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 5)\n"," i+=1\n"," \n"," print('Volumetrics for each text:')\n"," display(self.df.head())\n"," print('Volumetrics for each category:')\n"," display(self.df_category[category_columns])\n"," \n"," self.showGraph(volumetry,'strip')\n"," self.showGraph(volumetry,'box')\n"," self.showGraph(volumetry,'heatmap')\n","\n"," return self.df, self.df_category\n","\n"," def lemmas(self):\n"," # Number and length of different lemmas per text\n"," dic_lemmas = {}\n"," for cat in self.dic_categorias:\n"," dic_lemmas[cat] = []\n","\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in tqdm(self.dic_categorias):\n"," df_grupo = groups.get_group(cat)\n"," for 
text in df_grupo['text'].tolist():\n"," set_ = set()\n"," suma = 0\n"," doc = self.nlp(text)\n"," for token in doc:\n"," set_.add(token.lemma_)\n"," suma += len(token.lemma_)\n"," if re.match('PUNCT.*|SYM.*|SPACE.*', token.pos_) == None:\n"," dic_lemmas[cat].append(token.lemma_)\n"," self.df.loc[i,'lemmas_uniques'] = len(set_)\n"," if(len(set_) != 0):\n"," self.df.loc[i,'avg_lemmas_len'] = round(suma / len(set_), 3)\n"," else:\n"," self.df.loc[i,'avg_lemmas_len'] = suma\n"," i+=1\n"," self.dic_lemmas = dic_lemmas\n","\n"," # Average and variance of different lemmas and length by category\n"," i = 0\n"," col_lemmas = ['lemmas_uniques','avg_lemmas_len']\n"," category_lemmas = ['category']\n"," for col in col_lemmas:\n"," category_lemmas.append('avg_'+col)\n"," category_lemmas.append('std_'+col)\n"," \n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_lemmas:\n"," self.df_category.loc[i,'docs'] = len(df_grupo)\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3) \n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 3)\n"," i+=1\n"," \n"," print('Lemmas for each text:')\n"," display(self.df.head())\n"," print('Lemmas for each category:')\n"," display(self.df_category[category_lemmas])\n","\n"," self.showGraph(col_lemmas,'strip')\n"," self.showGraph(col_lemmas,'box')\n"," self.showGraph(col_lemmas,'heatmap')\n","\n"," return self.df, self.df_category\n","\n"," def lemmas_freq(self, n = 50):\n"," # Most frequent lemmas by category\n"," dic_f_lemmas = self.dic_categorias.copy()\n"," for cat in self.dic_categorias:\n"," text = self.Text(self.dic_lemmas[cat])\n"," dic_f_lemmas[cat] = self.FreqDist(text).most_common(n)\n"," lista = []\n"," for tupla in dic_f_lemmas[cat]:\n"," lista.append((tupla[0], round(tupla[1] / len(self.dic_lemmas[cat]), 4)))\n"," while (len(lista) < n): # Rellenar huecos\n"," lista.append(np.nan)\n"," dic_f_lemmas[cat] = lista\n","\n"," df_freq_lemas = pd.DataFrame(dic_f_lemmas)\n"," df_freq_lemas_tr = df_freq_lemas.transpose()\n"," print('Most frequent lemmas by category')\n"," display(df_freq_lemas_tr)\n"," df_freq_lemas_tr.to_csv(\"lemas_freq.csv\") \n"," return df_freq_lemas.transpose()\n","\n"," def pos(self):\n"," # POS analysis for each text\n"," dic_pos_cat = {}\n"," for pos in self.POS_LIST:\n"," dic_pos_cat[pos] = {}\n"," for cat in self.dic_categorias:\n"," dic_pos_cat[pos][cat] = []\n","\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for text in tqdm(df_grupo['text'].tolist()):\n"," dic_pos = {}\n"," doc = self.nlp(text)\n"," for token in doc:\n"," if token.pos_ in dic_pos:\n"," dic_pos[token.pos_] += 1\n"," else:\n"," dic_pos[token.pos_] = 1\n"," dic_pos_cat[token.pos_][cat].append(token.text)\n"," total = len(doc)\n"," if total == 0:\n"," total = 1\n"," for pos in self.POS_LIST:\n"," if pos in dic_pos:\n"," self.df.loc[i,pos] = round(dic_pos[pos]/total,4)\n"," else:\n"," self.df.loc[i,pos] = np.nan\n"," i+=1\n"," self.dic_pos_cat = dic_pos_cat\n","\n"," # POS analysis for each category\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for pos in self.POS_LIST:\n"," if pos in df_grupo.columns.values:\n"," self.df_category.loc[i,'avg_'+pos] = round(df_grupo[pos].mean(), 3)\n"," self.df_category.loc[i,'std_'+pos] = round(df_grupo[pos].std(), 3)\n"," 
i+=1\n","\n"," print('POS analysis for each text')\n"," display(self.df.head())\n"," print('POS analysis for each category')\n"," display(self.df_category)\n"," self.showGraph(self.POS_LIST,'strip')\n"," self.showGraph(self.POS_LIST,'box')\n"," self.showGraph(self.POS_LIST,'heatmap')\n"," return self.df, self.df_category\n","\n"," def pos_freq(self, n = 15):\n"," # Most frequent words \n"," dic_f_palabras = self.dic_categorias.copy()\n"," for pos in self.POS_LIST:\n"," for cat in self.dic_categorias:\n"," if cat in self.dic_pos_cat[pos]:\n"," text = self.Text(self.dic_pos_cat[pos][cat])\n"," fdist = self.FreqDist(text)\n"," dic_f_palabras[cat] = fdist.most_common(n)\n"," lista = []\n"," for tupla in dic_f_palabras[cat]:\n"," lista.append((tupla[0],round(tupla[1] / len(self.dic_pos_cat[pos][cat]), 5))) \n"," dic_f_palabras[cat] = lista\n","\n"," while (len(dic_f_palabras[cat]) < n): # Rellenar huecos\n"," dic_f_palabras[cat].append(np.nan)\n","\n"," df_freq_palabras = pd.DataFrame(dic_f_palabras)\n"," print(\"---- Para \" + spacy.explain(pos) +\" las \"+ str(n)+\" palabras más frecuentes son: -------\")\n"," display(df_freq_palabras.transpose())\n"," df_freq_palabras_tr = df_freq_palabras.transpose()\n"," df_freq_palabras_tr.to_csv(\"POS_\"+ str(pos)+\"_freq.csv\") \n"," return df_freq_palabras.transpose()\n","\n"," def lexical_diversity(self):\n"," # Lexical diversity for each text\n"," i = 0\n"," for text in tqdm(self.df['text'].tolist()):\n"," flt = ld.flemmatize(text)\n"," self.df.loc[i,'simple_TTR'] = round(ld.ttr(flt), 4)\n"," self.df.loc[i,'root_TTR'] = round(ld.root_ttr(flt), 4)\n"," self.df.loc[i,'log_TTR'] = round(ld.log_ttr(flt), 4)\n"," self.df.loc[i,'maas_TTR'] = round(ld.maas_ttr(flt), 4)\n"," self.df.loc[i,'MSTTR'] = round(ld.msttr(flt), 4)\n"," self.df.loc[i,'MATTR'] = round(ld.mattr(flt), 4)\n"," self.df.loc[i,'HDD'] = round(ld.hdd(flt), 4)\n"," self.df.loc[i,'MTLD'] = round(ld.mtld(flt), 4)\n"," i+=1\n","\n"," # Lexical diversity for each category\n"," i = 0\n"," col_diversity = ['simple_TTR','root_TTR','log_TTR','maas_TTR','MSTTR','MATTR','HDD','MTLD']\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_diversity:\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(),4)\n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(),4)\n"," i+=1\n"," print('Lexical diversity for each text')\n"," display(self.df.head())\n"," print('Lexical diversity for each category')\n"," display(self.df_category)\n"," self.showGraph(col_diversity,'strip')\n"," self.showGraph(col_diversity,'box')\n"," self.showGraph(col_diversity,'heatmap')\n"," return self.df, self.df_category\n","\n"," def complexity(self):\n"," # Complexity diversity for each category\n"," i = 0\n"," for text in tqdm(self.df['text'].tolist()):\n"," if len(text) > 0:\n"," text_processed = self.textComplexitySpacy.textProcessing(text)\n"," pmarks = self.textComplexitySpacy.punctuationMarks()[0]\n"," self.df.loc[i,'lexcomplexity'] = self.textComplexitySpacy.lexicalComplexity()[6]\n"," self.df.loc[i,'ssreadability'] = self.textComplexitySpacy.ssReadability()[1]\n"," self.df.loc[i,'sencomplexity'] = self.textComplexitySpacy.sentenceComplexity()[3]\n"," self.df.loc[i,'autoreadability'] = self.textComplexitySpacy.autoReadability()[1]\n"," embeddingdepth = self.textComplexitySpacy.embeddingDepth()\n"," self.df.loc[i,'max_embeddingdepth'] = embeddingdepth[0]\n"," self.df.loc[i,'min_embeddingdepth'] = 
embeddingdepth[1]\n"," self.df.loc[i,'avg_embeddingdepth'] = embeddingdepth[2]\n"," readability = self.textComplexitySpacy.readability()\n"," self.df.loc[i,'huertareadability'] = round(readability[3],4)\n"," self.df.loc[i,'ifszreadability'] = round(readability[4],4)\n"," self.df.loc[i,'polinicompressibility'] = round(readability[5],4)\n"," self.df.loc[i,'mureadability'] = round(readability[6],4)\n"," self.df.loc[i,'agereadability'] = self.textComplexitySpacy.ageReadability()[0]\n"," self.df.loc[i,'yearscrawford'] = self.textComplexitySpacy.yearsCrawford()\n"," i+=1\n","\n"," # Complexity diversity for each category\n"," i = 0\n"," col_complexity = ['lexcomplexity','ssreadability','sencomplexity','autoreadability','max_embeddingdepth',\n"," 'min_embeddingdepth','avg_embeddingdepth','huertareadability','ifszreadability',\n"," 'polinicompressibility','mureadability','agereadability','yearscrawford']\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_complexity:\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 4)\n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 4)\n"," i+=1\n"," \n"," print('Complexity diversity for each text')\n"," display(self.df.head())\n"," print('Complexity diversity for each category')\n"," display(self.df_category)\n"," self.showGraph(col_complexity,'strip')\n"," self.showGraph(col_complexity,'box')\n"," self.showGraph(col_complexity,'heatmap')\n"," return self.df, self.df_category\n","\n"," def featureSelection(self):\n"," df = self.df.fillna(0)\n"," X = df.iloc[:,2:]\n"," y = df.iloc[:,0]\n","\n"," from sklearn.feature_selection import VarianceThreshold, SelectFromModel\n"," # Removing features with low variance\n"," sel = VarianceThreshold(threshold=(.8 * (1 - .8))) # No varía en más del 80% de datos\n"," arr = sel.fit_transform(X)\n"," print('Removing features with low variance...')\n"," print('Selected columns:',sel.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(arr))\n"," pd.DataFrame(arr).to_csv(\"VarianceThreshold.csv\") \n","\n"," # SelectFromModel\n"," # Selection of functions based on L1\n"," from sklearn.svm import LinearSVC\n","\n"," lsvc = LinearSVC(C=0.01, penalty=\"l1\", dual=False).fit(X, y)\n"," model = SelectFromModel(lsvc, prefit=True)\n"," X_new = model.transform(X)\n"," print('Removing features with SelectFromModel...')\n"," print('Selected columns:',model.get_feature_names_out(df.columns.values[2:]))\n"," display(pd.DataFrame(X_new))\n"," pd.DataFrame(X_new).to_csv(\"SelectFromModel.csv\") \n"," \n","\n"," def kBest(self, k = 10):\n"," df = self.df.fillna(0)\n"," X = df.iloc[:,2:]\n"," y = df.iloc[:,0]\n"," # Univariate feature selection\n"," from sklearn.feature_selection import SelectKBest\n"," from sklearn.feature_selection import f_classif, mutual_info_classif\n"," print('Highest scoring '+ str(k) +' features with f_classif...')\n"," kbest_classif = SelectKBest(f_classif, k=k) # Elimina todo menos las k características de puntuación más alta\n"," X_classif = kbest_classif.fit_transform(X, y)\n"," print('Selected columns:',kbest_classif.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(X_classif))\n"," pd.DataFrame(X_classif).to_csv(\"f_classif.csv\") \n","\n"," print('Highest scoring '+ str(k) +' features with mutual_info_classif...')\n"," kbest_mut = SelectKBest(mutual_info_classif, k=k)\n"," X_mut = kbest_mut.fit_transform(X, y)\n"," print('Selected 
columns:', kbest_mut.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(X_mut))\n"," pd.DataFrame(X_mut).to_csv(\"mutual_info_classif.csv\") \n"," "],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9Eo_cwABcmVf","executionInfo":{"status":"ok","timestamp":1644412305356,"user_tz":-60,"elapsed":11269,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}},"outputId":"b07b5c12-1bb0-4983-bfa5-106bdc142451"},"execution_count":88,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n","You can now load the model via spacy.load('es_core_news_sm')\n"]}]},{"cell_type":"markdown","source":["# Analisis"],"metadata":{"id":"WLTN5U8b32S6"}},{"cell_type":"code","source":["textAnalisisSpacy = TextAnalisisSpacy('es')"],"metadata":{"id":"ViskMg97pXwg","executionInfo":{"status":"ok","timestamp":1644412316691,"user_tz":-60,"elapsed":11349,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}}},"execution_count":89,"outputs":[]},{"cell_type":"markdown","source":["# Carga datos"],"metadata":{"id":"nrNo-f5p35ei"}},{"cell_type":"code","source":["df = pd.read_csv(\"./texts.csv\")\n","df.head()"],"metadata":{"id":"lVjjWxvEn5LP","colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"status":"ok","timestamp":1644412620771,"user_tz":-60,"elapsed":244,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}},"outputId":"e760b59b-ecf6-4308-ec13-8bff00a4f99c"},"execution_count":96,"outputs":[{"output_type":"execute_result","data":{"text/html":["\n"," <div id=\"df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>category</th>\n"," <th>author</th>\n"," <th>title</th>\n"," <th>text</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>Nereida</td>\n"," <td>TOC Y COMPROBACIONES</td>\n"," <td>Veo que en este foro, afortunadamente para vos...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>desesperacion15</td>\n"," <td>No se que me sucede.</td>\n"," <td>Hola a todos,Les escribo porque hace un tiempo...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>Holli</td>\n"," <td>No puedo con la Ansiedad</td>\n"," <td>Hola a todos, tengo 24 años y hace como 2 años...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Relaciones Padres e Hijos</td>\n"," <td>watermelon</td>\n"," <td>Les digo o no les digo que sufrí abusos de niñ...</td>\n"," <td>Hola, Soy nueva en esto del foro, de hecho ent...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Relaciones Padres e Hijos</td>\n"," <td>mr_rol</td>\n"," <td>Mi esposa trata mal a nuestro hijo de ocho a...</td>\n"," <td>Buenas noches, Tengo 34 años y mi esposa 25 t...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button 
class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "],"text/plain":[" category ... text\n","0 Sección de Ansiedad y Fobias ... Veo que en este foro, afortunadamente para vos...\n","1 Sección de Ansiedad y Fobias ... Hola a todos,Les escribo porque hace un tiempo...\n","2 Sección de Ansiedad y Fobias ... Hola a todos, tengo 24 años y hace como 2 años...\n","3 Relaciones Padres e Hijos ... Hola, Soy nueva en esto del foro, de hecho ent...\n","4 Relaciones Padres e Hijos ... 
Buenas noches, Tengo 34 años y mi esposa 25 t...\n","\n","[5 rows x 4 columns]"]},"metadata":{},"execution_count":96}]},{"cell_type":"code","source":["textAnalisisSpacy.textProcessing(df['text'].tolist(),df['category'].tolist())\n","textAnalisisSpacy.volumetry()\n","textAnalisisSpacy.lemmas()\n","textAnalisisSpacy.lemmas_freq()\n","textAnalisisSpacy.pos()\n","textAnalisisSpacy.pos_freq()\n","textAnalisisSpacy.lexical_diversity()\n","textAnalisisSpacy.complexity()\n","textAnalisisSpacy.featureSelection()\n","textAnalisisSpacy.kBest()\n","textAnalisisSpacy.export()"],"metadata":{"id":"4LkSJ0COcetm","colab":{"base_uri":"https://localhost:8080/","height":1000,"output_embedded_package_id":"1vpzTTG4oQWTZufKEDJkd-zoAgVs4_JJu"},"executionInfo":{"status":"ok","timestamp":1644412718344,"user_tz":-60,"elapsed":95780,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"}},"outputId":"c871b294-a937-4da4-a4c3-fde5b9f4215a"},"execution_count":97,"outputs":[{"output_type":"display_data","data":{"text/plain":"Output hidden; open in https://colab.research.google.com to view."},"metadata":{}}]}],"metadata":{"colab":{"collapsed_sections":[],"name":"Biblioteca Análisis-PLN.ipynb","provenance":[],"toc_visible":true,"mount_file_id":"1WdsL5lkOudV-Xpbrmg6f92v3ujY8QHAf","authorship_tag":"ABX9TyMv+hpn4P7K1p0PdKq3+UyM"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
\ No newline at end of file
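The diff above is the full notebook JSON, which is hard to read in place. As a small, self-contained illustration of the core readability computation in TextComplexitySpacy.readability(), the sketch below re-implements the notebook's Spanish syllable counter and the Fernández-Huerta index on a toy sentence. This is a minimal sketch only: the helper names count_syllables_es and huerta_index are illustrative rather than part of the notebook's API, and the regex-based word and sentence splitting is a crude stand-in for the spaCy token counts the notebook actually uses.

import re

def count_syllables_es(word):
    # Rough Spanish syllable estimate, mirroring the regex approach in
    # TextComplexitySpacy.syllable_counter_spanish: merge diphthongs into a
    # single marker, then count the remaining vowel nuclei.
    t = re.sub(r'y([aáeéiíoóuú])', r'\1', word.lower())
    t = re.sub(r'[aáeéioóu][iuy]', 'A', t)
    t = re.sub(r'[iu][aáeyéioóu]', 'A', t)
    t = re.sub(r'[aáeéiíoóuúy]', 'A', t)
    return len(t.split('A')) - 1

def huerta_index(text):
    # Fernández-Huerta readability, as computed in readability():
    #   206.835 - 60 * (syllables / words) - 102 * (sentences / words)
    # Sentence and word splitting here is a simple regex approximation of
    # the spaCy-based counts used in the notebook.
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    words = re.findall(r'\w+', text)
    syllables = sum(count_syllables_es(w) for w in words)
    return 206.835 - 60 * (syllables / len(words)) - 102 * (len(sentences) / len(words))

print(round(huerta_index(
    "Hola a todos. Les escribo porque hace un tiempo no me encuentro bien."), 2))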
{"cells":[{"cell_type":"markdown","metadata":{"id":"tgTuDdZIH7og"},"source":["# Librerías"]},{"cell_type":"code","execution_count":85,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":9817,"status":"ok","timestamp":1644412293290,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"ydZHklMLF2g9","outputId":"36f2c877-04f4-4e87-886d-3608f0757c0e"},"outputs":[{"name":"stdout","output_type":"stream","text":["Requirement already satisfied: lexical-diversity in /usr/local/lib/python3.7/dist-packages (0.1.1)\n","Requirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4)\n","Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0)\n","Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.62.3)\n","Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1)\n","Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.0)\n","Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5)\n","Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0)\n","Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0)\n","Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3)\n","Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.19.5)\n","Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0)\n","Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6)\n","Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6)\n","Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.6)\n","Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.10.1)\n","Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.10.0.2)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.7.0)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n","Requirement already satisfied: syllables in /usr/local/lib/python3.7/dist-packages (1.0.3)\n"]}],"source":["!pip install lexical-diversity # Lexical Diversity\n","!pip install spacy # Natural 
Language Processing\n","!pip install syllables # Syllable counter for english"]},{"cell_type":"code","execution_count":86,"metadata":{"executionInfo":{"elapsed":26,"status":"ok","timestamp":1644412293291,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"F8_bf3LFYobW"},"outputs":[],"source":["crea_total_path = './CREA_total.txt'"]},{"cell_type":"markdown","metadata":{"id":"iyyf3urz3ljW"},"source":["# TextComplexitySpacy"]},{"cell_type":"code","execution_count":87,"metadata":{"executionInfo":{"elapsed":824,"status":"ok","timestamp":1644412294091,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"fbsCkEC8vBLg"},"outputs":[],"source":["from functools import reduce\n","import math\n","import syllables\n","class TextComplexitySpacy():\n"," \n"," def __init__(self, lang='es'):\n"," self.lang = lang\n","\n"," # create language analyzer\n"," if lang == 'es':\n"," self.nlp = es_core_news_sm.load()\n"," if lang == 'en':\n"," self.nlp = spacy.load(\"en_core_web_sm\")\n","\n"," # Para leer el texto que introducimos\n"," f = open(crea_total_path)\n"," lines = f.readlines()\n"," f.close()\n"," crea = {}\n"," for l in lines[1:1000]: # those words not in the 1000 most frequent words in CREA are low frequency words\n"," data = l.strip().split()\n"," crea[data[1]] = float(data[2].replace(',', ''))\n"," self.crea = crea\n"," pass\n","\n"," def textProcessing(self, text):\n"," # Meter todas las funciones en una patron de los tokens válidos\n"," doc = self.nlp(text)\n"," self.tokens = [w for w in doc]\n"," self.sentences = [sent for sent in doc.sents]\n"," self.nsentences = len(self.sentences)\n"," \n"," '''\n"," Filtra aquellos tokens que no sean adjetivos, verbos o sustantivos\n"," '''\n"," pos_content_sentences = []\n"," for sentence in self.sentences:\n"," ws = self.nlp(sentence.text)\n"," pos_content_sentences.append([w for w in ws if re.match('NOUN.*|VERB.*|ADJ.*', w.pos_)])\n"," self.pos_content_sentences = pos_content_sentences \n"," \n"," return self.pos_content_sentences\n","\n"," \n"," def punctuationMarks(self):\n"," # Solo nos interesa contar los tokens que sean signo de puntuación.\n"," # Number of words.\n"," punctuation = []\n"," N_words = []\n"," for w in self.tokens:\n"," if re.match('PUNCT.*', w.pos_):\n"," punctuation.append(w.text)\n"," else:\n"," N_words.append(w.text)\n","\n"," aux = len(N_words) \n"," if aux == 0:\n"," aux = 1\n"," self.N_words = aux\n"," \n"," self.npunctuation = len(punctuation)\n"," self.punctuation = punctuation\n"," \n"," return self.npunctuation, self.punctuation, self.N_words\n","\n"," def lexicalComplexity(self):\n"," # Number of low frequency words \n"," count = 0\n"," for sentence in self.pos_content_sentences:\n"," for w in sentence:\n"," if w.text not in self.crea:\n"," count+=1\n"," N_lfw = count\n"," self.N_lfw = N_lfw\n","\n"," # Number of distinct content words \n"," N_dcw = len(set([w.text.lower() for s in self.pos_content_sentences for w in s]))\n"," self.N_dcw =N_dcw\n","\n"," # Number of sentences\n"," self.N_s = len(self.pos_content_sentences)\n","\n"," # Number of total content words\n"," N_cw = reduce((lambda x, y: x + y), [len(s) for s in self.pos_content_sentences])\n"," self.N_cw = N_cw\n","\n"," # Lexical Distribution Index\n"," if self.N_s == 0:\n"," self.N_s = 1\n"," LDI = N_dcw / float(self.N_s)\n"," self.LDI 
= LDI\n"," \n"," # Index of Low Frequency Words\n"," if N_cw == 0:\n"," N_cw = 1\n"," ILFW = N_lfw / float(N_cw)\n"," self.ILFW =ILFW\n","\n"," # Lexical Complexity\n"," LC = (LDI + ILFW) / 2\n"," self.LC = LC\n"," \n"," return self.N_lfw, self.N_cw, self.N_dcw, self.N_s, self.LDI, self.ILFW, self.LC\n","\n"," def ssReadability(self): \n"," #Number of rare words\n"," byfreq = sorted(self.crea, key=self.crea.__getitem__, reverse=True)\n"," byfreq = byfreq[:1500]\n"," count = 0\n"," for sentence in self.pos_content_sentences:\n"," for w in sentence:\n"," if w.text.lower() not in byfreq:\n"," count +=1\n"," N_rw = count\n"," self.N_rw = N_rw\n"," \n"," SSR = 1.609*(self.N_words / self.N_s) + 331.8* (self.N_rw /self.N_words) + 22.0 \n"," self.SSR= SSR\n"," \n"," return self.N_rw, self.SSR\n","\n"," def sentenceComplexity(self):\n"," #Number of complex sentences\n"," N_cs = 0\n"," for sentence in self.sentences:\n"," previous_is_verb = False\n"," count = 0\n"," for w in sentence:\n"," if re.match('VERB.*', w.pos_):\n"," if (previous_is_verb):\n"," count += 1\n"," previous_is_verb = False\n"," else:\n"," previous_is_verb = True\n"," else:\n"," previous_is_verb = False\n"," if count>0:\n"," N_cs += 1 \n"," self.N_cs = N_cs\n"," \n"," ASL = self.N_words / self.N_s\n"," self.ASL = ASL\n"," \n"," CS = self.N_cs / self.N_s\n"," self.CS = CS\n"," \n"," SCI = (ASL + CS)/ 2\n"," self.SCI = SCI\n"," \n"," return self.N_cs, self.ASL, self.CS, self.SCI\n","\n"," def autoReadability(self):\n"," # Number of characters\n"," count = 0\n"," listwords = []\n"," for words in self.sentences:\n"," for w in words:\n"," if re.match('\\r\\n.*', w.text):\n"," count +=1\n"," else:\n"," listwords.append(w.text)\n"," \n"," self.listwords = listwords \n"," N_charac = 0\n"," for characters in self.listwords:\n"," N_charac += len(characters)\n"," \n"," self.N_charac = N_charac\n"," \n"," ARI = 4.71 * self.N_charac / self.N_words + 0.5 * self.N_words/ self.N_s - 21.43\n"," self.ARI = ARI\n"," \n"," return self.N_charac, self.ARI, self.listwords\n","\n"," \n"," def tree_height(self,root, cont):\n"," if not list(root.children):\n"," return 1\n"," else:\n"," cont+=1\n"," if cont == 320:\n"," return 320\n"," return 1 + max(self.tree_height(x, cont) for x in root.children)\n","\n"," def embeddingDepth(self):\n"," ## Output results\n"," roots = [sent.root for sent in self.sentences]\n"," max_list = []\n"," max_list = [self.tree_height(root,0) for root in roots]\n"," mean_max_list = sum(max_list)/(len(max_list))\n"," max_max_list = max(max_list)\n"," min_max_list = min(max_list)\n"," \n"," self.max_max_list = max_max_list\n"," self.min_max_list = min_max_list\n"," self.mean_max_list = mean_max_list\n"," \n"," return self.max_max_list, self.min_max_list, self.mean_max_list\n","\n"," def syllable_counter_spanish(self,text):\n"," if self.lang == 'es':\n"," t = re.sub(r'y([aáeéiíoóuú])', '\\\\1', text.lower())\n"," t = re.sub(r'[aáeéioóu][iuy]', 'A', t.lower())\n"," t = re.sub(r'[iu][aáeyéioóu]', 'A', t).lower()\n"," t = re.sub(r'[aáeéiíoóuúy]', 'A', t)\n"," return(len(t.split('A'))-1)\n","\n"," elif self.lang == 'en':\n"," return syllables.estimate(text)\n","\n"," def readability(self):\n"," # Number of syllables and Number of words with 3 or more syllables:tagger\n"," n_syllables = 0\n"," n_syllables3 = 0\n"," for words in self.listwords:\n"," syllables = self.syllable_counter_spanish(words)\n"," n_syllables += syllables\n"," if syllables>=3:\n"," n_syllables3 += 1\n"," \n"," self.n_syllables = n_syllables\n"," self.n_syllables3 = 
n_syllables3\n"," \n"," # Number of letters\n"," nletters= 0\n"," letters = []\n"," vecletters =[]\n"," for word in self.listwords:\n"," if re.match('[a-zA-Z]|á|ó|í|ú|é', word):\n"," letters.append(word)\n"," nletters+=len(word)\n"," vecletters.append(len(word))\n"," \n"," self.letters = letters\n"," self.nletters = nletters\n"," self.vecletters= vecletters\n"," \n"," huertareadability = 206.835 - 60 * (self.n_syllables / self.N_words) - 102 * (self.nsentences / self.N_words)\n"," self.huertareadability = huertareadability\n"," \n"," ifszreadability = 206.835 - 62.3 * (self.n_syllables / self.N_words) - (self.N_words / self.nsentences) \n"," self.ifszreadability = ifszreadability\n"," \n"," self.syll_words = self.n_syllables / self.N_words\n"," \n"," \n"," polinicompressibility = 95.2 - 9.7 * (self.nletters / self.N_words) - 0.35 * (self.N_words / self.nsentences) \n"," self.polinicompressibility = polinicompressibility\n"," \n"," self.words_sen = self.N_words / self.nsentences\n"," \n"," x=self.nletters / self.N_words\n"," varianza=np.var(self.vecletters)\n"," if varianza == 0:\n"," varianza =1\n"," aux = self.N_words-1\n"," if aux == 0:\n"," aux=1\n"," mureadability = (self.N_words /aux)*(x/varianza)*100\n"," self.mureadability = mureadability\n"," \n"," return self.n_syllables, self.n_syllables3, self.nletters, self.huertareadability, self.ifszreadability, self.polinicompressibility, self.mureadability, self.syll_words, self.words_sen\n"," \n"," def ageReadability(self):\n"," \n"," minimumage = 0.2495 *(self.N_words/self.nsentences) + 6.4763 * (self.n_syllables /self.N_words) - 7.1395\n"," self.minimumage = minimumage\n"," \n"," solreadability= -2.51+0.74*(3.1291+1.0430*math.sqrt(self.n_syllables3*(30/self.nsentences)))\n"," self.solreadability = solreadability\n"," \n"," return self.minimumage, self.solreadability\n"," \n"," def yearsCrawford(self):\n"," \n"," years = -20.5 *(self.nsentences/self.N_words) + 4.9 * (self.n_syllables /self.N_words) - 3.407\n"," self.years = years\n"," \n"," return self.years"]},{"cell_type":"markdown","metadata":{"id":"GQm029q73tRQ"},"source":["# TextAnalysisSpacy"]},{"cell_type":"code","execution_count":88,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":11269,"status":"ok","timestamp":1644412305356,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"9Eo_cwABcmVf","outputId":"b07b5c12-1bb0-4983-bfa5-106bdc142451"},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n","You can now load the model via spacy.load('es_core_news_sm')\n"]}],"source":["# Only required for analysis in Spanish\n","import spacy.cli \n","spacy.cli.download(\"es_core_news_sm\")\n","import es_core_news_sm\n","\n","# Imports\n","import spacy\n","import numpy as np\n","from tqdm import tqdm\n","import re\n","import pandas as pd\n","\n","import matplotlib.pyplot as plt\n","%matplotlib inline\n","import seaborn as sb\n","import nltk\n","from nltk.probability import FreqDist\n","from nltk.text import Text\n","from lexical_diversity import lex_div as ld\n","\n","class TextAnalysisSpacy():\n"," \n"," def __init__(self, lang='es'):\n","\n"," # Create language analyzer\n"," if lang == 'es':\n"," self.nlp = es_core_news_sm.load()\n"," self.textComplexitySpacy = TextComplexitySpacy()\n"," self.nlp = es_core_news_sm.load()\n"," elif lang == 'en':\n"," self.nlp = 
spacy.load(\"en_core_web_sm\")\n"," self.textComplexitySpacy = TextComplexitySpacy('en')\n","\n"," self.Text = Text\n"," self.FreqDist = FreqDist\n"," self.POS_LIST = [\"ADJ\", \"ADP\", \"ADV\", \"AUX\",\"X\", \"CCONJ\",\"CONJ\", \"DET\", \"INTJ\", \"NOUN\", \"NUM\", \"PART\", \"PRON\", \"PROPN\", \"PUNCT\", \"SCONJ\", \"SYM\", \"VERB\", \"SPACE\"]\n"," pass\n","\n"," # \n"," # X = samples input , y = tags\n"," #\n"," def textProcessing(self, X, y):\n"," d = {'category':y, 'text':X}\n"," self.df = pd.DataFrame(d)\n"," # Replace gaps\n"," self.df['text'].replace(np.nan,'', inplace=True)\n"," print('Shape: ', self.df.shape)\n","\n"," # Create category dictionary\n"," self.dic_categorias = {}\n"," for i in range(len(df)): \n"," if df.iloc[i,0] in self.dic_categorias:\n"," self.dic_categorias[df.iloc[i,0]] += 1\n"," else:\n"," self.dic_categorias[df.iloc[i,0]] = 1\n","\n"," self.df_category = pd.DataFrame({'category': self.dic_categorias.keys()})\n"," print('Dictionary of categories:', self.dic_categorias)\n","\n"," # Initialising variables for graphs\n"," sb.set(rc={'figure.figsize':(14,6)})\n"," all_values = self.dic_categorias.values()\n"," self.max_value = max(all_values)\n","\n"," def showGraph(self, columnas, type_g='strip',export=False):\n"," # Graph generator\n"," for columna in columnas:\n"," if (type_g == 'strip'):\n"," splot = sb.stripplot(x=columna,y='category', data=self.df)\n"," elif (type_g == 'box'):\n"," splot = sb.boxplot(x=columna,y='category', data=self.df)\n"," elif (type_g == 'heatmap'):\n"," dic={}\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," dic[cat] = df_grupo[columna].tolist()\n"," while len(dic[cat]) < self.max_value:\n"," dic[cat].append(dic[cat][len(dic[cat])-1])\n"," df_n = pd.DataFrame(dic)\n"," splot = sb.heatmap(df_n.transpose()).set_title(columna)\n"," if export == False:\n"," plt.show()\n"," else:\n"," splot.get_figure().savefig(columna+\"-\"+ type_g+\".jpg\", bbox_inches='tight')\n"," plt.clf()\n","\n"," def export(self):\n"," print('Exporting...')\n"," self.df.to_csv(\"data.csv\") \n"," self.df_category.to_csv(\"data_cat.csv\")\n"," self.showGraph(self.df.columns[2:],'strip',True)\n"," self.showGraph(self.df.columns[2:],'box',True)\n"," self.showGraph(self.df.columns[2:],'heatmap',True)\n","\n"," def volumetry(self):\n"," # Volumetrics for each text\n"," self.df['words'] = [len(text.split()) for text in self.df['text'].tolist()] # Number of words\n"," self.df['uniques'] = [len(set(text.split())) for text in self.df['text'].tolist()] # Number of unique words\n"," self.df['chars'] = self.df['text'].str.len() # Number of characters\n"," self.df['avg_words_len'] = round(self.df['chars'] / self.df['words'], 3) # Average word length\n"," self.df = self.df.replace([np.inf, -np.inf, np.nan], 0)\n"," \n"," # Volumetrics for each category\n"," volumetry = ['words','uniques','chars','avg_words_len']\n"," category_columns = ['category','docs']\n"," for col in volumetry:\n"," category_columns.append('avg_'+col)\n"," category_columns.append('std_'+col)\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in volumetry:\n"," self.df_category.loc[i,'docs'] = len(df_grupo)\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3) \n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 5)\n"," i+=1\n"," \n"," print('Volumetrics for each text:')\n"," 
display(self.df.head())\n"," print('Volumetrics for each category:')\n"," display(self.df_category[category_columns])\n"," \n"," self.showGraph(volumetry,'strip')\n"," self.showGraph(volumetry,'box')\n"," self.showGraph(volumetry,'heatmap')\n","\n"," return self.df, self.df_category\n","\n"," def lemmas(self):\n"," # Number and length of different lemmas per text\n"," dic_lemmas = {}\n"," for cat in self.dic_categorias:\n"," dic_lemmas[cat] = []\n","\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in tqdm(self.dic_categorias):\n"," df_grupo = groups.get_group(cat)\n"," for text in df_grupo['text'].tolist():\n"," set_ = set()\n"," suma = 0\n"," doc = self.nlp(text)\n"," for token in doc:\n"," set_.add(token.lemma_)\n"," suma += len(token.lemma_)\n"," if re.match('PUNCT.*|SYM.*|SPACE.*', token.pos_) == None:\n"," dic_lemmas[cat].append(token.lemma_)\n"," self.df.loc[i,'lemmas_uniques'] = len(set_)\n"," if(len(set_) != 0):\n"," self.df.loc[i,'avg_lemmas_len'] = round(suma / len(set_), 3)\n"," else:\n"," self.df.loc[i,'avg_lemmas_len'] = suma\n"," i+=1\n"," self.dic_lemmas = dic_lemmas\n","\n"," # Average and variance of different lemmas and length by category\n"," i = 0\n"," col_lemmas = ['lemmas_uniques','avg_lemmas_len']\n"," category_lemmas = ['category']\n"," for col in col_lemmas:\n"," category_lemmas.append('avg_'+col)\n"," category_lemmas.append('std_'+col)\n"," \n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for col in col_lemmas:\n"," self.df_category.loc[i,'docs'] = len(df_grupo)\n"," self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 3) \n"," self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 3)\n"," i+=1\n"," \n"," print('Lemmas for each text:')\n"," display(self.df.head())\n"," print('Lemmas for each category:')\n"," display(self.df_category[category_lemmas])\n","\n"," self.showGraph(col_lemmas,'strip')\n"," self.showGraph(col_lemmas,'box')\n"," self.showGraph(col_lemmas,'heatmap')\n","\n"," return self.df, self.df_category\n","\n"," def lemmas_freq(self, n = 50):\n"," # Most frequent lemmas by category\n"," dic_f_lemmas = self.dic_categorias.copy()\n"," for cat in self.dic_categorias:\n"," text = self.Text(self.dic_lemmas[cat])\n"," dic_f_lemmas[cat] = self.FreqDist(text).most_common(n)\n"," lista = []\n"," for tupla in dic_f_lemmas[cat]:\n"," lista.append((tupla[0], round(tupla[1] / len(self.dic_lemmas[cat]), 4)))\n"," while (len(lista) < n): # Rellenar huecos\n"," lista.append(np.nan)\n"," dic_f_lemmas[cat] = lista\n","\n"," df_freq_lemas = pd.DataFrame(dic_f_lemmas)\n"," df_freq_lemas_tr = df_freq_lemas.transpose()\n"," print('Most frequent lemmas by category')\n"," display(df_freq_lemas_tr)\n"," df_freq_lemas_tr.to_csv(\"lemas_freq.csv\") \n"," return df_freq_lemas.transpose()\n","\n"," def pos(self):\n"," # POS analysis for each text\n"," dic_pos_cat = {}\n"," for pos in self.POS_LIST:\n"," dic_pos_cat[pos] = {}\n"," for cat in self.dic_categorias:\n"," dic_pos_cat[pos][cat] = []\n","\n"," i = 0\n"," groups = self.df.groupby(self.df.category)\n"," for cat in self.dic_categorias:\n"," df_grupo = groups.get_group(cat)\n"," for text in tqdm(df_grupo['text'].tolist()):\n"," dic_pos = {}\n"," doc = self.nlp(text)\n"," for token in doc:\n"," if token.pos_ in dic_pos:\n"," dic_pos[token.pos_] += 1\n"," else:\n"," dic_pos[token.pos_] = 1\n"," dic_pos_cat[token.pos_][cat].append(token.text)\n"," total = len(doc)\n"," if total == 0:\n"," total = 1\n"," 
for pos in self.POS_LIST:\n","          if pos in dic_pos:\n","            self.df.loc[i,pos] = round(dic_pos[pos]/total,4)\n","          else:\n","            self.df.loc[i,pos] = np.nan\n","        i+=1\n","    self.dic_pos_cat = dic_pos_cat\n","\n","    # POS analysis for each category\n","    i = 0\n","    groups = self.df.groupby(self.df.category)\n","    for cat in self.dic_categorias:\n","      df_grupo = groups.get_group(cat)\n","      for pos in self.POS_LIST:\n","        if pos in df_grupo.columns.values:\n","          self.df_category.loc[i,'avg_'+pos] = round(df_grupo[pos].mean(), 3)\n","          self.df_category.loc[i,'std_'+pos] = round(df_grupo[pos].std(), 3)\n","      i+=1\n","\n","    print('POS analysis for each text')\n","    display(self.df.head())\n","    print('POS analysis for each category')\n","    display(self.df_category)\n","    self.showGraph(self.POS_LIST,'strip')\n","    self.showGraph(self.POS_LIST,'box')\n","    self.showGraph(self.POS_LIST,'heatmap')\n","    return self.df, self.df_category\n","\n","  def pos_freq(self, n = 15):\n","    # Most frequent words per POS tag and category\n","    dic_f_palabras = self.dic_categorias.copy()\n","    for pos in self.POS_LIST:\n","      for cat in self.dic_categorias:\n","        if cat in self.dic_pos_cat[pos]:\n","          text = self.Text(self.dic_pos_cat[pos][cat])\n","          fdist = self.FreqDist(text)\n","          dic_f_palabras[cat] = fdist.most_common(n)\n","          lista = []\n","          for tupla in dic_f_palabras[cat]:\n","            lista.append((tupla[0],round(tupla[1] / len(self.dic_pos_cat[pos][cat]), 5))) \n","          dic_f_palabras[cat] = lista\n","\n","        while (len(dic_f_palabras[cat]) < n): # Fill gaps\n","          dic_f_palabras[cat].append(np.nan)\n","\n","      df_freq_palabras = pd.DataFrame(dic_f_palabras)\n","      print(\"---- For \" + spacy.explain(pos) + \" the \" + str(n) + \" most frequent words are: -------\")\n","      display(df_freq_palabras.transpose())\n","      df_freq_palabras_tr = df_freq_palabras.transpose()\n","      df_freq_palabras_tr.to_csv(\"POS_\"+ str(pos)+\"_freq.csv\") \n","    return df_freq_palabras.transpose()\n","\n","  def lexical_diversity(self):\n","    # Lexical diversity for each text\n","    i = 0\n","    for text in tqdm(self.df['text'].tolist()):\n","      flt = ld.flemmatize(text)\n","      self.df.loc[i,'simple_TTR'] = round(ld.ttr(flt), 4)\n","      self.df.loc[i,'root_TTR'] = round(ld.root_ttr(flt), 4)\n","      self.df.loc[i,'log_TTR'] = round(ld.log_ttr(flt), 4)\n","      self.df.loc[i,'maas_TTR'] = round(ld.maas_ttr(flt), 4)\n","      self.df.loc[i,'MSTTR'] = round(ld.msttr(flt), 4)\n","      self.df.loc[i,'MATTR'] = round(ld.mattr(flt), 4)\n","      self.df.loc[i,'HDD'] = round(ld.hdd(flt), 4)\n","      self.df.loc[i,'MTLD'] = round(ld.mtld(flt), 4)\n","      i+=1\n","\n","    # Lexical diversity for each category\n","    i = 0\n","    col_diversity = ['simple_TTR','root_TTR','log_TTR','maas_TTR','MSTTR','MATTR','HDD','MTLD']\n","    groups = self.df.groupby(self.df.category)\n","    for cat in self.dic_categorias:\n","      df_grupo = groups.get_group(cat)\n","      for col in col_diversity:\n","        self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(),4)\n","        self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(),4)\n","      i+=1\n","    print('Lexical diversity for each text')\n","    display(self.df.head())\n","    print('Lexical diversity for each category')\n","    display(self.df_category)\n","    self.showGraph(col_diversity,'strip')\n","    self.showGraph(col_diversity,'box')\n","    self.showGraph(col_diversity,'heatmap')\n","    return self.df, self.df_category\n","\n","  def complexity(self):\n","    # Complexity measures for each text\n","    i = 0\n","    for text in tqdm(self.df['text'].tolist()):\n","      if len(text) > 0:\n","        text_processed = 
self.textComplexitySpacy.textProcessing(text)\n","        pmarks = self.textComplexitySpacy.punctuationMarks()[0]\n","        self.df.loc[i,'lexcomplexity'] = self.textComplexitySpacy.lexicalComplexity()[6]\n","        self.df.loc[i,'ssreadability'] = self.textComplexitySpacy.ssReadability()[1]\n","        self.df.loc[i,'sencomplexity'] = self.textComplexitySpacy.sentenceComplexity()[3]\n","        self.df.loc[i,'autoreadability'] = self.textComplexitySpacy.autoReadability()[1]\n","        embeddingdepth = self.textComplexitySpacy.embeddingDepth()\n","        self.df.loc[i,'max_embeddingdepth'] = embeddingdepth[0]\n","        self.df.loc[i,'min_embeddingdepth'] = embeddingdepth[1]\n","        self.df.loc[i,'avg_embeddingdepth'] = embeddingdepth[2]\n","        readability = self.textComplexitySpacy.readability()\n","        self.df.loc[i,'huertareadability'] = round(readability[3],4)\n","        self.df.loc[i,'ifszreadability'] = round(readability[4],4)\n","        self.df.loc[i,'polinicompressibility'] = round(readability[5],4)\n","        self.df.loc[i,'mureadability'] = round(readability[6],4)\n","        self.df.loc[i,'agereadability'] = self.textComplexitySpacy.ageReadability()[0]\n","        self.df.loc[i,'yearscrawford'] = self.textComplexitySpacy.yearsCrawford()\n","      i+=1\n","\n","    # Complexity measures for each category\n","    i = 0\n","    col_complexity = ['lexcomplexity','ssreadability','sencomplexity','autoreadability','max_embeddingdepth',\n","                      'min_embeddingdepth','avg_embeddingdepth','huertareadability','ifszreadability',\n","                      'polinicompressibility','mureadability','agereadability','yearscrawford']\n","    groups = self.df.groupby(self.df.category)\n","    for cat in self.dic_categorias:\n","      df_grupo = groups.get_group(cat)\n","      for col in col_complexity:\n","        self.df_category.loc[i,'avg_'+col] = round(df_grupo[col].mean(), 4)\n","        self.df_category.loc[i,'std_'+col] = round(df_grupo[col].std(), 4)\n","      i+=1\n","    \n","    print('Complexity measures for each text')\n","    display(self.df.head())\n","    print('Complexity measures for each category')\n","    display(self.df_category)\n","    self.showGraph(col_complexity,'strip')\n","    self.showGraph(col_complexity,'box')\n","    self.showGraph(col_complexity,'heatmap')\n","    return self.df, self.df_category\n","\n","  def featureSelection(self):\n","    df = self.df.fillna(0)\n","    X = df.iloc[:,2:]\n","    y = df.iloc[:,0]\n","\n","    from sklearn.feature_selection import VarianceThreshold, SelectFromModel\n","    # Removing features with low variance\n","    sel = VarianceThreshold(threshold=(.8 * (1 - .8))) # Drop features that take the same value in more than 80% of the samples\n","    arr = sel.fit_transform(X)\n","    print('Removing features with low variance...')\n","    print('Selected columns:',sel.get_feature_names_out(self.df.columns.values[2:]))\n","    display(pd.DataFrame(arr))\n","    pd.DataFrame(arr).to_csv(\"VarianceThreshold.csv\") \n","\n","    # SelectFromModel\n","    # Feature selection based on L1 regularisation\n","    from sklearn.svm import LinearSVC\n","\n","    lsvc = LinearSVC(C=0.01, penalty=\"l1\", dual=False).fit(X, y)\n","    model = SelectFromModel(lsvc, prefit=True)\n","    X_new = model.transform(X)\n","    print('Removing features with SelectFromModel...')\n","    print('Selected columns:',model.get_feature_names_out(df.columns.values[2:]))\n","    display(pd.DataFrame(X_new))\n","    pd.DataFrame(X_new).to_csv(\"SelectFromModel.csv\") \n","    \n","\n","  def kBest(self, k = 10):\n","    df = self.df.fillna(0)\n","    X = df.iloc[:,2:]\n","    y = df.iloc[:,0]\n","    # Univariate feature selection\n","    from sklearn.feature_selection import SelectKBest\n","    from sklearn.feature_selection import f_classif, mutual_info_classif\n","    print('Highest 
scoring '+ str(k) +' features with f_classif...')\n"," kbest_classif = SelectKBest(f_classif, k=k) # Elimina todo menos las k características de puntuación más alta\n"," X_classif = kbest_classif.fit_transform(X, y)\n"," print('Selected columns:',kbest_classif.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(X_classif))\n"," pd.DataFrame(X_classif).to_csv(\"f_classif.csv\") \n","\n"," print('Highest scoring '+ str(k) +' features with mutual_info_classif...')\n"," kbest_mut = SelectKBest(mutual_info_classif, k=k)\n"," X_mut = kbest_mut.fit_transform(X, y)\n"," print('Selected columns:', kbest_mut.get_feature_names_out(self.df.columns.values[2:]))\n"," display(pd.DataFrame(X_mut))\n"," pd.DataFrame(X_mut).to_csv(\"mutual_info_classif.csv\") \n"," "]},{"cell_type":"markdown","metadata":{"id":"WLTN5U8b32S6"},"source":["# Analysis"]},{"cell_type":"code","execution_count":89,"metadata":{"executionInfo":{"elapsed":11349,"status":"ok","timestamp":1644412316691,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"ViskMg97pXwg"},"outputs":[],"source":["textAnalysisSpacy = TextAnalysisSpacy('es')"]},{"cell_type":"markdown","metadata":{"id":"nrNo-f5p35ei"},"source":["# Carga datos"]},{"cell_type":"code","execution_count":96,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":244,"status":"ok","timestamp":1644412620771,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"lVjjWxvEn5LP","outputId":"e760b59b-ecf6-4308-ec13-8bff00a4f99c"},"outputs":[{"data":{"text/html":["\n"," <div id=\"df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>category</th>\n"," <th>author</th>\n"," <th>title</th>\n"," <th>text</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>Nereida</td>\n"," <td>TOC Y COMPROBACIONES</td>\n"," <td>Veo que en este foro, afortunadamente para vos...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>desesperacion15</td>\n"," <td>No se que me sucede.</td>\n"," <td>Hola a todos,Les escribo porque hace un tiempo...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>Sección de Ansiedad y Fobias</td>\n"," <td>Holli</td>\n"," <td>No puedo con la Ansiedad</td>\n"," <td>Hola a todos, tengo 24 años y hace como 2 años...</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>Relaciones Padres e Hijos</td>\n"," <td>watermelon</td>\n"," <td>Les digo o no les digo que sufrí abusos de niñ...</td>\n"," <td>Hola, Soy nueva en esto del foro, de hecho ent...</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>Relaciones Padres e Hijos</td>\n"," <td>mr_rol</td>\n"," <td>Mi esposa trata mal a nuestro hijo de ocho a...</td>\n"," <td>Buenas noches, Tengo 34 años y mi esposa 25 t...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" 
onclick=\"convertToInteractive('df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-c12b3f47-d8e4-4178-abb8-a8cebf52ef9e');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "],"text/plain":[" category ... text\n","0 Sección de Ansiedad y Fobias ... Veo que en este foro, afortunadamente para vos...\n","1 Sección de Ansiedad y Fobias ... Hola a todos,Les escribo porque hace un tiempo...\n","2 Sección de Ansiedad y Fobias ... Hola a todos, tengo 24 años y hace como 2 años...\n","3 Relaciones Padres e Hijos ... Hola, Soy nueva en esto del foro, de hecho ent...\n","4 Relaciones Padres e Hijos ... 
Buenas noches, Tengo 34 años y mi esposa 25 t...\n","\n","[5 rows x 4 columns]"]},"execution_count":96,"metadata":{},"output_type":"execute_result"}],"source":["df = pd.read_csv(\"./texts.csv\")\n","df.head()"]},{"cell_type":"code","execution_count":97,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000,"output_embedded_package_id":"1vpzTTG4oQWTZufKEDJkd-zoAgVs4_JJu"},"executionInfo":{"elapsed":95780,"status":"ok","timestamp":1644412718344,"user":{"displayName":"Alba María Mármol Romero","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"17964384655629355542"},"user_tz":-60},"id":"4LkSJ0COcetm","outputId":"c871b294-a937-4da4-a4c3-fde5b9f4215a"},"outputs":[{"data":{"text/plain":["Output hidden; open in https://colab.research.google.com to view."]},"metadata":{},"output_type":"display_data"}],"source":["textAnalysisSpacy.textProcessing(df['text'].tolist(),df['category'].tolist())\n","textAnalysisSpacy.volumetry()\n","textAnalysisSpacy.lemmas()\n","textAnalysisSpacy.lemmas_freq()\n","textAnalysisSpacy.pos()\n","textAnalysisSpacy.pos_freq()\n","textAnalysisSpacy.lexical_diversity()\n","textAnalysisSpacy.complexity()\n","textAnalysisSpacy.featureSelection()\n","textAnalysisSpacy.kBest()\n","textAnalysisSpacy.export()"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyMv+hpn4P7K1p0PdKq3+UyM","collapsed_sections":[],"mount_file_id":"1WdsL5lkOudV-Xpbrmg6f92v3ujY8QHAf","name":"Biblioteca Análisis-PLN.ipynb","provenance":[],"toc_visible":true},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
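For reference, the lexical_diversity() method in the notebook reduces to a handful of calls from the lexical-diversity package installed at the top of the file. Below is a minimal standalone sketch, not part of the committed notebook: the sample sentence is made up, and the "from lexical_diversity import lex_div as ld" import is assumed to match the notebook's ld alias.

from lexical_diversity import lex_div as ld

sample = "The quick brown fox jumps over the lazy dog and the quick cat."  # hypothetical text
tokens = ld.flemmatize(sample)        # lemmatised token list, computed once per text in the notebook
print(round(ld.ttr(tokens), 4))       # simple type-token ratio, stored in the 'simple_TTR' column
print(round(ld.root_ttr(tokens), 4))  # root TTR, stored in 'root_TTR'
print(round(ld.mtld(tokens), 4))      # measure of textual lexical diversity, stored in 'MTLD'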