Fix several bugs: path matching in Sequence.filter/filterMetadata, input truncation in IronityAnalyzer, and a leftover debug print in NERAnalyzer

parent 61cb8472
This diff could not be displayed because it is too large.
...@@ -29,12 +29,24 @@ ...@@ -29,12 +29,24 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"id": "-OACG_k2-zh2" "id": "-OACG_k2-zh2",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "ed951cba-8b10-469d-f910-ed62987324c9"
}, },
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('es_core_news_sm')\n"
]
}
],
"source": [ "source": [
"import sys\n", "\n",
"sys.path.append('..')\n",
"from textflow.SequenceDirectory import SequenceDirectory\n", "from textflow.SequenceDirectory import SequenceDirectory\n",
"from textflow.SequenceFile import SequenceFile\n", "from textflow.SequenceFile import SequenceFile\n",
"from textflow.SequenceString import SequenceString\n", "from textflow.SequenceString import SequenceString\n",
...@@ -53,7 +65,7 @@ ...@@ -53,7 +65,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"source": [ "source": [
"s = SequenceDirectory(os.getcwd()+\"/ExampleDirectory\",[\"directory\",\"file\",\"string\"],[\"directories\",\"files\",\"words\"],[SequenceFile,SequenceString])" "s = SequenceDirectory(os.getcwd()+\"/ExampleDirectory\",[\"directories\",\"files\",\"words\"],[SequenceFile,SequenceString])"
], ],
"metadata": { "metadata": {
"id": "aW_DovfAZ8TL" "id": "aW_DovfAZ8TL"
...@@ -71,7 +83,7 @@ ...@@ -71,7 +83,7 @@
"base_uri": "https://localhost:8080/" "base_uri": "https://localhost:8080/"
}, },
"id": "l30ebggfl_rc", "id": "l30ebggfl_rc",
"outputId": "ee5ca4aa-016a-4b4c-a75c-2b7f4fe60a5b" "outputId": "8eb4cd20-04b7-4b64-b8a5-8daacd0772df"
}, },
"execution_count": null, "execution_count": null,
"outputs": [ "outputs": [
...@@ -85,7 +97,7 @@ ...@@ -85,7 +97,7 @@
" children: {'files': [Sequence(\n", " children: {'files': [Sequence(\n",
" format: file\n", " format: file\n",
" metadata: {'text': '\\ufeffVeo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC.Si hay alguien por ahí, me gustaría que compartiérais vuestras opiniones, yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.\\nSé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee.Me gustaría saber qué os recetan a vosotros para esto y si os va bien.\\n\\n\\nSaludos.\\nNereida.', 'nameFile': 'Documento sin título.txt'}\n", " metadata: {'text': '\\ufeffVeo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC.Si hay alguien por ahí, me gustaría que compartiérais vuestras opiniones, yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.\\nSé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee.Me gustaría saber qué os recetan a vosotros para esto y si os va bien.\\n\\n\\nSaludos.\\nNereida.', 'nameFile': 'Documento sin título.txt'}\n",
" children: {'words': [Sequence(\n", " children: {'files': [Sequence(\n",
" format: string\n", " format: string\n",
" metadata: {'text': '\\ufeffVeo'}\n", " metadata: {'text': '\\ufeffVeo'}\n",
" children: {}\n", " children: {}\n",
...@@ -519,7 +531,7 @@ ...@@ -519,7 +531,7 @@
] ]
}, },
"metadata": {}, "metadata": {},
"execution_count": 12 "execution_count": 7
} }
] ]
}, },
...@@ -545,7 +557,7 @@ ...@@ -545,7 +557,7 @@
"base_uri": "https://localhost:8080/" "base_uri": "https://localhost:8080/"
}, },
"id": "OVYdTcDOn04r", "id": "OVYdTcDOn04r",
"outputId": "357883b2-8b15-454c-b6c0-68c4aeca7f19" "outputId": "cf00a1a7-1fbf-42fd-ea58-04770d4f4dbc"
}, },
"execution_count": null, "execution_count": null,
"outputs": [ "outputs": [
...@@ -555,11 +567,11 @@ ...@@ -555,11 +567,11 @@
"text/plain": [ "text/plain": [
"Sequence(\n", "Sequence(\n",
" format: directory\n", " format: directory\n",
" metadata: {'nameFiles': ['Documento sin título.txt'], 'directoriesPath': [], 'POS': [{'srcPOS': ['NOUN', 'SCONJ', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'PUNCT', 'ADV', 'AUX', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'PROPN', 'AUX', 'PRON', 'ADP', 'ADV', 'PUNCT', 'PRON', 'VERB', 'SCONJ', 'VERB', 'NOUN', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'DET', 'NOUN', 'PRON', 'VERB', 'SCONJ', 'VERB', 'NOUN', 'ADP', 'NOUN', 'ADJ', 'CONJ', 'VERB', 'ADP', 'DET', 'NOUN', 'PRON', 'DET', 'NOUN', 'AUX', 'ADJ', 'PUNCT', 'CONJ', 'PRON', 'VERB', 'DET', 'CONJ', 'PRON', 'NOUN', 'PUNCT', 'CONJ', 'PRON', 'PRON', 'VERB', 'PUNCT', 'SPACE', 'VERB', 'PRON', 'VERB', 'DET', 'NOUN', 'ADJ', 'AUX', 'NOUN', 'PUNCT', 'CONJ', 'ADV', 'VERB', 'ADV', 'CONJ', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADP', 'PRON', 'VERB', 'PRON', 'ADP', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'PUNCT', 'PRON', 'VERB', 'VERB', 'PRON', 'PRON', 'VERB', 'ADP', 'NOUN', 'ADP', 'PRON', 'CONJ', 'SCONJ', 'PRON', 'VERB', 'ADV', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT'], 'FreqPOS': {'NOUN': 19, 'SCONJ': 5, 'ADP': 13, 'DET': 10, 'PUNCT': 13, 'ADV': 6, 'AUX': 4, 'PRON': 19, 'VERB': 18, 'PROPN': 3, 'ADJ': 4, 'CONJ': 7, 'SPACE': 3}}]}\n", " metadata: {'nameFiles': ['Documento sin título.txt'], 'directoriesPath': [], 'POS': [{'srcPOS': ['VERB', 'SCONJ', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'PUNCT', 'ADV', 'AUX', 'DET', 'NOUN', 'SCONJ', 'VERB', 'ADP', 'PROPN', 'AUX', 'PRON', 'ADP', 'ADV', 'PUNCT', 'PRON', 'VERB', 'SCONJ', 'VERB', 'DET', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'DET', 'NOUN', 'PRON', 'VERB', 'SCONJ', 'VERB', 'NOUN', 'ADP', 'VERB', 'ADJ', 'CCONJ', 'VERB', 'ADP', 'DET', 'NOUN', 'PRON', 'DET', 'NOUN', 'AUX', 'VERB', 'PUNCT', 'CCONJ', 'DET', 'VERB', 'DET', 'CCONJ', 'PRON', 'NOUN', 'PUNCT', 'CCONJ', 'PRON', 'PRON', 'VERB', 'PUNCT', 'SPACE', 'VERB', 'SCONJ', 'VERB', 'DET', 'NOUN', 'ADJ', 'VERB', 'NOUN', 'PUNCT', 'CCONJ', 'ADV', 'VERB', 'ADV', 'CCONJ', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADP', 'PRON', 'VERB', 'PRON', 'ADP', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'PUNCT', 'PRON', 'VERB', 'VERB', 'PRON', 'PRON', 'VERB', 'ADP', 'NOUN', 'ADP', 'PRON', 'CCONJ', 'SCONJ', 'PRON', 'VERB', 'ADV', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT'], 'FreqPOS': {'VERB': 22, 'SCONJ': 7, 'ADP': 13, 'DET': 12, 'NOUN': 16, 'PUNCT': 13, 'ADV': 6, 'AUX': 3, 'PROPN': 3, 'PRON': 16, 'ADJ': 3, 'CCONJ': 7, 'SPACE': 3}}]}\n",
" children: {'files': [Sequence(\n", " children: {'files': [Sequence(\n",
" format: file\n", " format: file\n",
" metadata: {'text': '\\ufeffVeo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC.Si hay alguien por ahí, me gustaría que compartiérais vuestras opiniones, yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.\\nSé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee.Me gustaría saber qué os recetan a vosotros para esto y si os va bien.\\n\\n\\nSaludos.\\nNereida.', 'nameFile': 'Documento sin título.txt'}\n", " metadata: {'text': '\\ufeffVeo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC.Si hay alguien por ahí, me gustaría que compartiérais vuestras opiniones, yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.\\nSé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee.Me gustaría saber qué os recetan a vosotros para esto y si os va bien.\\n\\n\\nSaludos.\\nNereida.', 'nameFile': 'Documento sin título.txt'}\n",
" children: {'words': [Sequence(\n", " children: {'files': [Sequence(\n",
" format: string\n", " format: string\n",
" metadata: {'text': '\\ufeffVeo'}\n", " metadata: {'text': '\\ufeffVeo'}\n",
" children: {}\n", " children: {}\n",
...@@ -993,7 +1005,7 @@ ...@@ -993,7 +1005,7 @@
] ]
}, },
"metadata": {}, "metadata": {},
"execution_count": 13 "execution_count": 8
} }
] ]
}, },
...@@ -1019,7 +1031,7 @@ ...@@ -1019,7 +1031,7 @@
"base_uri": "https://localhost:8080/" "base_uri": "https://localhost:8080/"
}, },
"id": "1fllpdvltmHH", "id": "1fllpdvltmHH",
"outputId": "ca8d418a-7f42-45e6-8e74-40439e37f5b0" "outputId": "d4505c18-843d-44ce-cabb-4d5ff23d5187"
}, },
"execution_count": null, "execution_count": null,
"outputs": [ "outputs": [
...@@ -1032,8 +1044,8 @@ ...@@ -1032,8 +1044,8 @@
" metadata: {'nameFiles': ['Documento sin título.txt'], 'directoriesPath': []}\n", " metadata: {'nameFiles': ['Documento sin título.txt'], 'directoriesPath': []}\n",
" children: {'files': [Sequence(\n", " children: {'files': [Sequence(\n",
" format: file\n", " format: file\n",
" metadata: {'text': '\\ufeffVeo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC.Si hay alguien por ahí, me gustaría que compartiérais vuestras opiniones, yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.\\nSé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee.Me gustaría saber qué os recetan a vosotros para esto y si os va bien.\\n\\n\\nSaludos.\\nNereida.', 'nameFile': 'Documento sin título.txt', 'POS': [{'srcPOS': ['NOUN', 'SCONJ', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'PUNCT', 'ADV', 'AUX', 'DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'PROPN', 'AUX', 'PRON', 'ADP', 'ADV', 'PUNCT', 'PRON', 'VERB', 'SCONJ', 'VERB', 'NOUN', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'DET', 'NOUN', 'PRON', 'VERB', 'SCONJ', 'VERB', 'NOUN', 'ADP', 'NOUN', 'ADJ', 'CONJ', 'VERB', 'ADP', 'DET', 'NOUN', 'PRON', 'DET', 'NOUN', 'AUX', 'ADJ', 'PUNCT', 'CONJ', 'PRON', 'VERB', 'DET', 'CONJ', 'PRON', 'NOUN', 'PUNCT', 'CONJ', 'PRON', 'PRON', 'VERB', 'PUNCT', 'SPACE', 'VERB', 'PRON', 'VERB', 'DET', 'NOUN', 'ADJ', 'AUX', 'NOUN', 'PUNCT', 'CONJ', 'ADV', 'VERB', 'ADV', 'CONJ', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADP', 'PRON', 'VERB', 'PRON', 'ADP', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'PUNCT', 'PRON', 'VERB', 'VERB', 'PRON', 'PRON', 'VERB', 'ADP', 'NOUN', 'ADP', 'PRON', 'CONJ', 'SCONJ', 'PRON', 'VERB', 'ADV', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT'], 'FreqPOS': {'NOUN': 19, 'SCONJ': 5, 'ADP': 13, 'DET': 10, 'PUNCT': 13, 'ADV': 6, 'AUX': 4, 'PRON': 19, 'VERB': 18, 'PROPN': 3, 'ADJ': 4, 'CONJ': 7, 'SPACE': 3}}]}\n", " metadata: {'text': '\\ufeffVeo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC.Si hay alguien por ahí, me gustaría que compartiérais vuestras opiniones, yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.\\nSé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee.Me gustaría saber qué os recetan a vosotros para esto y si os va bien.\\n\\n\\nSaludos.\\nNereida.', 'nameFile': 'Documento sin título.txt', 'POS': [{'srcPOS': ['VERB', 'SCONJ', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'PUNCT', 'ADV', 'AUX', 'DET', 'NOUN', 'SCONJ', 'VERB', 'ADP', 'PROPN', 'AUX', 'PRON', 'ADP', 'ADV', 'PUNCT', 'PRON', 'VERB', 'SCONJ', 'VERB', 'DET', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'DET', 'NOUN', 'PRON', 'VERB', 'SCONJ', 'VERB', 'NOUN', 'ADP', 'VERB', 'ADJ', 'CCONJ', 'VERB', 'ADP', 'DET', 'NOUN', 'PRON', 'DET', 'NOUN', 'AUX', 'VERB', 'PUNCT', 'CCONJ', 'DET', 'VERB', 'DET', 'CCONJ', 'PRON', 'NOUN', 'PUNCT', 'CCONJ', 'PRON', 'PRON', 'VERB', 'PUNCT', 'SPACE', 'VERB', 'SCONJ', 'VERB', 'DET', 'NOUN', 'ADJ', 'VERB', 'NOUN', 'PUNCT', 'CCONJ', 'ADV', 'VERB', 'ADV', 'CCONJ', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADP', 'PRON', 'VERB', 'PRON', 'ADP', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'PUNCT', 'PRON', 'VERB', 'VERB', 'PRON', 'PRON', 'VERB', 'ADP', 'NOUN', 'ADP', 'PRON', 'CCONJ', 'SCONJ', 'PRON', 'VERB', 'ADV', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT', 'SPACE', 'PROPN', 'PUNCT'], 'FreqPOS': {'VERB': 22, 'SCONJ': 7, 'ADP': 13, 'DET': 12, 'NOUN': 16, 'PUNCT': 13, 'ADV': 6, 'AUX': 3, 'PROPN': 3, 'PRON': 16, 'ADJ': 3, 'CCONJ': 7, 'SPACE': 3}}]}\n",
" children: {'tokens': [Sequence(\n", " children: {'files': [Sequence(\n",
" format: string\n", " format: string\n",
" metadata: {'text': '\\ufeffVeo'}\n", " metadata: {'text': '\\ufeffVeo'}\n",
" children: {}\n", " children: {}\n",
...@@ -1467,7 +1479,318 @@ ...@@ -1467,7 +1479,318 @@
] ]
}, },
"metadata": {}, "metadata": {},
"execution_count": 14 "execution_count": 9
}
]
},
{
"cell_type": "markdown",
"source": [
"Use of the filter method of a sequence:"
],
"metadata": {
"id": "0YK8UKkIToJm"
}
},
{
"cell_type": "markdown",
"source": [
"Criteria function:"
],
"metadata": {
"id": "DHbRPI8MUANE"
}
},
{
"cell_type": "code",
"source": [
"s = SequenceDirectory(os.getcwd()+\"/ExampleDirectory\")"
],
"metadata": {
"id": "bkytecNImwv3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def sequenciasMenores4letras(arraySequencias):\n",
" result=[]\n",
" for i in arraySequencias:\n",
" if len(i.metadata[\"text\"]) < 4:\n",
" result.append(i)\n",
" return result"
],
"metadata": {
"id": "N3dlbFKCT_KQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Using the filter method:"
],
"metadata": {
"id": "ruj6I_hK7MzL"
}
},
{
"cell_type": "code",
"source": [
"for i in s.filter(\"files/files\",sequenciasMenores4letras):\n",
" print(i)\n",
" print(type(i))"
],
"metadata": {
"id": "QwHAoIERVHOn",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "0c978552-2783-42a2-cb76-063e02cd011c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"que\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"en\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"no\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"hay\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"que\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"de\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"hay\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"por\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"me\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"que\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"yo\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"las\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"que\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"de\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"y\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"en\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"las\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"que\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"ese\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"y\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"las\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"una\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"y\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"y\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"me\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"Sé\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"que\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"un\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"ya\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"y\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"por\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"en\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"el\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"por\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"eso\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"por\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"si\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"con\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"lo\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"qué\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"os\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"a\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"y\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"si\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"os\n",
"<class 'textflow.SequenceString.SequenceString'>\n",
"va\n",
"<class 'textflow.SequenceString.SequenceString'>\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"Creating a new criteria function for use with the filterMetadata method:"
],
"metadata": {
"id": "cWCfr8xV7l3Y"
}
},
{
"cell_type": "code",
"source": [
"def sequenciasMenores4letrasMetadata(arraySequencias):\n",
" result=[]\n",
" for i in arraySequencias:\n",
" if len(i) < 4:\n",
" result.append(i)\n",
" return result"
],
"metadata": {
"id": "Mol4alQq7elK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for i in s.filterMetadata(\"files/files/text\", sequenciasMenores4letrasMetadata):\n",
" print(i)\n",
" print(type(i))"
],
"metadata": {
"id": "wYs7VwmRTxnL",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "536f4b80-6a0e-4092-ba38-7798c90b9c8c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"que\n",
"<class 'str'>\n",
"en\n",
"<class 'str'>\n",
"no\n",
"<class 'str'>\n",
"hay\n",
"<class 'str'>\n",
"que\n",
"<class 'str'>\n",
"de\n",
"<class 'str'>\n",
"hay\n",
"<class 'str'>\n",
"por\n",
"<class 'str'>\n",
"me\n",
"<class 'str'>\n",
"que\n",
"<class 'str'>\n",
"yo\n",
"<class 'str'>\n",
"las\n",
"<class 'str'>\n",
"que\n",
"<class 'str'>\n",
"de\n",
"<class 'str'>\n",
"y\n",
"<class 'str'>\n",
"en\n",
"<class 'str'>\n",
"las\n",
"<class 'str'>\n",
"que\n",
"<class 'str'>\n",
"ese\n",
"<class 'str'>\n",
"y\n",
"<class 'str'>\n",
"las\n",
"<class 'str'>\n",
"una\n",
"<class 'str'>\n",
"y\n",
"<class 'str'>\n",
"y\n",
"<class 'str'>\n",
"me\n",
"<class 'str'>\n",
"Sé\n",
"<class 'str'>\n",
"que\n",
"<class 'str'>\n",
"un\n",
"<class 'str'>\n",
"ya\n",
"<class 'str'>\n",
"y\n",
"<class 'str'>\n",
"por\n",
"<class 'str'>\n",
"en\n",
"<class 'str'>\n",
"el\n",
"<class 'str'>\n",
"por\n",
"<class 'str'>\n",
"eso\n",
"<class 'str'>\n",
"por\n",
"<class 'str'>\n",
"si\n",
"<class 'str'>\n",
"con\n",
"<class 'str'>\n",
"lo\n",
"<class 'str'>\n",
"qué\n",
"<class 'str'>\n",
"os\n",
"<class 'str'>\n",
"a\n",
"<class 'str'>\n",
"y\n",
"<class 'str'>\n",
"si\n",
"<class 'str'>\n",
"os\n",
"<class 'str'>\n",
"va\n",
"<class 'str'>\n"
]
} }
] ]
} }
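The filter cells in the notebook above share one contract worth spelling out: filter hands the criteria function a list of Sequence objects and expects the matching subset back, while filterMetadata hands it the raw metadata values (plain strings here). A minimal sketch of both, with illustrative names (short_sequences and short_strings are not part of textflow):

```python
def short_sequences(sequences):
    # filter() passes Sequence objects; keep those whose 'text' metadata
    # is shorter than 4 characters (same test as sequenciasMenores4letras).
    return [s for s in sequences if len(s.metadata["text"]) < 4]

def short_strings(values):
    # filterMetadata() passes the metadata values themselves (str here),
    # so the criteria operates on the values directly.
    return [v for v in values if len(v) < 4]

# Usage, equivalent to the notebook cells:
# for seq in s.filter("files/files", short_sequences): print(seq)
# for text in s.filterMetadata("files/files/text", short_strings): print(text)
```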
......
...@@ -14,7 +14,7 @@ class IronityAnalyzer(Analyzer): ...@@ -14,7 +14,7 @@ class IronityAnalyzer(Analyzer):
maxEmbedding: The number of max_position_embeddings in the config.json of the model selected. maxEmbedding: The number of max_position_embeddings in the config.json of the model selected.
""" """
def __init__(self, task = "text-classification",modelIronity = 'dtomas/roberta-base-bne-irony', allScores = True, maxEmbedding = 512): def __init__(self, task = "text-classification",modelIronity = 'dtomas/roberta-base-bne-irony', allScores = True, maxEmbedding = 514):
""" """
Create an irony analyzer. Create an irony analyzer.
...@@ -28,8 +28,8 @@ class IronityAnalyzer(Analyzer): ...@@ -28,8 +28,8 @@ class IronityAnalyzer(Analyzer):
model = AutoModelForSequenceClassification.from_pretrained(modelIronity) model = AutoModelForSequenceClassification.from_pretrained(modelIronity)
model.config.id2label = {0: 'NI', 1: 'I'} model.config.id2label = {0: 'NI', 1: 'I'}
model.config.label2id = {'NI': 0, 'I': 1} model.config.label2id = {'NI': 0, 'I': 1}
tokenizer = AutoTokenizer.from_pretrained(modelIronity, model_max_length=512) tokenizer = AutoTokenizer.from_pretrained(modelIronity)
self.ironityClassifier = pipeline(task,model= model, tokenizer=tokenizer,return_all_scores=allScores) self.ironityClassifier = pipeline(task,model= model, tokenizer=tokenizer,return_all_scores=allScores, truncation=True)
else: else:
self.ironityClassifier = pipeline(task,model= modelIronity, return_all_scores=allScores) self.ironityClassifier = pipeline(task,model= modelIronity, return_all_scores=allScores)
self.maxEmbeding = maxEmbedding self.maxEmbeding = maxEmbedding
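The hunk above drops the hard-coded model_max_length=512 from the tokenizer, enables truncation in the pipeline, and bumps the default maxEmbedding to match roberta-base-bne's max_position_embeddings of 514. A minimal sketch of the resulting construction (model name as in the diff; on recent transformers releases return_all_scores=True is deprecated in favour of top_k=None):

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Load the irony model and relabel its two classes, as the constructor does.
model = AutoModelForSequenceClassification.from_pretrained('dtomas/roberta-base-bne-irony')
model.config.id2label = {0: 'NI', 1: 'I'}
model.config.label2id = {'NI': 0, 'I': 1}
tokenizer = AutoTokenizer.from_pretrained('dtomas/roberta-base-bne-irony')

# truncation=True makes the pipeline clip over-long inputs to the tokenizer's
# maximum length instead of failing inside the position embeddings.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer,
                      return_all_scores=True, truncation=True)
```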
......
...@@ -78,7 +78,6 @@ class NERAnalyzer(Analyzer): ...@@ -78,7 +78,6 @@ class NERAnalyzer(Analyzer):
textner.append(doc[i].ent_type_) textner.append(doc[i].ent_type_)
else: else:
textner.append(doc[i].text) textner.append(doc[i].text)
print(textner)
self.textNER = " ".join(textner) self.textNER = " ".join(textner)
for ent in doc.ents: for ent in doc.ents:
#Guardamos el diccionario obtenido para la categoria de la palabra (si este existe) #Guardamos el diccionario obtenido para la categoria de la palabra (si este existe)
......
...@@ -199,10 +199,10 @@ class Sequence(ABC): ...@@ -199,10 +199,10 @@ class Sequence(ABC):
ruta = level.split("/") ruta = level.split("/")
children = [self.children] children = [self.children]
results=[] results=[]
for r in ruta: for idx, r in enumerate(ruta):
for child in children: for child in children:
if r in child: if r in child:
if r == ruta[-1]: if r == ruta[-1] and idx == len(ruta)-1:
results.extend(child[r]) results.extend(child[r])
else: else:
children = [c.children for c in child[r]] children = [c.children for c in child[r]]
...@@ -230,8 +230,8 @@ class Sequence(ABC): ...@@ -230,8 +230,8 @@ class Sequence(ABC):
children = [self.children] children = [self.children]
metadata = [self.metadata] metadata = [self.metadata]
results=[] results=[]
for r in ruta: for idx, r in enumerate(ruta):
if r == ruta[-1]: if r == ruta[-1] and idx == len(ruta)-1:
for m in metadata: for m in metadata:
if r in m: if r in m:
results.append(m[r]) results.append(m[r])
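Both hunks above fix the same bug: the old test r == ruta[-1] compares segment values, so a path with a repeated name, such as the notebook's "files/files", already matched on the first segment and collected results one level too early. The added idx == len(ruta)-1 check keys the match on position instead. A standalone illustration:

```python
ruta = "files/files".split("/")
for idx, r in enumerate(ruta):
    old_is_last = r == ruta[-1]                           # True at idx 0 and idx 1
    new_is_last = r == ruta[-1] and idx == len(ruta) - 1  # True only at idx 1
    print(idx, r, old_is_last, new_is_last)
# 0 files True False   <- the old code collected results here, one level early
# 1 files True True
```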
......