Example notebook and analyze-complexity command

7d61f758 · Jaime Collado · 182e38fe · 7d61f758 · 182e38fe · 7d61f758
Commit 7d61f758 authored Mar 25, 2022 by Jaime Collado
Showing with 194 additions and 3 deletions
.gitignore
__init__.py
examples/example.ipynb
examples/example_text.txt
examples/texty_example.ipynb
setup.cfg
src/texty/analyze_complexity.py
src/texty/complexity.py
--- a/.gitignore
+++ b/.gitignore
 __pycache__
 *.pyc
-.ipynb_checkpoints
\ No newline at end of file
+.ipynb_checkpoints
+*.egg-info/
\ No newline at end of file
--- a/__init__.py
+++ b/__init__.py
--- a/examples/example.ipynb
+++ b/examples/example.ipynb
--- a/examples/example_text.txt
+++ b/examples/example_text.txt
+Veo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC.Si hay alguien por ahí, me gustaría que compartiérais vuestras opiniones, yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.
+Sé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee.Me gustaría saber qué os recetan a vosotros para esto y si os va bien.
+
+Saludos.
+Nereida.
\ No newline at end of file
--- a/examples/texty_example.ipynb
+++ b/examples/texty_example.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5745bcf4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "from texty.complexity import ComplexityAnalyzer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb2f4283",
+   "metadata": {},
+   "source": [
+    "Load input text file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "63c5bfcb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Veo que en este foro, afortunadamente para vosotros, no hay mucha gente que sufra de TOC.Si hay alguien por ahí, me gustaría que compartiérais vuestras opiniones, yo compruebo las cosas que hago porque tengo miedo de haberme equivocado y pienso en las consecuencias que ese error podría acarrearme, y las compruebo una y otra vez, y esto me angustia.\\nSé que abrí un post parecido hace tiempo, pero ya quedó abajo y por tanto en el olvido, por eso abro este por si alguna persona nueva con este problema lo lee.Me gustaría saber qué os recetan a vosotros para esto y si os va bien.\\n\\nSaludos.\\nNereida.'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filename = \"example_text.txt\"\n",
+    "\n",
+    "with open(filename) as f:\n",
+    "    text = f.readlines()\n",
+    "    f.close()\n",
+    "\n",
+    "text = \"\".join(text)\n",
+    "\n",
+    "text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dab48c52",
+   "metadata": {},
+   "source": [
+    "Instantiate the ComplexityAnalyzer and calculate metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "52fd5e8e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'npunct': 13,\n",
+       " 'nword': 111,\n",
+       " 'ILFW': 0.6578947368421053,\n",
+       " 'LDI': 7.0,\n",
+       " 'LC': 3.8289473684210527,\n",
+       " 'nrword': 24,\n",
+       " 'SSR': 129.46034054054056,\n",
+       " 'avgsentl': 22.2,\n",
+       " 'ncompsent': 0,\n",
+       " 'nsent': 5,\n",
+       " 'SCI': 11.1,\n",
+       " 'nchar': 496,\n",
+       " 'ARI': 10.716486486486488,\n",
+       " 'min_depth': 2,\n",
+       " 'max_depth': 9,\n",
+       " 'mean_depth': 5.0,\n",
+       " 'nsyllab': 208,\n",
+       " 'huerta': 89.80797297297298,\n",
+       " 'IFSZ': 67.89265765765767,\n",
+       " 'polini': 45.57144144144145,\n",
+       " 'mu': 58.89338054313761,\n",
+       " 'minage': 10.535169369369369,\n",
+       " 'SOL': 9.258359866374562,\n",
+       " 'crawford': 4.851558558558558}"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp = spacy.load(\"es_core_news_sm\")\n",
+    "\n",
+    "ca = ComplexityAnalyzer(\"es\", nlp)\n",
+    "\n",
+    "ca.read(\"\".join(text))\n",
+    "metrics = ca.get_all_metrics()\n",
+    "\n",
+    "metrics"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,5 +20,9 @@ package_dir =
 packages = find:
 python_requires = >=3.6

+[options.entry_points]
+console_scripts =
+    analyze-complexity = texty.analyze_complexity:analyze_complexity
+
 [options.packages.find]
 where = src
\ No newline at end of file
--- a/src/texty/analyze_complexity.py
+++ b/src/texty/analyze_complexity.py
+import argparse
+
+import pandas as pd
+import spacy
+from texty.complexity import ComplexityAnalyzer
+
+
+def load_file(filename, encoding="utf-8"):
+    """Read a text file and return its lines.
+
+    Args:
+        filename: Path of the text file to read.
+        encoding: Text encoding used to decode the file. Defaults to UTF-8
+            so the result does not depend on the platform's locale
+            (the input is expected to contain accented characters).
+
+    Returns:
+        A list with one string per line, as produced by ``readlines()``
+        (each line keeps its trailing newline, if it has one).
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+        UnicodeDecodeError: If the content cannot be decoded with
+            ``encoding``.
+    """
+    # The context manager closes the file; no explicit close() is needed.
+    with open(filename, encoding=encoding) as f:
+        return f.readlines()
+
+def analyze_complexity(args=None):
+    """Command-line entry point: compute complexity metrics for a text file.
+
+    Parses the command line, reads the input file, feeds its content to a
+    ``ComplexityAnalyzer`` built on the spaCy ``es_core_news_sm`` pipeline
+    and writes the resulting metrics to ``output.csv``, ``output.tsv`` or
+    ``output.json`` (relative path, so in the current working directory).
+
+    Args:
+        args: Optional list of argument strings to parse. When ``None``
+            (the default, and what the console script uses) argparse
+            falls back to ``sys.argv[1:]``.
+
+    Raises:
+        SystemExit: On invalid command-line arguments (raised by argparse)
+            or when the input file cannot be read.
+    """
+    parser = argparse.ArgumentParser(description="Calculate complexity metrics")
+    parser.add_argument("filename", type=str, help="Text file input")
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        help="Output format",
+        choices=["csv", "tsv", "json"],
+        default="csv",
+    )
+    # Honour the caller-supplied argument list instead of silently ignoring it.
+    args = parser.parse_args(args)
+
+    # Load file. Only I/O and decoding failures are reported here; anything
+    # else (e.g. KeyboardInterrupt) is allowed to propagate untouched.
+    try:
+        lines = load_file(args.filename)
+    except (OSError, UnicodeDecodeError) as err:
+        # SystemExit with a string prints it to stderr and exits with
+        # status 1, so shells and scripts can detect the failure.
+        raise SystemExit(f"Error while reading the file: {err}") from err
+
+    # Instantiate Complexity class
+    nlp = spacy.load("es_core_news_sm")
+    ca = ComplexityAnalyzer("es", nlp)
+
+    # Read file
+    ca.read("".join(lines))
+
+    # Calculate metrics and lay them out as a single-row table
+    # (one column per metric name).
+    metrics = ca.get_all_metrics()
+    output_df = pd.DataFrame(
+        [list(metrics.values())], columns=list(metrics.keys())
+    )
+
+    if args.output == "csv":
+        output_df.to_csv("output.csv", index=False)
+    elif args.output == "tsv":
+        output_df.to_csv("output.tsv", sep="\t", index=False)
+    elif args.output == "json":
+        output_df.to_json("output.json")
+
+    print("Output file generated.")
+
+# Run the command-line interface when this module is executed directly.
+if __name__ == "__main__":
+    analyze_complexity()
--- a/src/texty/complexity.py
+++ b/src/texty/complexity.py
@@ -216,7 +216,6 @@ class ComplexityAnalyzer():
        
        return self.N_charac, self.ARI, self.listwords

-       
    def tree_height(self,root, cont):
        if not list(root.children):
            return 1