Implementing some analyzers

parent f3e06440
@@ -60,7 +60,7 @@ unicode_backport = ["unicodedata2"]
 
 [[package]]
 name = "click"
-version = "8.1.2"
+version = "8.1.3"
 description = "Composable command line interface toolkit"
 category = "main"
 optional = false
@@ -87,7 +87,7 @@ python-versions = "*"
 
 [[package]]
 name = "filelock"
-version = "3.6.0"
+version = "3.7.0"
 description = "A platform independent file lock."
 category = "main"
 optional = false
@@ -98,16 +98,8 @@ docs = ["furo (>=2021.8.17b43)", "sphinx (>=4.1)", "sphinx-autodoc-typehints (>=
 testing = ["covdefaults (>=1.2.0)", "coverage (>=4)", "pytest (>=4)", "pytest-cov", "pytest-timeout (>=1.4.2)"]
 
 [[package]]
-name = "functools"
-version = "0.5"
-description = "Fast tools for functional programming"
-category = "main"
-optional = false
-python-versions = "*"
-
-[[package]]
 name = "huggingface-hub"
-version = "0.5.1"
+version = "0.6.0"
 description = "Client library to download and publish models on the huggingface.co hub"
 category = "main"
 optional = false
@@ -124,6 +116,7 @@ typing-extensions = ">=3.7.4.3"
 [package.extras]
 all = ["pytest", "datasets", "soundfile", "black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)"]
 dev = ["pytest", "datasets", "soundfile", "black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)"]
+fastai = ["toml", "fastai (>=2.4)", "fastcore (>=1.3.27)"]
 quality = ["black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)"]
 tensorflow = ["tensorflow", "pydot", "graphviz"]
 testing = ["pytest", "datasets", "soundfile"]
@@ -180,7 +173,7 @@ python-versions = ">=3.7"
 
 [[package]]
 name = "more-itertools"
-version = "8.12.0"
+version = "8.13.0"
 description = "More routines for operating on iterables, beyond itertools"
 category = "dev"
 optional = false
@@ -301,14 +294,14 @@ email = ["email-validator (>=1.0.3)"]
 
 [[package]]
 name = "pyparsing"
-version = "3.0.7"
-description = "Python parsing module"
+version = "3.0.9"
+description = "pyparsing module - Classes and methods to define and execute parsing grammars"
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.6.8"
 
 [package.extras]
-diagrams = ["jinja2", "railroad-diagrams"]
+diagrams = ["railroad-diagrams", "jinja2"]
 
 [[package]]
 name = "pytest"
@@ -367,29 +360,6 @@ socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
 use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]
 
 [[package]]
-name = "sacremoses"
-version = "0.0.53"
-description = "SacreMoses"
-category = "main"
-optional = false
-python-versions = "*"
-
-[package.dependencies]
-click = "*"
-joblib = "*"
-regex = "*"
-six = "*"
-tqdm = "*"
-
-[[package]]
-name = "six"
-version = "1.16.0"
-description = "Python 2 and 3 compatibility utilities"
-category = "main"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
-
-[[package]]
 name = "smart-open"
 version = "5.2.1"
 description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
@@ -540,6 +510,17 @@ docs = ["sphinx", "sphinx-rtd-theme", "setuptools-rust"]
 testing = ["pytest", "requests", "numpy", "datasets"]
 
 [[package]]
+name = "torch"
+version = "1.11.0"
+description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
+category = "main"
+optional = false
+python-versions = ">=3.7.0"
+
+[package.dependencies]
+typing-extensions = "*"
+
+[[package]]
 name = "tqdm"
 version = "4.64.0"
 description = "Fast, Extensible Progress Meter"
@@ -558,11 +539,11 @@ telegram = ["requests"]
 
 [[package]]
 name = "transformers"
-version = "4.18.0"
-description = "State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch"
+version = "4.19.1"
+description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 category = "main"
 optional = false
-python-versions = ">=3.6.0"
+python-versions = ">=3.7.0"
 
 [package.dependencies]
 filelock = "*"
@@ -572,22 +553,22 @@ packaging = ">=20.0"
 pyyaml = ">=5.1"
 regex = "!=2019.12.17"
 requests = "*"
-sacremoses = "*"
 tokenizers = ">=0.11.1,<0.11.3 || >0.11.3,<0.13"
 tqdm = ">=4.27"
 
 [package.extras]
-all = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2)", "jaxlib (>=0.1.65)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)"]
+all = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)"]
 audio = ["librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
 codecarbon = ["codecarbon (==1.2.0)"]
-deepspeed = ["deepspeed (>=0.6.0)"]
-dev = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2)", "jaxlib (>=0.1.65)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)", "faiss-cpu", "cookiecutter (==1.7.3)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "hf-doc-builder", "scikit-learn"]
-dev-tensorflow = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)", "faiss-cpu", "cookiecutter (==1.7.3)", "tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "pillow", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "hf-doc-builder", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
-dev-torch = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)", "faiss-cpu", "cookiecutter (==1.7.3)", "torch (>=1.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "hf-doc-builder", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
-docs = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2)", "jaxlib (>=0.1.65)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "hf-doc-builder"]
+deepspeed = ["deepspeed (>=0.6.4)"]
+deepspeed-testing = ["deepspeed (>=0.6.4)", "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)", "optuna"]
+dev = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "hf-doc-builder", "scikit-learn"]
+dev-tensorflow = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)", "tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "pillow", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "hf-doc-builder", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
+dev-torch = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)", "torch (>=1.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "hf-doc-builder", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
+docs = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "hf-doc-builder"]
 docs_specific = ["hf-doc-builder"]
 fairscale = ["fairscale (>0.3)"]
-flax = ["jax (>=0.2.8,!=0.3.2)", "jaxlib (>=0.1.65)", "flax (>=0.3.5)", "optax (>=0.0.8)"]
+flax = ["jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "flax (>=0.3.5)", "optax (>=0.0.8)"]
 flax-speech = ["librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
 ftfy = ["ftfy"]
 integrations = ["optuna", "ray", "sigopt"]
@@ -596,7 +577,7 @@ modelcreation = ["cookiecutter (==1.7.3)"]
 onnx = ["onnxconverter-common", "tf2onnx", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
 onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
 optuna = ["optuna"]
-quality = ["black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)"]
+quality = ["black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)"]
 ray = ["ray"]
 retrieval = ["faiss-cpu", "datasets"]
 sagemaker = ["sagemaker (>=2.31.0)"]
@@ -605,7 +586,7 @@ serving = ["pydantic", "uvicorn", "fastapi", "starlette"]
 sigopt = ["sigopt"]
 sklearn = ["scikit-learn"]
 speech = ["torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
-testing = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)", "faiss-cpu", "cookiecutter (==1.7.3)"]
+testing = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)"]
 tf = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx"]
 tf-cpu = ["tensorflow-cpu (>=2.3)", "onnxconverter-common", "tf2onnx"]
 tf-speech = ["librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
@@ -613,7 +594,7 @@ timm = ["timm"]
 tokenizers = ["tokenizers (>=0.11.1,!=0.11.3,<0.13)"]
 torch = ["torch (>=1.0)"]
 torch-speech = ["torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
-torchhub = ["filelock", "huggingface-hub (>=0.1.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.0)", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "tqdm (>=4.27)"]
+torchhub = ["filelock", "huggingface-hub (>=0.1.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.0)", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "tqdm (>=4.27)"]
 vision = ["pillow"]
 
 [[package]]
@@ -672,8 +653,8 @@ python-versions = "*"
 
 [metadata]
 lock-version = "1.1"
-python-versions = "^3.8"
-content-hash = "d3639d0f322d79260a5fe40ea43817a72bcd16885b66b7a1b1bb3ec355d37264"
+python-versions = "3.8"
+content-hash = "f559d5695f1365c162f02c2146df48de52ad2d38e1b4a26476c7a662dc065365"
 
 [metadata.files]
 atomicwrites = [
@@ -715,8 +696,8 @@ charset-normalizer = [
     {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"},
 ]
 click = [
-    {file = "click-8.1.2-py3-none-any.whl", hash = "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e"},
-    {file = "click-8.1.2.tar.gz", hash = "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"},
+    {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
+    {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
 ]
 colorama = [
     {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
@@ -741,15 +722,12 @@ cymem = [
     {file = "cymem-2.0.6.tar.gz", hash = "sha256:169725b5816959d34de2545b33fee6a8021a6e08818794a426c5a4f981f17e5e"},
 ]
 filelock = [
-    {file = "filelock-3.6.0-py3-none-any.whl", hash = "sha256:f8314284bfffbdcfa0ff3d7992b023d4c628ced6feb957351d4c48d059f56bc0"},
-    {file = "filelock-3.6.0.tar.gz", hash = "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85"},
+    {file = "filelock-3.7.0-py3-none-any.whl", hash = "sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6"},
+    {file = "filelock-3.7.0.tar.gz", hash = "sha256:b795f1b42a61bbf8ec7113c341dad679d772567b936fbd1bf43c9a238e673e20"},
 ]
-functools = [
-    {file = "functools-0.5.tar.gz", hash = "sha256:596ed8999dee419c0749a41bfdd82e4697e80ea27ee01c716003ef55be9a54c5"},
-]
 huggingface-hub = [
-    {file = "huggingface_hub-0.5.1-py3-none-any.whl", hash = "sha256:b9fd1f567a3fb16e73acc613e78d075d1926d4b0c5c56ba08c4f125707b50c70"},
-    {file = "huggingface_hub-0.5.1.tar.gz", hash = "sha256:d90d657dca0d6a577f640ff684a58da8e5c76258e485100e885a0e7307e2eb12"},
+    {file = "huggingface_hub-0.6.0-py3-none-any.whl", hash = "sha256:585d72adade562a1f7038acf39eb7677b7649bdc0ce082b70f99e01164d9d8b5"},
+    {file = "huggingface_hub-0.6.0.tar.gz", hash = "sha256:f5109065222185d129933d44159e483a9e3378c577127d0281e4c921dfadbd23"},
 ]
 idna = [
     {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
@@ -810,8 +788,8 @@ markupsafe = [
     {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
 ]
 more-itertools = [
-    {file = "more-itertools-8.12.0.tar.gz", hash = "sha256:7dc6ad46f05f545f900dd59e8dfb4e84a4827b97b3cfecb175ea0c7d247f6064"},
-    {file = "more_itertools-8.12.0-py3-none-any.whl", hash = "sha256:43e6dd9942dffd72661a2c4ef383ad7da1e6a3e968a927ad7a6083ab410a688b"},
+    {file = "more-itertools-8.13.0.tar.gz", hash = "sha256:a42901a0a5b169d925f6f217cd5a190e32ef54360905b9c39ee7db5313bfec0f"},
+    {file = "more_itertools-8.13.0-py3-none-any.whl", hash = "sha256:c5122bffc5f104d37c1626b8615b511f3427aa5389b94d61e5ef8236bfbc3ddb"},
 ]
 murmurhash = [
     {file = "murmurhash-1.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:966d2efec6e01aa32c5774c44906724efca00da3507f06faa11acafb47ea1230"},
@@ -916,8 +894,8 @@ pydantic = [
     {file = "pydantic-1.8.2.tar.gz", hash = "sha256:26464e57ccaafe72b7ad156fdaa4e9b9ef051f69e175dbbb463283000c05ab7b"},
 ]
 pyparsing = [
-    {file = "pyparsing-3.0.7-py3-none-any.whl", hash = "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484"},
-    {file = "pyparsing-3.0.7.tar.gz", hash = "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea"},
+    {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
+    {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
 ]
 pytest = [
     {file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"},
@@ -1038,13 +1016,6 @@ requests = [
     {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"},
     {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"},
 ]
-sacremoses = [
-    {file = "sacremoses-0.0.53.tar.gz", hash = "sha256:43715868766c643b35de4b8046cce236bfe59a7fa88b25eaf6ddf02bacf53a7a"},
-]
-six = [
-    {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
-    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
-]
 smart-open = [
     {file = "smart_open-5.2.1-py3-none-any.whl", hash = "sha256:71d14489da58b60ce12fc3ecb823facc59a8b23cd1b58edb97175640350d3a62"},
     {file = "smart_open-5.2.1.tar.gz", hash = "sha256:75abf758717a92a8f53aa96953f0c245c8cedf8e1e4184903db3659b419d4c17"},
@@ -1146,13 +1117,34 @@ tokenizers = [
     {file = "tokenizers-0.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:2158baf80cbc09259bfd6e0e0fc4597b611e7a72ad5443dad63918a90f1dd304"},
     {file = "tokenizers-0.12.1.tar.gz", hash = "sha256:070746f86efa6c873db341e55cf17bb5e7bdd5450330ca8eca542f5c3dab2c66"},
 ]
+torch = [
+    {file = "torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:62052b50fffc29ca7afc0c04ef8206b6f1ca9d10629cb543077e12967e8d0398"},
+    {file = "torch-1.11.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:866bfba29ac98dec35d893d8e17eaec149d0ac7a53be7baae5c98069897db667"},
+    {file = "torch-1.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:951640fb8db308a59d9b510e7d1ad910aff92913323bbe4bc75435347ddd346d"},
+    {file = "torch-1.11.0-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:5d77b5ece78fdafa5c7f42995ff9474399d22571cd6b2de21a5d666306a2ff8c"},
+    {file = "torch-1.11.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:b5a38682769b544c875ecc34bcb81fbad5c922139b61319aacffcfd8a32f528c"},
+    {file = "torch-1.11.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f82d77695a60626f2b7382d85bc566de8a6b3e50d32080755abc040db802e419"},
+    {file = "torch-1.11.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b96654d42566080a134e784705f33f8536b3b95b5dcde357ed7879b1692a5f78"},
+    {file = "torch-1.11.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8ee7c2e8d7f7020d5bfbc1bb91b9591044c26bbd0cee5e4f694cfd7ed8649260"},
+    {file = "torch-1.11.0-cp37-none-macosx_10_9_x86_64.whl", hash = "sha256:6860b1d1bf0bb0b67a6bd47f85a0e4c825b518eea13b5d6101999dbbcbd5bc0c"},
+    {file = "torch-1.11.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:4322aa29f50da7f404db06cdf30896ea67b09f673af4a985afc7162bc897864d"},
+    {file = "torch-1.11.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e4d2e0ddd652f30e94cff750220324ec45705d4ecc69658f773b3cb1c7a28dd0"},
+    {file = "torch-1.11.0-cp38-cp38-win_amd64.whl", hash = "sha256:34ce5ea4d8d85da32cdbadb50d4585106901e9f8a3527991daa70c13a09de1f7"},
+    {file = "torch-1.11.0-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:0ccc85cd06227a3edf809e2c795fd5762c3d4e8a38b5c9f744c6e7cf841361bb"},
+    {file = "torch-1.11.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:c1554e49d74f1b2c3e7202d77056ba2dd7465437585bac64062b580f714a44e9"},
+    {file = "torch-1.11.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:58c7814502b1c129a650d7092033bbb0bbd64faf1a7941631aaa1aeaddc37570"},
+    {file = "torch-1.11.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:831cf588f01dda9409e75576741d2823453990dee2983d670f2584b37a01adf7"},
+    {file = "torch-1.11.0-cp39-cp39-win_amd64.whl", hash = "sha256:44a1d02fd20f827f0f36dc26fdcfc45e793806a6ad52769a22260655a77a4369"},
+    {file = "torch-1.11.0-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:50fd9bf85c578c871c28f1cb0ace9dfc6024401c7f399b174fb0f370899f4454"},
+    {file = "torch-1.11.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:0e48af66ad755f0f9c5f2664028a414f57c49d6adc37e77e06fe0004da4edb61"},
+]
 tqdm = [
     {file = "tqdm-4.64.0-py2.py3-none-any.whl", hash = "sha256:74a2cdefe14d11442cedf3ba4e21a3b84ff9a2dbdc6cfae2c34addb2a14a5ea6"},
     {file = "tqdm-4.64.0.tar.gz", hash = "sha256:40be55d30e200777a307a7585aee69e4eabb46b4ec6a4b4a5f2d9f11e7d5408d"},
 ]
 transformers = [
-    {file = "transformers-4.18.0-py3-none-any.whl", hash = "sha256:6ae54fc29bd4bba5b0230d429cb55b8b3eb5feb9e3c9913c61203999f1f0c2c9"},
-    {file = "transformers-4.18.0.tar.gz", hash = "sha256:16f7751c44f31d8f9a3811bccd80f1995e1cb0ffd9b7de60ef6ede2ab90a6fd4"},
+    {file = "transformers-4.19.1-py3-none-any.whl", hash = "sha256:16d3dd257d459c2598e2548a9e6875c10b7db5e44494d93b3c0a5c60afad667f"},
+    {file = "transformers-4.19.1.tar.gz", hash = "sha256:6fb30ee534a25b6b3fc7064c280b7f44abf8c9bd1fb358860ebe4fd392bf15f5"},
 ]
 typer = [
     {file = "typer-0.4.1-py3-none-any.whl", hash = "sha256:e8467f0ebac0c81366c2168d6ad9f888efdfb6d4e1d3d5b4a004f46fa444b5c3"},

...
@@ -5,14 +5,16 @@ description = "A text analysis library for Python"
 authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"]
 
 [tool.poetry.dependencies]
-python = "^3.8"
+python = "3.8"
 nltk = "^3.7"
 spacy = "^3.3.0"
-transformers = "^4.18.0"
+transformers = "^4.19.0"
+torch = {version = "^1.11.0", python = "^3.7", platform = "linux"}
 
 [tool.poetry.dev-dependencies]
 pytest = "^5.2"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
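Note that the new torch dependency carries Poetry environment markers (python = "^3.7", platform = "linux"), so the wheel is only resolved on Linux interpreters. A minimal sketch, not part of this commit, of how downstream code could degrade gracefully in environments where Poetry skips torch:

    # Hedged sketch: guard the import because torch is gated to Linux above.
    try:
        import torch
        DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    except ImportError:  # e.g. macOS/Windows virtualenvs where torch is never installed
        torch = None
        DEVICE = None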
@@ -3,29 +3,13 @@ import spacy
 import spacy.cli
 from typing import Optional
 from textflow.Sequence import Sequence
-#from transformers import pipeline
+from abc import ABC, abstractmethod
 
-class Analyzer:
+class Analyzer(ABC):
 
-    def __init__(self, function, isMetadata: Optional[bool] = False, lang: Optional[str] = "es"):
-        """Creates an analyzer from an input object.
-
-        Args:
-            function: the function of the analyzer, e.g. count words, files...
-            isMetadata: boolean, whether the result of the analyzer is stored in metadata (True) or in children (False)
-        """
-        if lang == "es":
-            spacy.cli.download("es_core_news_sm")
-            self.nlp = spacy.load("es_core_news_sm")
-        elif lang == "en":
-            spacy.cli.download("en_core_web_sm")
-            self.nlp = spacy.load("en_core_web_sm")
-        self.lang = lang
-        self.function = function
-        self.isMetadata = isMetadata
-
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = "", analyzeMetadata: Optional[bool] = False):  # TODO
+    @abstractmethod
+    def analyze(self, functionAnalyzer, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = "", analyzeMetadata: Optional[bool] = False):  # TODO
         """Analyze a sequence
 
         Args:
@@ -40,7 +24,7 @@ class Analyzer:
         """
         if levelOfResult == "":
            if analyzeMetadata:
-                analyzeResult = sequence.filterMetadata(levelOfAnalyzer, self.function)
+                analyzeResult = sequence.filterMetadata(levelOfAnalyzer, functionAnalyzer)
                 resultOfAnalisys = []
                 for i in analyzeResult:
                     resultOfAnalisys.append(i)
@@ -57,9 +41,7 @@ class Analyzer:
                         if r == ruta[-1]:
                             for seq in child[r]:
                                 if analyzeMetadata:
-                                    analyzeResult = seq.filterMetadata(levelOfAnalyzer, self.function)
-                                    '''for i in analyzeResult:
-                                        resultOfAnalisys = i'''
+                                    analyzeResult = seq.filterMetadata(levelOfAnalyzer, functionAnalyzer)
                                     resultOfAnalisys = []
 
                                     for i in analyzeResult:
@@ -70,7 +52,7 @@ class Analyzer:
                                     seq.metadata[tag] = resultOfAnalisys
                                 else:
-                                    analyzeResult = seq.filter(levelOfAnalyzer, self.function)
+                                    analyzeResult = seq.filter(levelOfAnalyzer, functionAnalyzer)
                                     for i in analyzeResult:
                                         resultOfAnalisys = i
                                     if isinstance(resultOfAnalisys[0], Sequence):
@@ -83,147 +65,7 @@ class Analyzer:
                     else:
                         raise ValueError(f"Sequence level '{r}' not found in {child}")
-
-    # The sequence must always have a text attribute for this to work
-    # Count the number of words, number of unique words, number of characters and average word length
-    def volumetry(self, sequence, levelOfAnalyze):  # TODO: review
-        children = [sequence.children]
-        ruta = levelOfAnalyze.split("/")
-        for r in ruta:  # For each level of the path
-            for child in children:  # Look through all available sequences
-                if r in child:  # If r is inside the current sequence
-                    if r == ruta[-1]:
-                        for seq in child[r]:
-                            if "text" not in seq.metadata:
-                                raise ValueError(f"Level text not found in {seq.metadata.keys()}")
-                            else:
-                                text = seq.metadata["text"].split(" ")
-                                volumetry = {
-                                    "words": len(text),
-                                    "uniqueWords": len(set(text)),
-                                    "chars": len(seq.metadata["text"]),
-                                    "avgWordsLen": round(volumetry["chars"] / volumetry["words"])
-                                }
-                                seq.metadata["volumetry"] = volumetry
-                    else:
-                        children = [c.children for c in child[r]]
-                else:
-                    raise ValueError(f"Sequence level '{r}' not found in {child}")
-
-    def lemmas(self, sequence, levelOfAnalyze):  # TODO: review
-        children = [sequence.children]
-        ruta = levelOfAnalyze.split("/")
-        for r in ruta:  # For each level of the path
-            for child in children:  # Look through all available sequences
-                if r in child:  # If r is inside the current sequence
-                    if r == ruta[-1]:
-                        for seq in child[r]:
-                            if "text" not in seq.metadata:
-                                raise ValueError(f"Level text not found in {seq.metadata.keys()}")
-                            else:
-                                sequenceLemmas = []
-                                setLemmas = set()
-                                lemma = {}
-                                sumaLenLemmas = 0
-                                text = seq.metadata["text"]
-                                doc = self.nlp(text)
-                                for token in doc:
-                                    if token.pos_ not in ["PUNCT", "SPACE", "SYM"]:
-                                        sumaLenLemmas += len(token.lemma_)
-                                        setLemmas.add(token.lemma_)
-                                        s = Sequence("token", token.lemma_)
-                                        sequenceLemmas.append(s)
-                                lemma["uniqueLemmas"] = len(setLemmas)
-                                lemma["avgLemmasLen"] = round(sumaLenLemmas / len(sequenceLemmas))
-                                seq.metadata["lemmas"] = lemma
-                                seq.children["lemmas"] = sequenceLemmas
-                    else:
-                        children = [c.children for c in child[r]]
-                else:
-                    raise ValueError(f"Sequence level '{r}' not found in {child}")
-
-    # A token tag must exist in children; if it does not exist, it will be created
-    def pos(self, sequence, levelOfAnalyze):  # TODO: review
-        children = [sequence.children]
-        ruta = levelOfAnalyze.split("/")
-        for r in ruta:  # For each level of the path
-            for child in children:  # Look through all available sequences
-                if r in child:  # If r is inside the current sequence
-                    if r == ruta[-1]:
-                        for seq in child[r]:
-                            if "text" not in seq.metadata:
-                                raise ValueError("The sequence of the level {levelOfAnalyze} don't have atribute text")
-                            else:
-                                doc = self.nlp(seq.metadata["text"])
-                                if "tokens" not in seq.children:
-                                    # Create one
-                                    pos = []
-                                    for token in doc:
-                                        s = Sequence("token", token.text)
-                                        s.metadata["pos"] = token.pos_
-                                        pos.append(s)
-                                    seq.children["tokens"] = pos
-                                else:
-                                    pos = []
-                                    for token in doc:
-                                        pos.append(token.pos_)
-                                    for seqToken in seq.children["tokens"]:
-                                        seqToken.metadata["pos"] = pos.pop(0)
-                    else:
-                        children = [c.children for c in child[r]]
-                else:
-                    raise ValueError(f"Sequence level '{r}' not found in {child}")
-
-    '''
-    def polaridad(self, sequence, levelOfAnalyze):
-        # https://huggingface.co/finiteautomata/beto-sentiment-analysis
-        if self.lang == "es":
-            polarityClassifier = pipeline("text-classification", model='finiteautomata/beto-sentiment-analysis', return_all_scores=True)
-        elif self.lang == "en":
-            polarityClassifier = pipeline("text-classification", model='finiteautomata/bertweet-base-sentiment-analysis', return_all_scores=True)
-        children = [sequence.children]
-        ruta = levelOfAnalyze.split("/")
-        for r in ruta:  # For each level of the path
-            for child in children:  # Look through all available sequences
-                if r in child:  # If r is inside the current sequence
-                    if r == ruta[-1]:
-                        for seq in child[r]:
-                            if "text" not in seq.metadata:
-                                raise ValueError(f"Level text not found in {seq.metadata.keys()}")
-                            else:
-                                prediction = polarityClassifier(seq.metadata["text"])
-                                seq.metadata["polarity"] = prediction
-                    else:
-                        children = [c.children for c in child[r]]
-                else:
-                    raise ValueError(f"Sequence level '{r}' not found in {child}")
-        pass
-
-    def emotions(self, sequence, levelOfAnalyze):
-        if self.lang == "es":
-            emotionsClassifier = pipeline("text-classification", model='pysentimiento/robertuito-emotion-analysis', return_all_scores=True)
-        elif self.lang == "en":
-            emotionsClassifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
-        children = [sequence.children]
-        ruta = levelOfAnalyze.split("/")
-        for r in ruta:  # For each level of the path
-            for child in children:  # Look through all available sequences
-                if r in child:  # If r is inside the current sequence
-                    if r == ruta[-1]:
-                        for seq in child[r]:
-                            if "text" not in seq.metadata:
-                                raise ValueError(f"Level text not found in {seq.metadata.keys()}")
-                            else:
-                                prediction = emotionsClassifier(seq.metadata["text"])
-                                seq.metadata["emotions"] = prediction
-                    else:
-                        children = [c.children for c in child[r]]
-                else:
-                    raise ValueError(f"Sequence level '{r}' not found in {child}")'''
\ No newline at end of file
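With this refactor, Analyzer is an abstract base class whose analyze() receives the concrete metric as functionAnalyzer and handles the sequence traversal and result storage itself. A minimal sketch of a custom subclass following the same pattern the shipped analyzers use below (CharCountAnalyzer is hypothetical, not part of this commit, and textflow is assumed importable):

    from typing import Optional
    from textflow.Analyzer import Analyzer

    class CharCountAnalyzer(Analyzer):
        # Narrowed signature, as in the shipped subclasses: the metric is bound here
        # and the base class performs the traversal and stores the result under tag.
        def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
            super().analyze(self.charCount, sequence, tag, levelOfAnalyzer, levelOfResult, True)

        def charCount(self, arrayText):
            # One result per input text, mirroring the shape used by VolumetryAnalyzer.
            return [{"chars": len(text)} for text in arrayText]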
@@ -7,11 +7,13 @@ import re
 import numpy as np
 import math
 from functools import reduce
+from textflow.Analyzer import Analyzer
 
 creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt')
 
-class ComplexityAnalyzer:
-    def __init__(self, lang = "es"):
+class ComplexityAnalyzer(Analyzer):
+    def __init__(self, rutaArchivoCrea = creaPath, lang = "es"):
         """Creates an analyzer from an input object.
 
         Args:
@@ -22,11 +24,7 @@ class ComplexityAnalyzer:
         spacy.cli.download("es_core_news_sm")
         self.nlp = spacy.load("es_core_news_sm")
         # Load CREA:
-        self.dicFreqWords = self.read(creaPath)
-        self.function = self.complexity
-        '''elif lang == "en":
-            spacy.cli.download("en_core_web_sm")
-            self.nlp = spacy.load("en_core_web_sm")'''
+        self.dicFreqWords = self.read(rutaArchivoCrea)
 
     # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
     def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):  # TODO
@@ -42,7 +40,8 @@ class ComplexityAnalyzer:
 
         Raises:
             ValueError if the levelOfResult is incorrect
         """
-        if levelOfResult == "":
+        super().analyze(self.complexity, sequence, tag, levelOfAnalyzer, levelOfResult, True)
+        '''if levelOfResult == "":
             analyzeResult = sequence.filterMetadata(levelOfAnalyzer, self.function)  # TODO
             resultOfAnalisys = []
             for i in analyzeResult:
@@ -64,7 +63,7 @@ class ComplexityAnalyzer:
                     else:
                         children = [c.children for c in child[r]]
                 else:
-                    raise ValueError(f"Sequence level '{r}' not found in {child}")
+                    raise ValueError(f"Sequence level '{r}' not found in {child}") '''
 
     def read(self, fichero):

...
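The constructor change makes the CREA frequency list injectable instead of hard-coded. A hedged usage sketch (the module path and the custom file are assumptions; the default still reads the bundled Crea-5000.txt next to the module):

    from textflow.ComplexityAnalyzer import ComplexityAnalyzer  # import path assumed

    default_analyzer = ComplexityAnalyzer()  # falls back to the bundled Crea-5000.txt
    custom_analyzer = ComplexityAnalyzer(rutaArchivoCrea="/data/crea_full.txt")  # hypothetical file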
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch

class EmotionAnalyzer(Analyzer):
    def __init__(self, task = "text-classification", modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
        """Creates an analyzer from an input object.

        Args:
            task: the transformers pipeline task to run
            modelEmotions: the model used to classify emotions
            allScores: boolean, whether the classifier returns the scores of all labels
        """
        self.emotionsClassifier = pipeline(task, model=modelEmotions, return_all_scores=allScores)

    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):  # TODO
        """Analyze a sequence

        Args:
            sequence: the Sequence we want to analyze
            tag: the label to store the analysis result
            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)

        Raises:
            ValueError if the levelOfResult is incorrect
        """
        super().analyze(self.emotions, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def emotions(self, arrayText):
        arrayResults = []
        for text in arrayText:
            prediction = self.emotionsClassifier(text)
            # arrayResults.append(prediction[0][0])
            arrayResults.append(prediction)
        return arrayResults
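A hedged usage sketch of the metric on its own (the import path is assumed; the model weights are downloaded on first use, and with return_all_scores=True each text yields the full label distribution):

    from textflow.EmotionAnalyzer import EmotionAnalyzer  # import path assumed

    analyzer = EmotionAnalyzer()
    results = analyzer.emotions(["no te imaginas lo feliz que estoy"])
    print(results[0])  # e.g. [[{'label': 'joy', 'score': 0.98}, ...]] per input text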
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer

spacy.cli.download("es_core_news_sm")

class LemmaAnalyzer(Analyzer):
    def __init__(self, nlp = spacy.load("es_core_news_sm"), posNoContent = ["PUNCT", "SPACE", "SYM"]):
        """Creates an analyzer from an input object.

        Args:
            nlp: the spaCy pipeline used to lemmatize
            posNoContent: POS tags skipped as carrying no content
        """
        self.nlp = nlp
        self.posNoContent = posNoContent

    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):  # TODO
        """Analyze a sequence

        Args:
            sequence: the Sequence we want to analyze
            tag: the label to store the analysis result
            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)

        Raises:
            ValueError if the levelOfResult is incorrect
        """
        super().analyze(self.lemmas, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def lemmas(self, arrayText):
        arrayResult = []
        for text in arrayText:
            sequenceLemmas = []
            setLemmas = set()
            sumaLenLemmas = 0
            doc = self.nlp(text)
            for token in doc:
                if token.pos_ not in self.posNoContent:
                    sumaLenLemmas += len(token.lemma_)
                    setLemmas.add(token.lemma_)
                    sequenceLemmas.append(token.lemma_)
            lemma = {
                "srclemmas": sequenceLemmas,
                "uniqueLemmas": len(setLemmas),
                "avgLemmas": round(sumaLenLemmas / len(sequenceLemmas))
            }
            arrayResult.append(lemma)
        return arrayResult
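A hedged usage sketch (import path assumed; note the module downloads es_core_news_sm at import time, so the first run needs network access):

    from textflow.LemmaAnalyzer import LemmaAnalyzer  # import path assumed

    result = LemmaAnalyzer().lemmas(["los gatos duermen"])
    print(result[0]["srclemmas"])     # e.g. ['el', 'gato', 'dormir']
    print(result[0]["uniqueLemmas"])  # e.g. 3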
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer

spacy.cli.download("es_core_news_sm")

class POSAnalyzer(Analyzer):
    def __init__(self, nlp = spacy.load("es_core_news_sm")):
        """Creates an analyzer from an input object.

        Args:
            nlp: the spaCy pipeline used for part-of-speech tagging
        """
        self.nlp = nlp

    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):  # TODO
        """Analyze a sequence

        Args:
            sequence: the Sequence we want to analyze
            tag: the label to store the analysis result
            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)

        Raises:
            ValueError if the levelOfResult is incorrect
        """
        super().analyze(self.pos, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def pos(self, arrayText):
        arrayResults = []
        for text in arrayText:
            srcPOS = []
            dicFreqPOS = {}
            doc = self.nlp(text)
            for token in doc:
                srcPOS.append(token.pos_)
                if token.pos_ in dicFreqPOS:
                    dicFreqPOS[token.pos_] += 1
                else:
                    dicFreqPOS[token.pos_] = 1
            pos = {
                "srcPOS": srcPOS,
                "FreqPOS": dicFreqPOS
            }
            arrayResults.append(pos)
        return arrayResults
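A hedged sketch of the metric's output shape, mirroring the dict built in pos() above (import path assumed):

    from textflow.POSAnalyzer import POSAnalyzer  # import path assumed

    result = POSAnalyzer().pos(["el gato duerme"])
    print(result[0]["srcPOS"])   # e.g. ['DET', 'NOUN', 'VERB']
    print(result[0]["FreqPOS"])  # e.g. {'DET': 1, 'NOUN': 1, 'VERB': 1}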
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch

class PolarityAnalyzer(Analyzer):
    def __init__(self, task = "text-classification", modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
        """Creates an analyzer from an input object.

        Args:
            task: the transformers pipeline task to run
            modelPolarity: the model used to classify polarity
            allScores: boolean, whether the classifier returns the scores of all labels
        """
        self.polarityClassifier = pipeline(task, model=modelPolarity, return_all_scores=allScores)

    # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):  # TODO
        """Analyze a sequence

        Args:
            sequence: the Sequence we want to analyze
            tag: the label to store the analysis result
            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)

        Raises:
            ValueError if the levelOfResult is incorrect
        """
        super().analyze(self.polarity, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def polarity(self, arrayText):
        arrayResults = []
        for text in arrayText:
            prediction = self.polarityClassifier(text)
            # arrayResults.append(prediction[0][0])
            arrayResults.append(prediction)
        return arrayResults
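PolarityAnalyzer mirrors EmotionAnalyzer with a sentiment model; any text-classification checkpoint can be injected, for example the English model mentioned in the code removed from Analyzer.py. A hedged sketch (import path assumed):

    from textflow.PolarityAnalyzer import PolarityAnalyzer  # import path assumed

    es = PolarityAnalyzer()  # default: finiteautomata/beto-sentiment-analysis
    en = PolarityAnalyzer(modelPolarity='finiteautomata/bertweet-base-sentiment-analysis')
    print(en.polarity(["what a great day"])[0])  # e.g. [[{'label': 'POS', 'score': ...}, ...]]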
@@ -4,6 +4,7 @@ from nltk.tokenize import TreebankWordTokenizer
 from nltk.tokenize import WhitespaceTokenizer
 from nltk.tokenize import SpaceTokenizer
 from nltk.tokenize import WordPunctTokenizer
+from nltk.tokenize import RegexpTokenizer
 
 class SequenceIterator:

...
+import string
 from typing import Optional
-import spacy
-import spacy.cli
+#import spacy
+#import spacy.cli
+from nltk.text import Text
+from nltk.tokenize import WhitespaceTokenizer
+import math
+from textflow.Analyzer import Analyzer
 
-class StylometryyAnalyzer: #TODO
-    def __init__(self, lang = "es"):
-        if lang == "es":
-            spacy.cli.download("es_core_news_sm")
-            self.nlp = spacy.load("es_core_news_sm")
-        self.function = self.stylometry
-        pass
+class StylometryAnalyzer(Analyzer): #TODO
+    def __init__(self, stopwords, puntuation = string.punctuation, tokenizer = WhitespaceTokenizer()):
+        self.stopwords = stopwords
+        self.puntuation = puntuation
+        self.tokenizer = tokenizer
 
     # This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of metadata
-    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""): #TODO
-        """Analyze a sequence
-
-        Args:
-            sequence: the Sequence we want to analyze
-            tag: the label to store the analysis result
-            levelOfAnalyzer: the path of the sequence level to analyze inside of the result (the subsequence to analyze within the sequence where we want to store the result)
-            levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
-            analyzeMetadata: boolean, whether the analyzer is applied to metadata (True) or to children (False)
-
-        Raises:
-            ValueError if the levelOfResult is incorrect
-        """
-        if levelOfResult == "":
-            analyzeResult = sequence.filterMetadata(levelOfAnalyzer, self.function) #TODO
-            resultOfAnalisys = []
-            for i in analyzeResult:
-                resultOfAnalisys.append(i)
-            sequence.metadata[tag] = resultOfAnalisys
-        else:
-            children = [sequence.children]
-            ruta = levelOfResult.split("/")
-            for r in ruta:  # For each level of the path
-                for child in children:  # Look through all available sequences
-                    if r in child:  # If r is inside the current sequence
-                        if r == ruta[-1]:
-                            for seq in child[r]:
-                                analyzeResult = seq.filterMetadata(levelOfAnalyzer, self.function)
-                                resultOfAnalisys = []
-                                for i in analyzeResult:
-                                    resultOfAnalisys.append(i)
-                                seq.metadata[tag] = resultOfAnalisys
-                        else:
-                            children = [c.children for c in child[r]]
-                    else:
-                        raise ValueError(f"Sequence level '{r}' not found in {child}")
-
-    def stylometry(self):
-        pass
+    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
+        super().analyze(self.stylometry, sequence, tag, levelOfAnalyzer, levelOfResult, True)
+
+    def stylometry(self, arrayText):
+        resultsList = []
+        for t in arrayText:
+            #doc = self.nlp(text)
+            t.lower()
+            tokens = self.tokenizer.tokenize(t)
+            text = [token.lower() for token in tokens]
+            self.freqWords(text, self.stopwords, self.puntuation)
+            self.funcionesTTR(text)
+            result = {
+                "uniqueWords": len(self.uniqueWords),
+                "TTR": self.TTR,
+                "RTTR": self.RTTR,
+                "Herdan": self.herdan,
+                "Mass": self.mass,
+                "Somers": self.somers,
+                "Dugast": self.dugast,
+                "Honore": self.honore,
+                "FreqStopWords": self.freqStopWords,
+                "FreqPuntuationMarks": self.freqPuntuationMarks,
+                "FreqWords": self.freqWord
+            }
+            resultsList.append(result)
+        return resultsList
+
+    def funcionesTTR(self, text):
+        self.uniqueWords = [token[0] for token in self.freqWord]
+        self.numWordFreqOne = len([token[0] for token in self.freqWord if token[1] == 1])
+        self.TTR = len(self.uniqueWords) / len(text)
+        self.RTTR = len(self.uniqueWords) / math.sqrt(len(text))
+        self.herdan = math.log(len(self.uniqueWords), 10) / math.log(len(text), 10)
+        self.mass = (math.log(len(text), 10) - math.log(len(self.uniqueWords), 10)) / pow(math.log(len(self.uniqueWords), 10), 2)
+        self.somers = math.log(math.log(len(self.uniqueWords), 10), 10) / math.log(math.log(len(text), 10), 10)
+        if math.log(len(text), 10) - math.log(len(self.uniqueWords), 10) == 0:
+            self.dugast = pow(math.log(len(text), 10), 2)
+        else:
+            self.dugast = pow(math.log(len(text), 10), 2) / (math.log(len(text), 10) - math.log(len(self.uniqueWords), 10))
+        if 1 - (self.numWordFreqOne / len(self.uniqueWords)) == 0:
+            self.honore = 100 * (math.log(len(text), 10))
+        else:
+            self.honore = 100 * (math.log(len(text), 10) / (1 - (self.numWordFreqOne / len(self.uniqueWords))))
+
+    def freqWords(self, tokens, stopWords, puntuationMarks):
+        freqStopWords = {}
+        freqPuntuationMarks = {}
+        freqWord = {}
+        for token in tokens:
+            if token in stopWords:
+                if token in freqStopWords:
+                    freqStopWords[token] += 1
+                else:
+                    freqStopWords[token] = 1
+            elif token in puntuationMarks:
+                if token in freqPuntuationMarks:
+                    freqPuntuationMarks[token] += 1
+                else:
+                    freqPuntuationMarks[token] = 1
+            else:
+                if token in freqWord:
+                    freqWord[token] += 1
+                else:
+                    freqWord[token] = 1
+        self.freqWord = sorted(freqWord.items(), reverse = True)
+        self.freqPuntuationMarks = sorted(freqPuntuationMarks.items(), reverse = True)
+        self.freqStopWords = sorted(freqStopWords.items(), reverse = True)
\ No newline at end of file
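Note the class now expects the caller to supply the stopword list (e.g. nltk.corpus.stopwords.words("spanish")). The lexical-diversity measures computed in funcionesTTR (TTR, root TTR, Herdan's C, Maas, Somers, Dugast, Honoré) can be checked standalone; a self-contained sketch of the same formulas on a toy token list, with comments marking where the zero-denominator guards above would kick in:

    import math

    tokens = ["el", "gato", "y", "el", "perro", "duermen"]
    types = set(tokens)
    N, V = len(tokens), len(types)
    V1 = sum(1 for t in types if tokens.count(t) == 1)  # types occurring exactly once

    print("TTR    ", V / N)
    print("RTTR   ", V / math.sqrt(N))
    print("Herdan ", math.log(V, 10) / math.log(N, 10))
    print("Maas   ", (math.log(N, 10) - math.log(V, 10)) / math.log(V, 10) ** 2)
    print("Somers ", math.log(math.log(V, 10), 10) / math.log(math.log(N, 10), 10))
    print("Dugast ", math.log(N, 10) ** 2 / (math.log(N, 10) - math.log(V, 10)))  # guarded above when N == V
    print("Honore ", 100 * math.log(N, 10) / (1 - V1 / V))                        # guarded above when V1 == V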
from typing import Optional
from textflow.Sequence import Sequence
from nltk.tokenize import WhitespaceTokenizer
from textflow.Analyzer import Analyzer

class VolumetryAnalyzer(Analyzer):
    def __init__(self, tokenizer = WhitespaceTokenizer()):
        """Creates an analyzer from an input object.

        Args:
            tokenizer: the tokenizer used to split the text into words
        """
        self.tokenizer = tokenizer

    def volumetry(self, arrayText):
        arrayResults = []
        for texts in arrayText:
            text = self.tokenizer.tokenize(texts)
            dicResults = {
                "words": len(text),
                "uniqueWords": len(set(text)),
                "chars": len(texts),
                "avgWordsLen": round(len(texts) / len(text))
            }
            arrayResults.append(dicResults)
        return arrayResults

    # The sequence must always have a text attribute (metadata) for this to work
    # Counts the number of words, number of unique words, number of characters and average word length
    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
        super().analyze(self.volumetry, sequence, tag, levelOfAnalyzer, levelOfResult, True)
        '''children = [sequence.children]
        ruta = levelOfAnalyze.split("/")
        for r in ruta:  # For each level of the path
            for child in children:  # Look through all available sequences
                if r in child:  # If r is inside the current sequence
                    if r == ruta[-1]:
                        for seq in child[r]:
                            if "text" not in seq.metadata:
                                raise ValueError(f"Level text not found in {seq.metadata.keys()}")
                            else:
                                text = seq.metadata["text"].split(" ")
                                volumetry = {
                                    "words": len(text),
                                    "uniqueWords": len(set(text)),
                                    "chars": len(seq.metadata["text"]),
                                    "avgWordsLen": round(volumetry["chars"] / volumetry["words"])
                                }
                                seq.metadata["volumetry"] = volumetry
                    else:
                        children = [c.children for c in child[r]]
                else:
                    raise ValueError(f"Sequence level '{r}' not found in {child}")'''
\ No newline at end of file
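A hedged usage sketch of the tokenizer-driven metric (import path assumed); with the default WhitespaceTokenizer the counts are straightforward:

    from textflow.VolumetryAnalyzer import VolumetryAnalyzer  # import path assumed

    v = VolumetryAnalyzer()
    print(v.volumetry(["uno dos dos"]))
    # [{'words': 3, 'uniqueWords': 2, 'chars': 11, 'avgWordsLen': 4}]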