Implementing some analyzers

parent f3e06440
......@@ -60,7 +60,7 @@ unicode_backport = ["unicodedata2"]
[[package]]
name = "click"
version = "8.1.2"
version = "8.1.3"
description = "Composable command line interface toolkit"
category = "main"
optional = false
......@@ -87,7 +87,7 @@ python-versions = "*"
[[package]]
name = "filelock"
version = "3.6.0"
version = "3.7.0"
description = "A platform independent file lock."
category = "main"
optional = false
......@@ -98,16 +98,8 @@ docs = ["furo (>=2021.8.17b43)", "sphinx (>=4.1)", "sphinx-autodoc-typehints (>=
testing = ["covdefaults (>=1.2.0)", "coverage (>=4)", "pytest (>=4)", "pytest-cov", "pytest-timeout (>=1.4.2)"]
[[package]]
name = "functools"
version = "0.5"
description = "Fast tools for functional programming"
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "huggingface-hub"
version = "0.5.1"
version = "0.6.0"
description = "Client library to download and publish models on the huggingface.co hub"
category = "main"
optional = false
......@@ -124,6 +116,7 @@ typing-extensions = ">=3.7.4.3"
[package.extras]
all = ["pytest", "datasets", "soundfile", "black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)"]
dev = ["pytest", "datasets", "soundfile", "black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)"]
fastai = ["toml", "fastai (>=2.4)", "fastcore (>=1.3.27)"]
quality = ["black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)"]
tensorflow = ["tensorflow", "pydot", "graphviz"]
testing = ["pytest", "datasets", "soundfile"]
......@@ -180,7 +173,7 @@ python-versions = ">=3.7"
[[package]]
name = "more-itertools"
version = "8.12.0"
version = "8.13.0"
description = "More routines for operating on iterables, beyond itertools"
category = "dev"
optional = false
......@@ -301,14 +294,14 @@ email = ["email-validator (>=1.0.3)"]
[[package]]
name = "pyparsing"
version = "3.0.7"
description = "Python parsing module"
version = "3.0.9"
description = "pyparsing module - Classes and methods to define and execute parsing grammars"
category = "main"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.6.8"
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
diagrams = ["railroad-diagrams", "jinja2"]
[[package]]
name = "pytest"
......@@ -367,29 +360,6 @@ socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]
[[package]]
name = "sacremoses"
version = "0.0.53"
description = "SacreMoses"
category = "main"
optional = false
python-versions = "*"
[package.dependencies]
click = "*"
joblib = "*"
regex = "*"
six = "*"
tqdm = "*"
[[package]]
name = "six"
version = "1.16.0"
description = "Python 2 and 3 compatibility utilities"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "smart-open"
version = "5.2.1"
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
......@@ -540,6 +510,17 @@ docs = ["sphinx", "sphinx-rtd-theme", "setuptools-rust"]
testing = ["pytest", "requests", "numpy", "datasets"]
[[package]]
name = "torch"
version = "1.11.0"
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
category = "main"
optional = false
python-versions = ">=3.7.0"
[package.dependencies]
typing-extensions = "*"
[[package]]
name = "tqdm"
version = "4.64.0"
description = "Fast, Extensible Progress Meter"
......@@ -558,11 +539,11 @@ telegram = ["requests"]
[[package]]
name = "transformers"
version = "4.18.0"
description = "State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch"
version = "4.19.1"
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
category = "main"
optional = false
python-versions = ">=3.6.0"
python-versions = ">=3.7.0"
[package.dependencies]
filelock = "*"
......@@ -572,22 +553,22 @@ packaging = ">=20.0"
pyyaml = ">=5.1"
regex = "!=2019.12.17"
requests = "*"
sacremoses = "*"
tokenizers = ">=0.11.1,<0.11.3 || >0.11.3,<0.13"
tqdm = ">=4.27"
[package.extras]
all = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2)", "jaxlib (>=0.1.65)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)"]
all = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)"]
audio = ["librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
codecarbon = ["codecarbon (==1.2.0)"]
deepspeed = ["deepspeed (>=0.6.0)"]
dev = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2)", "jaxlib (>=0.1.65)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)", "faiss-cpu", "cookiecutter (==1.7.3)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "hf-doc-builder", "scikit-learn"]
dev-tensorflow = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)", "faiss-cpu", "cookiecutter (==1.7.3)", "tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "pillow", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "hf-doc-builder", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
dev-torch = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)", "faiss-cpu", "cookiecutter (==1.7.3)", "torch (>=1.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "hf-doc-builder", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
docs = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2)", "jaxlib (>=0.1.65)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "hf-doc-builder"]
deepspeed = ["deepspeed (>=0.6.4)"]
deepspeed-testing = ["deepspeed (>=0.6.4)", "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)", "optuna"]
dev = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "hf-doc-builder", "scikit-learn"]
dev-tensorflow = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)", "tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "pillow", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "hf-doc-builder", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
dev-torch = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)", "torch (>=1.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "unidic-lite (>=1.0.7)", "unidic (>=1.0.2)", "hf-doc-builder", "scikit-learn", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
docs = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx", "torch (>=1.0)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "flax (>=0.3.5)", "optax (>=0.0.8)", "sentencepiece (>=0.1.91,!=0.1.92)", "protobuf", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer", "pillow", "optuna", "ray", "sigopt", "timm", "codecarbon (==1.2.0)", "hf-doc-builder"]
docs_specific = ["hf-doc-builder"]
fairscale = ["fairscale (>0.3)"]
flax = ["jax (>=0.2.8,!=0.3.2)", "jaxlib (>=0.1.65)", "flax (>=0.3.5)", "optax (>=0.0.8)"]
flax = ["jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "flax (>=0.3.5)", "optax (>=0.0.8)"]
flax-speech = ["librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
ftfy = ["ftfy"]
integrations = ["optuna", "ray", "sigopt"]
......@@ -596,7 +577,7 @@ modelcreation = ["cookiecutter (==1.7.3)"]
onnx = ["onnxconverter-common", "tf2onnx", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
optuna = ["optuna"]
quality = ["black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)"]
quality = ["black (>=22.0,<23.0)", "isort (>=5.5.4)", "flake8 (>=3.8.3)", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)"]
ray = ["ray"]
retrieval = ["faiss-cpu", "datasets"]
sagemaker = ["sagemaker (>=2.31.0)"]
......@@ -605,7 +586,7 @@ serving = ["pydantic", "uvicorn", "fastapi", "starlette"]
sigopt = ["sigopt"]
sklearn = ["scikit-learn"]
speech = ["torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
testing = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.2.0)", "faiss-cpu", "cookiecutter (==1.7.3)"]
testing = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black (>=22.0,<23.0)", "sacrebleu (>=1.4.12,<2.0.0)", "rouge-score", "nltk", "GitPython (<3.1.19)", "hf-doc-builder (>=0.3.0)", "sacremoses", "rjieba", "faiss-cpu", "cookiecutter (==1.7.3)"]
tf = ["tensorflow (>=2.3)", "onnxconverter-common", "tf2onnx"]
tf-cpu = ["tensorflow-cpu (>=2.3)", "onnxconverter-common", "tf2onnx"]
tf-speech = ["librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
......@@ -613,7 +594,7 @@ timm = ["timm"]
tokenizers = ["tokenizers (>=0.11.1,!=0.11.3,<0.13)"]
torch = ["torch (>=1.0)"]
torch-speech = ["torchaudio", "librosa", "pyctcdecode (>=0.3.0)", "phonemizer"]
torchhub = ["filelock", "huggingface-hub (>=0.1.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.0)", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "tqdm (>=4.27)"]
torchhub = ["filelock", "huggingface-hub (>=0.1.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.0)", "tokenizers (>=0.11.1,!=0.11.3,<0.13)", "tqdm (>=4.27)"]
vision = ["pillow"]
[[package]]
......@@ -672,8 +653,8 @@ python-versions = "*"
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "d3639d0f322d79260a5fe40ea43817a72bcd16885b66b7a1b1bb3ec355d37264"
python-versions = "3.8"
content-hash = "f559d5695f1365c162f02c2146df48de52ad2d38e1b4a26476c7a662dc065365"
[metadata.files]
atomicwrites = [
......@@ -715,8 +696,8 @@ charset-normalizer = [
{file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"},
]
click = [
{file = "click-8.1.2-py3-none-any.whl", hash = "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e"},
{file = "click-8.1.2.tar.gz", hash = "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"},
{file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
{file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
]
colorama = [
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
......@@ -741,15 +722,12 @@ cymem = [
{file = "cymem-2.0.6.tar.gz", hash = "sha256:169725b5816959d34de2545b33fee6a8021a6e08818794a426c5a4f981f17e5e"},
]
filelock = [
{file = "filelock-3.6.0-py3-none-any.whl", hash = "sha256:f8314284bfffbdcfa0ff3d7992b023d4c628ced6feb957351d4c48d059f56bc0"},
{file = "filelock-3.6.0.tar.gz", hash = "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85"},
]
functools = [
{file = "functools-0.5.tar.gz", hash = "sha256:596ed8999dee419c0749a41bfdd82e4697e80ea27ee01c716003ef55be9a54c5"},
{file = "filelock-3.7.0-py3-none-any.whl", hash = "sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6"},
{file = "filelock-3.7.0.tar.gz", hash = "sha256:b795f1b42a61bbf8ec7113c341dad679d772567b936fbd1bf43c9a238e673e20"},
]
huggingface-hub = [
{file = "huggingface_hub-0.5.1-py3-none-any.whl", hash = "sha256:b9fd1f567a3fb16e73acc613e78d075d1926d4b0c5c56ba08c4f125707b50c70"},
{file = "huggingface_hub-0.5.1.tar.gz", hash = "sha256:d90d657dca0d6a577f640ff684a58da8e5c76258e485100e885a0e7307e2eb12"},
{file = "huggingface_hub-0.6.0-py3-none-any.whl", hash = "sha256:585d72adade562a1f7038acf39eb7677b7649bdc0ce082b70f99e01164d9d8b5"},
{file = "huggingface_hub-0.6.0.tar.gz", hash = "sha256:f5109065222185d129933d44159e483a9e3378c577127d0281e4c921dfadbd23"},
]
idna = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
......@@ -810,8 +788,8 @@ markupsafe = [
{file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
]
more-itertools = [
{file = "more-itertools-8.12.0.tar.gz", hash = "sha256:7dc6ad46f05f545f900dd59e8dfb4e84a4827b97b3cfecb175ea0c7d247f6064"},
{file = "more_itertools-8.12.0-py3-none-any.whl", hash = "sha256:43e6dd9942dffd72661a2c4ef383ad7da1e6a3e968a927ad7a6083ab410a688b"},
{file = "more-itertools-8.13.0.tar.gz", hash = "sha256:a42901a0a5b169d925f6f217cd5a190e32ef54360905b9c39ee7db5313bfec0f"},
{file = "more_itertools-8.13.0-py3-none-any.whl", hash = "sha256:c5122bffc5f104d37c1626b8615b511f3427aa5389b94d61e5ef8236bfbc3ddb"},
]
murmurhash = [
{file = "murmurhash-1.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:966d2efec6e01aa32c5774c44906724efca00da3507f06faa11acafb47ea1230"},
......@@ -916,8 +894,8 @@ pydantic = [
{file = "pydantic-1.8.2.tar.gz", hash = "sha256:26464e57ccaafe72b7ad156fdaa4e9b9ef051f69e175dbbb463283000c05ab7b"},
]
pyparsing = [
{file = "pyparsing-3.0.7-py3-none-any.whl", hash = "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484"},
{file = "pyparsing-3.0.7.tar.gz", hash = "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea"},
{file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
]
pytest = [
{file = "pytest-5.4.3-py3-none-any.whl", hash = "sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1"},
......@@ -1038,13 +1016,6 @@ requests = [
{file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"},
{file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"},
]
sacremoses = [
{file = "sacremoses-0.0.53.tar.gz", hash = "sha256:43715868766c643b35de4b8046cce236bfe59a7fa88b25eaf6ddf02bacf53a7a"},
]
six = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
smart-open = [
{file = "smart_open-5.2.1-py3-none-any.whl", hash = "sha256:71d14489da58b60ce12fc3ecb823facc59a8b23cd1b58edb97175640350d3a62"},
{file = "smart_open-5.2.1.tar.gz", hash = "sha256:75abf758717a92a8f53aa96953f0c245c8cedf8e1e4184903db3659b419d4c17"},
......@@ -1146,13 +1117,34 @@ tokenizers = [
{file = "tokenizers-0.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:2158baf80cbc09259bfd6e0e0fc4597b611e7a72ad5443dad63918a90f1dd304"},
{file = "tokenizers-0.12.1.tar.gz", hash = "sha256:070746f86efa6c873db341e55cf17bb5e7bdd5450330ca8eca542f5c3dab2c66"},
]
torch = [
{file = "torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:62052b50fffc29ca7afc0c04ef8206b6f1ca9d10629cb543077e12967e8d0398"},
{file = "torch-1.11.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:866bfba29ac98dec35d893d8e17eaec149d0ac7a53be7baae5c98069897db667"},
{file = "torch-1.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:951640fb8db308a59d9b510e7d1ad910aff92913323bbe4bc75435347ddd346d"},
{file = "torch-1.11.0-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:5d77b5ece78fdafa5c7f42995ff9474399d22571cd6b2de21a5d666306a2ff8c"},
{file = "torch-1.11.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:b5a38682769b544c875ecc34bcb81fbad5c922139b61319aacffcfd8a32f528c"},
{file = "torch-1.11.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f82d77695a60626f2b7382d85bc566de8a6b3e50d32080755abc040db802e419"},
{file = "torch-1.11.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b96654d42566080a134e784705f33f8536b3b95b5dcde357ed7879b1692a5f78"},
{file = "torch-1.11.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8ee7c2e8d7f7020d5bfbc1bb91b9591044c26bbd0cee5e4f694cfd7ed8649260"},
{file = "torch-1.11.0-cp37-none-macosx_10_9_x86_64.whl", hash = "sha256:6860b1d1bf0bb0b67a6bd47f85a0e4c825b518eea13b5d6101999dbbcbd5bc0c"},
{file = "torch-1.11.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:4322aa29f50da7f404db06cdf30896ea67b09f673af4a985afc7162bc897864d"},
{file = "torch-1.11.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e4d2e0ddd652f30e94cff750220324ec45705d4ecc69658f773b3cb1c7a28dd0"},
{file = "torch-1.11.0-cp38-cp38-win_amd64.whl", hash = "sha256:34ce5ea4d8d85da32cdbadb50d4585106901e9f8a3527991daa70c13a09de1f7"},
{file = "torch-1.11.0-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:0ccc85cd06227a3edf809e2c795fd5762c3d4e8a38b5c9f744c6e7cf841361bb"},
{file = "torch-1.11.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:c1554e49d74f1b2c3e7202d77056ba2dd7465437585bac64062b580f714a44e9"},
{file = "torch-1.11.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:58c7814502b1c129a650d7092033bbb0bbd64faf1a7941631aaa1aeaddc37570"},
{file = "torch-1.11.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:831cf588f01dda9409e75576741d2823453990dee2983d670f2584b37a01adf7"},
{file = "torch-1.11.0-cp39-cp39-win_amd64.whl", hash = "sha256:44a1d02fd20f827f0f36dc26fdcfc45e793806a6ad52769a22260655a77a4369"},
{file = "torch-1.11.0-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:50fd9bf85c578c871c28f1cb0ace9dfc6024401c7f399b174fb0f370899f4454"},
{file = "torch-1.11.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:0e48af66ad755f0f9c5f2664028a414f57c49d6adc37e77e06fe0004da4edb61"},
]
tqdm = [
{file = "tqdm-4.64.0-py2.py3-none-any.whl", hash = "sha256:74a2cdefe14d11442cedf3ba4e21a3b84ff9a2dbdc6cfae2c34addb2a14a5ea6"},
{file = "tqdm-4.64.0.tar.gz", hash = "sha256:40be55d30e200777a307a7585aee69e4eabb46b4ec6a4b4a5f2d9f11e7d5408d"},
]
transformers = [
{file = "transformers-4.18.0-py3-none-any.whl", hash = "sha256:6ae54fc29bd4bba5b0230d429cb55b8b3eb5feb9e3c9913c61203999f1f0c2c9"},
{file = "transformers-4.18.0.tar.gz", hash = "sha256:16f7751c44f31d8f9a3811bccd80f1995e1cb0ffd9b7de60ef6ede2ab90a6fd4"},
{file = "transformers-4.19.1-py3-none-any.whl", hash = "sha256:16d3dd257d459c2598e2548a9e6875c10b7db5e44494d93b3c0a5c60afad667f"},
{file = "transformers-4.19.1.tar.gz", hash = "sha256:6fb30ee534a25b6b3fc7064c280b7f44abf8c9bd1fb358860ebe4fd392bf15f5"},
]
typer = [
{file = "typer-0.4.1-py3-none-any.whl", hash = "sha256:e8467f0ebac0c81366c2168d6ad9f888efdfb6d4e1d3d5b4a004f46fa444b5c3"},
......
......@@ -5,14 +5,16 @@ description = "A text analysis library for Python"
authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@red.ujaen.es>"]
[tool.poetry.dependencies]
python = "^3.8"
python = "3.8"
nltk = "^3.7"
spacy = "^3.3.0"
transformers = "^4.18.0"
transformers = "^4.19.0"
torch = {version = "^1.11.0", python = "^3.7", platform = "linux"}
[tool.poetry.dev-dependencies]
pytest = "^5.2"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
......@@ -3,29 +3,13 @@ import spacy
import spacy.cli
from typing import Optional
from textflow.Sequence import Sequence
#from transformers import pipeline
from abc import ABC, abstractmethod
class Analyzer:
class Analyzer(ABC):
def __init__(self, function, isMetadata: Optional[bool] = False,lang : Optional[str] = "es"):
"""Creates an analyzer from an input object.
Args:
function: the function of the analyzer like count word, files...
isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
"""
if lang == "es":
spacy.cli.download("es_core_news_sm")
self.nlp = spacy.load("es_core_news_sm")
elif lang == "en":
spacy.cli.download("en_core_web_sm")
self.nlp = spacy.load("en_core_web_sm")
self.lang = lang
self.function = function
self.isMetadata = isMetadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = "", analyzeMetadata: Optional[bool] = False): #TODO
@abstractmethod
def analyze(self, functionAnalyzer,sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = "", analyzeMetadata: Optional[bool] = False): #TODO
"""Analyze a sequence
Args:
......@@ -40,7 +24,7 @@ class Analyzer:
"""
if levelOfResult == "":
if analyzeMetadata:
analyzeResult = sequence.filterMetadata(levelOfAnalyzer, self.function)
analyzeResult = sequence.filterMetadata(levelOfAnalyzer, functionAnalyzer)
resultOfAnalisys= []
for i in analyzeResult:
resultOfAnalisys.append(i)
......@@ -57,9 +41,7 @@ class Analyzer:
if r == ruta[-1]:
for seq in child[r]:
if analyzeMetadata:
analyzeResult = seq.filterMetadata(levelOfAnalyzer, self.function)
'''for i in analyzeResult:
resultOfAnalisys = i'''
analyzeResult = seq.filterMetadata(levelOfAnalyzer, functionAnalyzer)
resultOfAnalisys= []
for i in analyzeResult:
......@@ -70,7 +52,7 @@ class Analyzer:
seq.metadata[tag] = resultOfAnalisys
else:
analyzeResult = seq.filter(levelOfAnalyzer, self.function)
analyzeResult = seq.filter(levelOfAnalyzer, functionAnalyzer)
for i in analyzeResult:
resultOfAnalisys = i
if isinstance(resultOfAnalisys[0], Sequence):
......@@ -84,146 +66,6 @@ class Analyzer:
raise ValueError(f"Sequence level '{r}' not found in {child}")
#The sequence must always have a text attribute for this to work
#Count the number of words, number of unique words, number of characters and the average number of characters
def volumetry(self,sequence,levelOfAnalyze): #TODO: review
children = [sequence.children]
ruta = levelOfAnalyze.split("/")
for r in ruta: #For each level of the path
for child in children: #Look through every available sequence
if r in child: #If r is inside the current sequence
if r == ruta[-1]:
for seq in child[r]:
if "text" not in seq.metadata:
raise ValueError(f"Level text not found in {seq.metadata.keys()}")
else:
text = seq.metadata["text"].split(" ")
volumetry= {
"words" : len(text),
"uniqueWords" : len(set(text)),
"chars" : len(seq.metadata["text"]),
"avgWordsLen" : round(volumetry["chars"] / volumetry["words"])
}
seq.metadata["volumetry"] = volumetry
else:
children = [c.children for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")
def lemmas(self, sequence, levelOfAnalyze): #TODO: review
children = [sequence.children]
ruta = levelOfAnalyze.split("/")
for r in ruta: #For each level of the path
for child in children: #Look through every available sequence
if r in child: #If r is inside the current sequence
if r == ruta[-1]:
for seq in child[r]:
if "text" not in seq.metadata:
raise ValueError(f"Level text not found in {seq.metadata.keys()}")
else:
sequenceLemmas = []
setLemmas = set()
lemma ={}
sumaLenLemmas=0
text = seq.metadata["text"]
doc= self.nlp(text)
for token in doc:
if token.pos_ not in ["PUNCT", "SPACE", "SYM"]:
sumaLenLemmas += len(token.lemma_)
setLemmas.add(token.lemma_)
s = Sequence("token",token.lemma_)
sequenceLemmas.append(s)
lemma["uniqueLemmas"] = len(setLemmas)
lemma["avgLemmasLen"] = round(sumaLenLemmas/len(sequenceLemmas))
seq.metadata["lemmas"] = lemma
seq.children["lemmas"] = sequenceLemmas
else:
children = [c.children for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")
\ No newline at end of file
#A "token" label must exist in children; if it does not, it will be created
def pos (self, sequence, levelOfAnalyze): #TODO: review
children = [sequence.children]
ruta = levelOfAnalyze.split("/")
for r in ruta: #For each level of the path
for child in children: #Look through every available sequence
if r in child: #If r is inside the current sequence
if r == ruta[-1]:
for seq in child[r]:
if "text" not in seq.metadata:
raise ValueError("The sequence of the level {levelOfAnalyze} don't have atribute text")
else:
doc = self.nlp(seq.metadata["text"])
if "tokens" not in seq.children:
#Create one
pos=[]
for token in doc:
s = Sequence("token",token.text)
s.metadata["pos"] = token.pos_
pos.append(s)
seq.children["tokens"] = pos
else:
pos=[]
for token in doc:
pos.append(token.pos_)
for seqToken in seq.children["tokens"]:
seqToken.metadata["pos"] = pos.pop(0)
else:
children = [c.children for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")
'''
def polaridad(self, sequence, levelOfAnalyze):
#https://huggingface.co/finiteautomata/beto-sentiment-analysis
if self.lang == "es":
polarityClassifier = pipeline("text-classification",model='finiteautomata/beto-sentiment-analysis', return_all_scores=True)
elif self.lang == "en":
polarityClassifier = pipeline("text-classification",model='finiteautomata/bertweet-base-sentiment-analysis', return_all_scores=True)
children = [sequence.children]
ruta = levelOfAnalyze.split("/")
for r in ruta: #For each level of the path
for child in children: #Look through every available sequence
if r in child: #If r is inside the current sequence
if r == ruta[-1]:
for seq in child[r]:
if "text" not in seq.metadata:
raise ValueError(f"Level text not found in {seq.metadata.keys()}")
else:
prediction = polarityClassifier(seq.metadata["text"])
seq.metadata["polarity"] = prediction
else:
children = [c.children for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")
pass
def emotions(self, sequence, levelOfAnalyze):
if self.lang == "es":
emotionsClassifier = pipeline("text-classification",model='pysentimiento/robertuito-emotion-analysis', return_all_scores=True)
elif self.lang == "en":
emotionsClassifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
children = [sequence.children]
ruta = levelOfAnalyze.split("/")
for r in ruta: #For each level of the path
for child in children: #Look through every available sequence
if r in child: #If r is inside the current sequence
if r == ruta[-1]:
for seq in child[r]:
if "text" not in seq.metadata:
raise ValueError(f"Level text not found in {seq.metadata.keys()}")
else:
prediction = emotionsClassifier(seq.metadata["text"])
seq.metadata["emotions"] = prediction
else:
children = [c.children for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")'''
\ No newline at end of file
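The net effect of the changes above is that Analyzer becomes an abstract template method: the walk over children and metadata stays in the base class, and each concrete analyzer only supplies the callable that turns a list of texts into a list of results. A minimal hypothetical subclass is sketched below (CharCountAnalyzer and charCount are illustrative names, not part of this commit):

from typing import Optional

from textflow.Analyzer import Analyzer


class CharCountAnalyzer(Analyzer):
    """Hypothetical example: stores the character count of each analyzed text."""

    def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult: Optional[str] = ""):
        # Delegate the walk over the sequence tree to the base class; the final True
        # means the analyzer reads the text from metadata rather than from children.
        super().analyze(self.charCount, sequence, tag, levelOfAnalyzer, levelOfResult, True)

    def charCount(self, arrayText):
        # Receives the texts selected by filterMetadata and returns one result per text.
        return [len(text) for text in arrayText]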
......@@ -7,11 +7,13 @@ import re
import numpy as np
import math
from functools import reduce
from textflow.Analyzer import Analyzer
creaPath = os.path.join(os.path.dirname(__file__), 'Crea-5000.txt')
class ComplexityAnalyzer:
def __init__(self, lang = "es"):
class ComplexityAnalyzer(Analyzer):
def __init__(self, rutaArchivoCrea = creaPath,lang = "es"):
"""Creates an analyzer from an input object.
Args:
......@@ -22,11 +24,7 @@ class ComplexityAnalyzer:
spacy.cli.download("es_core_news_sm")
self.nlp = spacy.load("es_core_news_sm")
#Load the CREA frequency list:
self.dicFreqWords=self.read(creaPath)
self.function = self.complexity
'''elif lang == "en":
spacy.cli.download("en_core_web_sm")
self.nlp = spacy.load("en_core_web_sm")'''
self.dicFreqWords=self.read(rutaArchivoCrea)
#This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of the metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
......@@ -42,7 +40,8 @@ class ComplexityAnalyzer:
Raises:
ValueError if the levelOfResult is incorrect
"""
if levelOfResult == "":
super().analyze(self.complexity,sequence, tag, levelOfAnalyzer, levelOfResult, True)
'''if levelOfResult == "":
analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO
resultOfAnalisys= []
for i in analyzeResult:
......@@ -64,7 +63,7 @@ class ComplexityAnalyzer:
else:
children = [c.children for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")
raise ValueError(f"Sequence level '{r}' not found in {child}") '''
def read(self,fichero):
......
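With the new rutaArchivoCrea parameter the CREA word-frequency list is no longer hard-wired to the bundled Crea-5000.txt. A hedged sketch of pointing the analyzer at another dump (the path is hypothetical and the file must follow the format expected by read()):

from textflow.ComplexityAnalyzer import ComplexityAnalyzer  # assumed module path

analyzer = ComplexityAnalyzer(rutaArchivoCrea="/data/CREA_total.txt")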
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class EmotionAnalyzer(Analyzer):
def __init__(self, task = "text-classification",modelEmotions = 'pysentimiento/robertuito-emotion-analysis', allScores = True):
"""Creates an analyzer from an input object.
Args:
function: the function of the analyzer like count word, files...
isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
"""
self.emotionsClassifier = pipeline(task,model=modelEmotions, return_all_scores=allScores)
#This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of the metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis result
levelOfAnalyzer: the path of the sequence level to analyze inside the result (the subsequence to analyze within the sequence where we want to store the result)
levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
analyzeMetadata: boolean, if the result of the analyzer is applied to the metadata (True) or to the children (False)
Raises:
ValueError if the levelOfResult is incorrect
"""
super().analyze(self.emotions,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def emotions(self, arrayText):
arrayResults =[]
for text in arrayText:
prediction = self.emotionsClassifier(text)
#arrayResults.append(prediction[0][0])
arrayResults.append(prediction)
return arrayResults
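Because the pipeline task and model are constructor arguments, swapping the Spanish default for the English model that the old commented-out emotions() code used is a one-line change; PolarityAnalyzer below follows the same pattern through modelPolarity. A sketch, assuming the module lives under the textflow package:

from textflow.EmotionAnalyzer import EmotionAnalyzer  # assumed module path

# English emotions instead of the default pysentimiento/robertuito-emotion-analysis.
analyzer = EmotionAnalyzer(
    modelEmotions="bhadresh-savani/distilbert-base-uncased-emotion",
    allScores=True,
)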
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class LemmaAnalyzer(Analyzer):
def __init__(self, nlp = spacy.load("es_core_news_sm"), posNoContent = ["PUNCT", "SPACE", "SYM"]):
"""Creates an analyzer from an input object.
Args:
function: the function of the analyzer like count word, files...
isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
"""
self.nlp = nlp
self.posNoContent = posNoContent
#This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of the metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis result
levelOfAnalyzer: the path of the sequence level to analyze inside the result (the subsequence to analyze within the sequence where we want to store the result)
levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
analyzeMetadata: boolean, if the result of the analyzer is applied to the metadata (True) or to the children (False)
Raises:
ValueError if the levelOfResult is incorrect
"""
super().analyze(self.lemmas,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def lemmas(self, arrayText):
arrayResult = []
for text in arrayText:
sequenceLemmas = []
setLemmas = set()
sumaLenLemmas=0
doc= self.nlp(text)
for token in doc:
if token.pos_ not in self.posNoContent:
sumaLenLemmas += len(token.lemma_)
setLemmas.add(token.lemma_)
sequenceLemmas.append(token.lemma_)
lemma={
"srclemmas" : sequenceLemmas,
"uniqueLemmas" : len(setLemmas),
"avgLemmas" : round(sumaLenLemmas/len(sequenceLemmas))
}
arrayResult.append(lemma)
return arrayResult
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
spacy.cli.download("es_core_news_sm")
class POSAnalyzer(Analyzer):
def __init__(self, nlp = spacy.load("es_core_news_sm")):
"""Creates an analyzer from an input object.
Args:
function: the function of the analyzer like count word, files...
isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
"""
self.nlp = nlp
#This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of the metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis result
levelOfAnalyzer: the path of the sequence level to analyze inside the result (the subsequence to analyze within the sequence where we want to store the result)
levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
analyzeMetadata: boolean, if the result of the analyzer is applied to the metadata (True) or to the children (False)
Raises:
ValueError if the levelOfResult is incorrect
"""
super().analyze(self.pos,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def pos(self,arrayText):
arrayResults = []
for text in arrayText:
srcPOS = []
dicFreqPOS = {}
doc = self.nlp(text)
for token in doc:
srcPOS.append(token.pos_)
if token.pos_ in dicFreqPOS:
dicFreqPOS[token.pos_] += 1
else:
dicFreqPOS[token.pos_] = 1
pos = {
"srcPOS": srcPOS,
"FreqPOS": dicFreqPOS
}
arrayResults.append(pos)
return arrayResults
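LemmaAnalyzer and POSAnalyzer take the spaCy pipeline by injection, so the Spanish default can be replaced without touching the classes; note that the spacy.load(...) default argument and the module-level spacy.cli.download call both run at import time, so importing these modules already pulls es_core_news_sm. A sketch of reusing both analyzers for English (assumes en_core_web_sm is installed, e.g. via spacy.cli.download, and that the modules live under textflow):

import spacy

from textflow.LemmaAnalyzer import LemmaAnalyzer  # assumed module paths
from textflow.POSAnalyzer import POSAnalyzer

nlp_en = spacy.load("en_core_web_sm")  # shared pipeline for both analyzers
lemmas = LemmaAnalyzer(nlp=nlp_en)
pos = POSAnalyzer(nlp=nlp_en)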
import os
import spacy
import spacy.cli
from typing import Optional
from textflow.Analyzer import Analyzer
from transformers import pipeline
import torch
class PolarityAnalyzer(Analyzer):
def __init__(self, task = "text-classification",modelPolarity = 'finiteautomata/beto-sentiment-analysis', allScores = True):
"""Creates an analyzer from an input object.
Args:
function: the function of the analyzer like count word, files...
isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
"""
self.polarityClassifier = pipeline(task,model= modelPolarity, return_all_scores=allScores)
#This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of the metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis result
levelOfAnalyzer: the path of the sequence level to analyze inside the result (the subsequence to analyze within the sequence where we want to store the result)
levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
analyzeMetadata: boolean, if the result of the analyzer is applied to the metadata (True) or to the children (False)
Raises:
ValueError if the levelOfResult is incorrect
"""
super().analyze(self.polarity,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def polarity(self, arrayText):
arrayResults =[]
for text in arrayText:
prediction = self.polarityClassifier(text)
#arrayResults.append(prediction[0][0])
arrayResults.append(prediction)
return arrayResults
......@@ -4,6 +4,7 @@ from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import SpaceTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
class SequenceIterator:
......
import string
from typing import Optional
import spacy
import spacy.cli
#import spacy
#import spacy.cli
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
import math
from textflow.Analyzer import Analyzer
class StylometryyAnalyzer: #TODO
def __init__(self, lang = "es"):
if lang == "es":
spacy.cli.download("es_core_news_sm")
self.nlp = spacy.load("es_core_news_sm")
self.function = self.stylometry
pass
class StylometryAnalyzer(Analyzer): #TODO
def __init__(self,stopwords, puntuation = string.punctuation,tokenizer = WhitespaceTokenizer()):
self.stopwords = stopwords
self.puntuation = puntuation
self.tokenizer = tokenizer
#This analyzer can only analyze text strings, so it only makes sense for it to use the text attribute of the metadata
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str] = ""): #TODO
"""Analyze a sequence
Args:
sequence: the Sequence we want to analyze
tag: the label to store the analysis result
levelOfAnalyzer: the path of the sequence level to analyze inside the result (the subsequence to analyze within the sequence where we want to store the result)
levelOfResult: the path of the sequence level to store the result (we may want to analyze the tokens but store the result at sentence level)
analyzeMetadata: boolean, if the result of the analyzer is applied to the metadata (True) or to the children (False)
Raises:
ValueError if the levelOfResult is incorrect
"""
if levelOfResult == "":
analyzeResult = sequence.filterMetadata(levelOfAnalyzer,self.function)#TODO
resultOfAnalisys= []
for i in analyzeResult:
resultOfAnalisys.append(i)
sequence.metadata[tag] = resultOfAnalisys
else:
children = [sequence.children]
ruta = levelOfResult.split("/")
for r in ruta: #For each level of the path
for child in children: #Look through every available sequence
if r in child: #If r is inside the current sequence
if r == ruta[-1]:
for seq in child[r]:
analyzeResult = seq.filterMetadata(levelOfAnalyzer,self.function)
resultOfAnalisys= []
for i in analyzeResult:
resultOfAnalisys.append(i)
seq.metadata[tag] = resultOfAnalisys
def analyze(self, sequence, tag, levelOfAnalyzer, levelOfResult:Optional[str]= ""):
super().analyze(self.stylometry,sequence, tag, levelOfAnalyzer, levelOfResult, True)
def stylometry(self, arrayText):
resultsList = []
for t in arrayText:
#doc = self.nlp(text)
tokens = self.tokenizer.tokenize(t)
text = [token.lower() for token in tokens]
self.freqWords(text,self.stopwords,self.puntuation)
self.funcionesTTR(text)
result={
"uniqueWords": len(self.uniqueWords),
"TTR": self.TTR,
"RTTR": self.RTTR,
"Herdan": self.herdan,
"Mass": self.mass,
"Somers": self.somers,
"Dugast": self.dugast,
"Honore": self.honore,
"FreqStopWords": self.freqStopWords,
"FreqPuntuationMarks": self.freqPuntuationMarks,
"FreqWords": self.freqWord
}
resultsList.append(result)
return resultsList
def funcionesTTR(self, text):
self.uniqueWords = [token[0] for token in self.freqWord]
self.numWordFreqOne = len( [token[0] for token in self.freqWord if token[1] == 1 ])
self.TTR = len(self.uniqueWords) / len(text)
self.RTTR = len(self.uniqueWords) / math.sqrt(len(text))
self.herdan = math.log(len(self.uniqueWords),10) / math.log(len(text),10)
self.mass = (math.log(len(text),10)- math.log(len(self.uniqueWords),10)) / pow(math.log(len(self.uniqueWords),10),2)
self.somers = math.log(math.log(len(self.uniqueWords),10),10) / math.log(math.log(len(text),10),10)
if math.log(len(text),10)- math.log(len(self.uniqueWords),10) == 0:
self.dugast = pow(math.log(len(text),10),2)
else:
self.dugast = pow(math.log(len(text),10),2) / (math.log(len(text),10)- math.log(len(self.uniqueWords),10))
if 1-(self.numWordFreqOne/len(self.uniqueWords)) == 0:
self.honore = 100*(math.log(len(text),10))
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")
self.honore = 100*(math.log(len(text),10)/(1-(self.numWordFreqOne/len(self.uniqueWords))))
def stylometry(self):
pass
def freqWords(self,tokens, stopWords, puntuationMarks):
freqStopWords = {}
freqPuntuationMarks = {}
freqWord ={}
for token in tokens:
if token in stopWords:
if token in freqStopWords:
freqStopWords[token] += 1
else:
freqStopWords[token] = 1
elif token in puntuationMarks:
if token in freqPuntuationMarks:
freqPuntuationMarks[token] += 1
else:
freqPuntuationMarks[token] = 1
else:
if token in freqWord:
freqWord[token] += 1
else:
freqWord[token] = 1
self.freqWord = sorted(freqWord.items(), reverse = True)
self.freqPuntuationMarks = sorted(freqPuntuationMarks.items(), reverse = True)
self.freqStopWords = sorted(freqStopWords.items(), reverse = True)
\ No newline at end of file
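For reference, the type-token measures computed in funcionesTTR above correspond to the following lexical-diversity indices, with V the number of distinct tokens, N the total number of tokens, V_1 the number of tokens that occur exactly once, and logarithms taken base 10 as in the code (note the implementation divides Maas' index by (\log V)^2, whereas the usual definition divides by (\log N)^2):

TTR = V / N
RTTR = V / \sqrt{N}
Herdan = \log V / \log N
Mass = (\log N - \log V) / (\log V)^2
Somers = \log \log V / \log \log N
Dugast = (\log N)^2 / (\log N - \log V)
Honore = 100 \cdot \log N / (1 - V_1 / V)

The two if/else guards handle the degenerate cases: when every token is unique (\log N = \log V) Dugast falls back to (\log N)^2, and when every distinct word occurs exactly once (V_1 = V) Honore falls back to 100 \cdot \log N.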
from typing import Optional
from textflow.Sequence import Sequence
from nltk.tokenize import WhitespaceTokenizer
from textflow.Analyzer import Analyzer
class VolumetryAnalyzer(Analyzer):
def __init__(self, tokenizer= WhitespaceTokenizer()):
"""Creates an analyzer from an input object.
Args:
function: the function of the analyzer like count word, files...
isMetadata: boolean, if the result of the analyzer is stored in metadata (True) or in children(False)
"""
self.tokenizer = tokenizer
def volumetry(self, arrayText):
arrayResults =[]
for texts in arrayText:
text = self.tokenizer.tokenize(texts)
dicResults = {
"words" : len(text),
"uniqueWords" : len(set(text)),
"chars" : len(texts),
"avgWordsLen" : round(len(texts) / len(text))
}
arrayResults.append(dicResults)
return arrayResults
#The sequence must always have a text attribute (metadata) for this to work
#Count the number of words, number of unique words, number of characters and the average number of characters
def analyze(self,sequence,tag,levelOfAnalyzer,levelOfResult:Optional[str] = ""):
super().analyze(self.volumetry,sequence, tag, levelOfAnalyzer, levelOfResult, True)
'''children = [sequence.children]
ruta = levelOfAnalyze.split("/")
for r in ruta: #For each level of the path
for child in children: #Look through every available sequence
if r in child: #If r is inside the current sequence
if r == ruta[-1]:
for seq in child[r]:
if "text" not in seq.metadata:
raise ValueError(f"Level text not found in {seq.metadata.keys()}")
else:
text = seq.metadata["text"].split(" ")
volumetry= {
"words" : len(text),
"uniqueWords" : len(set(text)),
"chars" : len(seq.metadata["text"]),
"avgWordsLen" : round(volumetry["chars"] / volumetry["words"])
}
seq.metadata["volumetry"] = volumetry
else:
children = [c.children for c in child[r]]
else:
raise ValueError(f"Sequence level '{r}' not found in {child}")'''
\ No newline at end of file
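Putting it together, a hypothetical end-to-end run of one of the new analyzers; this assumes Sequence(label, text) keeps the raw string in metadata["text"] and that filterMetadata("text", fn) applies fn to it, neither of which is shown in this diff:

from textflow.Sequence import Sequence
from textflow.VolumetryAnalyzer import VolumetryAnalyzer  # assumed module path

seq = Sequence("doc", "uno dos tres uno")  # assumed constructor: label + raw text
VolumetryAnalyzer().analyze(seq, "volumetry", "text")
# seq.metadata["volumetry"] should now hold a list with one dict of counts
# ("words", "uniqueWords", "chars", "avgWordsLen") per analyzed text.
print(seq.metadata["volumetry"])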