Working on a version for easing the inclusion of new analyzers

parent dd4c4c96
...@@ -724,3 +724,35 @@ The steps to create an analyzer are: ...@@ -724,3 +724,35 @@ The steps to create an analyzer are:
## Quick path for adding a new analyzer
If your analyzer just needs to process a list of inputs (e.g., texts from metadata) and return one result per input, you can extend `SimpleAnalyzer` and optionally register it for easy instantiation:
1) Extend `textflow.SimpleAnalyzer.SimpleAnalyzer` and implement `compute(items)`.
2) Optionally set `apply_to_metadata = False` if you want to analyze child sequences rather than metadata items.
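3) (Optional) Register it under a name with the decorator `@textflow.register_analyzer("my_analyzer")`, then instantiate it with `textflow.create_analyzer("my_analyzer", **kwargs)`. Names are case-insensitive, and the class is registered when the module that defines it is imported.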
Minimal example:
```python
from textflow.SimpleAnalyzer import SimpleAnalyzer
from textflow import register_analyzer
@register_analyzer("word_count")
class WordCountAnalyzer(SimpleAnalyzer):
    """Example analyzer that counts the whitespace-separated words of each text."""

    # Expects a list of texts taken from the metadata.
    apply_to_metadata = True

    def compute(self, items):
        """Return one ``{"words": <count>}`` dict per text in ``items``."""
        return [{"words": len(entry.split())} for entry in items]
```
Usage:
```python
# Instantiate the analyzer registered under "word_count". This needs
# `import textflow`, and the module defining the analyzer must have been imported.
an = textflow.create_analyzer("word_count")
# `sequence` is assumed to be an existing textflow Sequence object -- TODO confirm.
an.analyze(sequence, tag="volumetry", levelOfAnalyzer="text")
```
from typing import Optional from typing import Optional
import emoji import emoji
from textflow.Analyzer import Analyzer from textflow.Analyzer import Analyzer
from textflow.registry import register_analyzer
import re import re
@register_analyzer("emoji")
class EmojiAnalyzer(Analyzer): class EmojiAnalyzer(Analyzer):
""" """
A class that provides methods to analyze the different emojis of the text of a sequence. A class that provides methods to analyze the different emojis of the text of a sequence.
......
from typing import Optional from typing import Optional
from textflow.Analyzer import Analyzer from textflow.Analyzer import Analyzer
from textflow.registry import register_analyzer
from transformers import pipeline from transformers import pipeline
@register_analyzer("polarity")
class PolarityAnalyzer(Analyzer): class PolarityAnalyzer(Analyzer):
""" """
A class that provides methods to analyze the polarity of the text of a sequence. A class that provides methods to analyze the polarity of the text of a sequence.
......
from abc import ABC, abstractmethod
from typing import Any, Iterable, List, Optional
from textflow.Analyzer import Analyzer
class SimpleAnalyzer(Analyzer, ABC):
    """
    Low-boilerplate base class for analyzers.

    A subclass only needs to provide `compute(items)`. The `analyze` method
    defined here hands that method to `Analyzer.analyze`, together with the
    `apply_to_metadata` flag that decides what `items` contains:

    - True (the default): `items` is a list of metadata values (e.g., texts)
      collected via `Sequence.filterMetadata`.
    - False: `items` is a list of child sequences (via `Sequence.filter`).
    """

    # Class-level switch; subclasses override it to analyze child sequences.
    apply_to_metadata: bool = True

    @abstractmethod
    def compute(self, items: List[Any]) -> List[Any]:  # pragma: no cover
        """
        Produce the analysis results for `items`.

        :param items: Inputs gathered from the sequence being analyzed.
        :return: A list holding one result per input item.
        """
        raise NotImplementedError

    def analyze(
        self,
        sequence,
        tag,
        levelOfAnalyzer,
        levelOfResult: Optional[str] = "",
    ):
        """
        Run `compute` over `sequence` by delegating to `Analyzer.analyze`.

        All arguments are forwarded unchanged to the parent implementation;
        nothing is returned from this method.

        :param sequence: Sequence to analyze.
        :param tag: Forwarded as the tag argument of `Analyzer.analyze`.
        :param levelOfAnalyzer: Forwarded as the analysis-level argument.
        :param levelOfResult: Forwarded as the result-level argument
            (empty string by default).
        """
        super().analyze(
            self.compute,  # function applied to the collected items
            sequence,
            tag,
            levelOfAnalyzer,
            levelOfResult,
            self.apply_to_metadata,  # metadata items vs. child sequences
        )
# Package version string (single assignment; the duplicated statement on this
# line was not valid Python).
__version__ = '0.1.0'
from textflow.registry import (
register_analyzer,
get_analyzer_class,
create_analyzer,
available_analyzers,
)
# Public API of the package: the version string plus the registry helpers
# imported from `textflow.registry`.
__all__ = [
"__version__",
"register_analyzer",
"get_analyzer_class",
"create_analyzer",
"available_analyzers",
]
from typing import Callable, Dict, Iterable, List, Optional, Type, TypeVar
from textflow.Analyzer import Analyzer
# Type variable bound to Analyzer, used to type the class decorator so that it
# returns the same class type it receives.
T = TypeVar("T", bound=Analyzer)
# Module-level registry mapping a lower-cased name to its analyzer class.
ANALYZER_REGISTRY: Dict[str, Type[Analyzer]] = {}
def register_analyzer(name: str) -> Callable[[Type[T]], Type[T]]:
    """
    Build a class decorator that registers an analyzer class under `name`.

    The name is lower-cased before it is used as the registry key. Registering
    another class under a name that is already present replaces the earlier
    entry.

    Usage:
        @register_analyzer("polarity")
        class PolarityAnalyzer(Analyzer):
            ...

    :param name: Name under which the decorated class is stored.
    :return: A decorator that stores the class it receives in
        `ANALYZER_REGISTRY` and returns that class unchanged.
    """

    def _register(cls: Type[T]) -> Type[T]:
        # The key is normalised when the decorator is applied to the class.
        ANALYZER_REGISTRY[name.lower()] = cls
        return cls

    return _register
def get_analyzer_class(name: str) -> Type[Analyzer]:
    """
    Look up the analyzer class registered under `name`.

    The lookup is case-insensitive because the name is lower-cased first.

    :param name: Registered analyzer name.
    :return: The class stored in `ANALYZER_REGISTRY` for that name.
    :raises KeyError: If no class is registered under the name; the message
        lists the registered names in sorted order.
    """
    lookup = name.lower()
    if lookup in ANALYZER_REGISTRY:
        return ANALYZER_REGISTRY[lookup]

    # Unknown name: report what is registered to help spot typos.
    known = ", ".join(sorted(ANALYZER_REGISTRY))
    raise KeyError(f"Analyzer '{name}' not found. Available: {known}")
def create_analyzer(name: str, **kwargs) -> Analyzer:
    """
    Create an instance of the analyzer registered under `name`.

    :param name: Registered analyzer name, resolved by `get_analyzer_class`.
    :param kwargs: Keyword arguments passed to the analyzer class constructor.
    :return: The newly constructed analyzer instance.
    :raises KeyError: Propagated from `get_analyzer_class` for an unknown name.
    """
    analyzer_cls = get_analyzer_class(name)
    instance = analyzer_cls(**kwargs)  # type: ignore[call-arg]
    return instance
def available_analyzers() -> List[str]:
    """Return the names currently in `ANALYZER_REGISTRY`, sorted."""
    # Iterating a dict yields its keys, so this matches sorting `.keys()`.
    names = sorted(ANALYZER_REGISTRY)
    return names
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment