Commit f3b23a01 by Jaime Collado

Implementing Sequence input formats

parent 64a022e5
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque ullamcorper id ante eu maximus. Nullam efficitur vehicula ante, in luctus ante sollicitudin iaculis. Donec efficitur porta ante, ut venenatis enim faucibus quis. Sed vitae egestas neque, a tempor tortor. Suspendisse nec maximus mi. Nam iaculis convallis ultricies. Nunc vel tempor magna, in ultrices enim. Duis id tristique nisl. In hac habitasse platea dictumst. Suspendisse at arcu placerat lectus interdum sollicitudin. Lorem ipsum dolor sit amet
\ No newline at end of file Nam lectus turpis
\ No newline at end of file
import pytest import pytest
from textflow.Sequence import Sequence from textflow.Sequence import Sequence
class CustomSequence(Sequence):
def __init__(self, text: str):
self.id = "root"
self.sequences = text.split(" ")
@pytest.fixture def test_sequence_wrong_format():
def sequence(): with pytest.raises(Exception):
return CustomSequence("Esto es una prueba") sequence = Sequence("csv", "Lorem ipsum dolor sit amet")
def test_str(sequence):
assert str(sequence) == "id: root, sequences: ['Esto', 'es', 'una', 'prueba']"
def test_repr(sequence): @pytest.mark.parametrize(
assert repr(sequence) == "Sequence('Esto', 'es', 'una', 'prueba')" "sequence, expected",
[
pytest.param(
Sequence("string", "Lorem ipsum dolor sit amet"),
"Lorem ipsum dolor sit amet"
),
pytest.param(
Sequence("text", "tests/data/doc_1.txt"),
"Lorem ipsum dolor sit amet\nNam lectus turpis"
)
]
)
def test_str(sequence, expected):
assert str(sequence) == expected
def test_len(sequence):
assert len(sequence) == 4
def test_iter(sequence): @pytest.mark.parametrize(
assert list(sequence) == ["Esto", "es", "una", "prueba"] "sequence, expected",
[
pytest.param(
Sequence("string", "Lorem ipsum dolor sit amet"),
(
"Sequence(\n"
" id: string\n"
" sequences: 'Lorem', 'ipsum', 'dolor', 'sit', 'amet'\n"
")"
)
),
pytest.param(
Sequence("text", "tests/data/doc_1.txt"),
(
"Sequence(\n"
" id: doc_1\n"
" sequences: 'Lorem ipsum dolor sit amet', 'Nam lectus turpis'\n"
")"
)
)
]
)
def test_repr(sequence, expected):
assert repr(sequence) == expected
@pytest.mark.parametrize(
    "sequence, expected",
    [
        (Sequence("string", "Lorem ipsum dolor sit amet"), 5),
        (Sequence("text", "tests/data/doc_1.txt"), 2),
    ],
)
def test_len(sequence, expected):
    """Check that len() of a Sequence matches the expected element count.

    Covers one Sequence built with the "string" format and one built with
    the "text" format from the tests/data/doc_1.txt file.
    """
    actual = len(sequence)
    assert actual == expected
@pytest.mark.parametrize(
    "sequence, expected",
    [
        (
            Sequence("string", "Lorem ipsum dolor sit amet"),
            ["Lorem", "ipsum", "dolor", "sit", "amet"],
        ),
        (
            Sequence("text", "tests/data/doc_1.txt"),
            ["Lorem ipsum dolor sit amet", "Nam lectus turpis"],
        ),
    ],
)
def test_iter(sequence, expected):
    """Check that iterating a Sequence yields the expected elements in order.

    Covers one Sequence built with the "string" format and one built with
    the "text" format from the tests/data/doc_1.txt file.
    """
    collected = list(sequence)
    assert collected == expected
@pytest.mark.parametrize(
    "sequence, expected",
    [
        (Sequence("string", "Lorem ipsum dolor sit amet"), "Lorem"),
        (Sequence("text", "tests/data/doc_1.txt"), "Lorem ipsum dolor sit amet"),
    ],
)
def test_getitem(sequence, expected):
    """Check that indexing a Sequence at position 0 returns its first element.

    Covers one Sequence built with the "string" format and one built with
    the "text" format from the tests/data/doc_1.txt file.
    """
    first = sequence[0]
    assert first == expected
def test_getitem(sequence):
assert sequence[0] == "Esto"
def test_get_depth(): def test_get_depth():
pass pass
def test_filter(): def test_filter():
pass pass
\ No newline at end of file
import os
from typing import Optional
class SequenceIterator: class SequenceIterator:
def __init__(self, sequences): def __init__(self, sequences):
self.idx = 0 self.idx = 0
...@@ -16,21 +20,58 @@ class SequenceIterator: ...@@ -16,21 +20,58 @@ class SequenceIterator:
class Sequence: class Sequence:
def __init__(self, object): """Summary of class here.
# TODO: Extraer id y sequences a partir del object de cualquier forma que se nos ocurra
# ver: https://huggingface.co/docs/datasets/v2.0.0/en/package_reference/loading_methods#datasets.load_dataset Longer class information...
if isinstance(object, str): Longer class information...
self.id = object
else: Attributes:
self.id = "collection" id: ...
self.sequences = ["subcollection_1", "subcollection_2", "subcollection_3"] text: ...
sequences: ...
"""
def __init__(self, format: str, item: object, id: Optional[str] = None):
"""Creates a sequence from an input object.
Args:
format: A string containing the input data's type.
item: An object representing the input data. It can be a string for a
string format or a file path for a text format.
id: A string to overwrite the default's sequence id.
"""
VALID_FORMATS = ("string", "text")
if format not in VALID_FORMATS:
raise ValueError(
f"{format} is not a valid format. Valid formats: {VALID_FORMATS}"
)
# Splits string text by whitespace
if format == "string":
if not isinstance(item, str):
raise ValueError(f"{item} is not an instance of string")
self.id = id if id else "string"
self.text = item
self.sequences = item.split(" ")
# Splits file text by \n
if format == "text":
self.id = id if id else os.path.basename(item).split(".")[0]
with open(item, "r") as f:
self.text = f.read()
self.sequences = self.text.split("\n")
def __str__(self): def __str__(self):
return f"id: {self.id}, sequences: {self.sequences}" return self.text
def __repr__(self): def __repr__(self):
values = ", ".join([sequence.__repr__() for sequence in self.sequences]) values = ", ".join([sequence.__repr__() for sequence in self.sequences])
return f"Sequence({values})" return (
"Sequence(\n"
f" id: {self.id}\n"
f" sequences: {values}\n"
")"
)
def __len__(self): def __len__(self):
return len(self.sequences) return len(self.sequences)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment