Commit f3b23a01 by Jaime Collado

Implementing Sequence input formats

parent 64a022e5
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque ullamcorper id ante eu maximus. Nullam efficitur vehicula ante, in luctus ante sollicitudin iaculis. Donec efficitur porta ante, ut venenatis enim faucibus quis. Sed vitae egestas neque, a tempor tortor. Suspendisse nec maximus mi. Nam iaculis convallis ultricies. Nunc vel tempor magna, in ultrices enim. Duis id tristique nisl. In hac habitasse platea dictumst. Suspendisse at arcu placerat lectus interdum sollicitudin.
\ No newline at end of file
Lorem ipsum dolor sit amet
Nam lectus turpis
\ No newline at end of file
import pytest
from textflow.Sequence import Sequence
class CustomSequence(Sequence):
    """Lightweight Sequence subclass used by the fixture-based tests.

    It sets a fixed id and splits the given text on single spaces. The base
    class initializer is not called here.
    """

    def __init__(self, text: str):
        pieces = text.split(" ")
        self.sequences = pieces
        self.id = "root"
@pytest.fixture
def sequence():
    """Provide a CustomSequence built from a four-word sample sentence."""
    sample_text = "Esto es una prueba"
    return CustomSequence(sample_text)
def test_sequence_wrong_format():
    """An unsupported format name must be rejected with ValueError."""
    # Expect the specific exception type so unrelated failures (for example a
    # TypeError from a changed signature) are not silently accepted.
    with pytest.raises(ValueError):
        Sequence("csv", "Lorem ipsum dolor sit amet")
def test_str(sequence):
    """str() of the fixture shows its id followed by its list of sequences."""
    expected = "id: root, sequences: ['Esto', 'es', 'una', 'prueba']"
    rendered = str(sequence)
    assert rendered == expected
def test_repr(sequence):
    """repr() of the fixture lists the quoted sequences inside Sequence(...)."""
    expected = "Sequence('Esto', 'es', 'una', 'prueba')"
    rendered = repr(sequence)
    assert rendered == expected
@pytest.mark.parametrize(
    ("sequence", "expected"),
    [
        (
            Sequence("string", "Lorem ipsum dolor sit amet"),
            "Lorem ipsum dolor sit amet",
        ),
        (
            Sequence("text", "tests/data/doc_1.txt"),
            "Lorem ipsum dolor sit amet\nNam lectus turpis",
        ),
    ],
)
def test_str(sequence, expected):
    """str() returns the original text for both string and text-file input."""
    rendered = str(sequence)
    assert rendered == expected
def test_len(sequence):
    """The fixture sentence is split into four sequences."""
    count = len(sequence)
    assert count == 4
def test_iter(sequence):
    """Iterating the fixture yields its words in order."""
    expected = ["Esto", "es", "una", "prueba"]
    collected = list(sequence)
    assert collected == expected
@pytest.mark.parametrize(
    ("sequence", "expected"),
    [
        (
            Sequence("string", "Lorem ipsum dolor sit amet"),
            "Sequence(\n"
            " id: string\n"
            " sequences: 'Lorem', 'ipsum', 'dolor', 'sit', 'amet'\n"
            ")",
        ),
        (
            Sequence("text", "tests/data/doc_1.txt"),
            "Sequence(\n"
            " id: doc_1\n"
            " sequences: 'Lorem ipsum dolor sit amet', 'Nam lectus turpis'\n"
            ")",
        ),
    ],
)
def test_repr(sequence, expected):
    """repr() renders the id and the quoted sequences on separate lines."""
    rendered = repr(sequence)
    assert rendered == expected
@pytest.mark.parametrize(
    ("sequence", "expected"),
    [
        (Sequence("string", "Lorem ipsum dolor sit amet"), 5),
        (Sequence("text", "tests/data/doc_1.txt"), 2),
    ],
)
def test_len(sequence, expected):
    """len() reports the number of sequences for each input format."""
    count = len(sequence)
    assert count == expected
@pytest.mark.parametrize(
    ("sequence", "expected"),
    [
        (
            Sequence("string", "Lorem ipsum dolor sit amet"),
            ["Lorem", "ipsum", "dolor", "sit", "amet"],
        ),
        (
            Sequence("text", "tests/data/doc_1.txt"),
            ["Lorem ipsum dolor sit amet", "Nam lectus turpis"],
        ),
    ],
)
def test_iter(sequence, expected):
    """Iteration yields every sequence in order for each input format."""
    collected = list(sequence)
    assert collected == expected
@pytest.mark.parametrize(
    ("sequence", "expected"),
    [
        (Sequence("string", "Lorem ipsum dolor sit amet"), "Lorem"),
        (Sequence("text", "tests/data/doc_1.txt"), "Lorem ipsum dolor sit amet"),
    ],
)
def test_getitem(sequence, expected):
    """Index 0 returns the first sequence for each input format."""
    first = sequence[0]
    assert first == expected
def test_getitem(sequence):
    """Index 0 of the fixture is its first word."""
    first = sequence[0]
    assert first == "Esto"
def test_get_depth():
    """Placeholder with no assertions yet; always passes."""
    return None
def test_filter():
    """Placeholder with no assertions yet; always passes."""
    return None
\ No newline at end of file
import os
from typing import Optional
class SequenceIterator:
def __init__(self, sequences):
self.idx = 0
......@@ -16,21 +20,58 @@ class SequenceIterator:
class Sequence:
def __init__(self, object):
# TODO: Extract id and sequences from `object` in whatever way we come up with
# see: https://huggingface.co/docs/datasets/v2.0.0/en/package_reference/loading_methods#datasets.load_dataset
# NOTE(review): a second __init__ taking (format, item, id) is defined further
# down in this class; presumably it supersedes this one - confirm.
# A str argument is used directly as the id; anything else gets "collection".
if isinstance(object, str):
self.id = object
else:
self.id = "collection"
# Hard-coded placeholder children (see the TODO above).
self.sequences = ["subcollection_1", "subcollection_2", "subcollection_3"]
"""Summary of class here.
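A Sequence keeps a text together with the list of pieces it was split into.
It is built either from a plain string, which is split on single spaces, or
from a text file, whose content is split on newline characters.
Attributes:
id: Identifier of the sequence. Defaults to "string" for string input and to
the file name up to its first dot for text-file input.
text: The original, unsplit text.
sequences: The list of strings obtained by splitting the text.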
"""
def __init__(self, format: str, item: object, id: Optional[str] = None):
"""Creates a sequence from an input object.
Args:
format: A string containing the input data's type.
item: An object representing the input data. It can be a string for a
string format or a file path for a text format.
id: A string to overwrite the default's sequence id.
"""
VALID_FORMATS = ("string", "text")
if format not in VALID_FORMATS:
raise ValueError(
f"{format} is not a valid format. Valid formats: {VALID_FORMATS}"
)
# Splits string text by whitespace
if format == "string":
if not isinstance(item, str):
raise ValueError(f"{item} is not an instance of string")
self.id = id if id else "string"
self.text = item
self.sequences = item.split(" ")
# Splits file text by \n
if format == "text":
self.id = id if id else os.path.basename(item).split(".")[0]
with open(item, "r") as f:
self.text = f.read()
self.sequences = self.text.split("\n")
def __str__(self):
return f"id: {self.id}, sequences: {self.sequences}"
return self.text
def __repr__(self):
values = ", ".join([sequence.__repr__() for sequence in self.sequences])
return f"Sequence({values})"
return (
"Sequence(\n"
f" id: {self.id}\n"
f" sequences: {values}\n"
")"
)
def __len__(self):
return len(self.sequences)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment