Commit 02e97587 by Jaime Collado

Refactoring Sequence class

parent f3b23a01
Showing with 72 additions and 35 deletions
...@@ -3,9 +3,10 @@ from typing import Optional ...@@ -3,9 +3,10 @@ from typing import Optional
class SequenceIterator:
    """Iterate over a sequence's children and sub-sequences in lockstep.

    Each step yields a dict with two keys: ``"child"`` (the entry of
    ``children`` at the current position) and ``"sequences"`` (the entry of
    ``sequences`` at the same position).
    """

    def __init__(self, children, sequences):
        """Store the two parallel lists to walk.

        Args:
            children: Indexable collection of child descriptors.
            sequences: Indexable collection aligned with ``children``.
        """
        self.idx = 0  # position of the next item to yield
        self.children = children
        self.sequences = sequences

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``{"child": ..., "sequences": ...}`` pair.

        Raises:
            StopIteration: When either list has no item at the current
                position. The iterator stays exhausted afterwards, as the
                iterator protocol requires, instead of rewinding.
        """
        position = self.idx
        # Stop at the shorter of the two lists, matching the IndexError
        # either lookup would have produced.
        if position >= len(self.children) or position >= len(self.sequences):
            raise StopIteration
        self.idx = position + 1
        return {
            "child": self.children[position],
            "sequences": self.sequences[position],
        }
# Formats accepted by Sequence.__init__. "token" is recognised but cannot be
# split any further; None builds an empty sequence.
_VALID_FORMATS = ["string", "text", "token", None]


class Sequence:
    """A piece of text together with the pieces it splits into.

    Attributes:
        id: Identifier of the sequence. ``None`` for an empty sequence
            created without an explicit id.
        text: The raw text, or ``None`` for an empty sequence.
        children: List of ``(format, src)`` tuples, one per piece of
            ``text``.
        sequences: List of ``Sequence`` objects aligned with ``children``.
            They are created empty and can be filled via ``set_sequence``.
    """

    def __init__(self, format: Optional[str] = None, src: Optional[object] = None, id: Optional[str] = None):
        """Creates a sequence from an input object.

        Args:
            format: A string containing the input data's type: "string",
                "text" or None (empty sequence).
            src: An object representing the input data. It can be a string
                for a string format or a file path for a text format.
            id: A string to overwrite the default's sequence id.

        Raises:
            ValueError: If ``format`` is not a valid format, if it is
                "token" (tokens cannot be split), or if ``format`` is
                "string" and ``src`` is not a ``str``.
        """
        if format not in _VALID_FORMATS:
            raise ValueError(
                f"{format} is not a valid format. Valid formats: {_VALID_FORMATS}"
            )

        if format == "token":
            raise ValueError("Tokens can not be split")

        if format is None:
            # Empty sequence
            self.id = id
            self.text = None
            self.children = []
            self.sequences = []
        elif format == "string":
            # Splits string text by whitespace
            if not isinstance(src, str):
                raise ValueError(f"{src} is not an instance of string")
            self.id = id if id else "string"
            self.text = src
            self.children = [("token", token_src) for token_src in src.split(" ")]
            self.sequences = [Sequence() for _ in self.children]
        else:
            # format == "text": splits file text by \n
            self.id = id if id else os.path.basename(src).split(".")[0]
            with open(src, "r") as f:
                self.text = f.read()
            self.children = [("string", line_src) for line_src in self.text.split("\n")]
            self.sequences = [Sequence() for _ in self.children]

    def __str__(self):
        # __str__ must return a str; an empty sequence has text None.
        return "" if self.text is None else self.text

    def __repr__(self):
        children = ", ".join([repr(child) for child in self.children])
        sequences = ", ".join([repr(sequence) for sequence in self.sequences])
        return (
            "Sequence(\n"
            f" id: {self.id}\n"
            f" text: {self.text}\n"
            f" children: {children}\n"
            f" sequences: {sequences}\n"
            ")"
        )

    def __len__(self):
        return len(self.children)

    def __iter__(self):
        return SequenceIterator(self.children, self.sequences)

    def __getitem__(self, idx):
        """Return the child / sub-sequence pair selected by ``idx``.

        Args:
            idx: Either the id of a sub-sequence (e.g. ``seq["doc1"]``) or
                an integer position, negative values counting from the end
                (e.g. ``seq[0]``, ``seq[-1]``).

        Returns:
            A dict with keys ``"child"`` and ``"sequences"``.

        Raises:
            ValueError: If no sub-sequence has the id ``idx``.
            IndexError: If the integer ``idx`` is out of range.
            TypeError: If ``idx`` is neither a string nor an integer.
        """
        if isinstance(idx, str):  # Get src by string (e.g. seq["doc1"])
            for child, sequence in zip(self.children, self.sequences):
                if sequence.id == idx:
                    return {"child": child, "sequences": sequence}
            raise ValueError(f"Sequence id '{idx}' not found in {self}")

        if isinstance(idx, int):  # Get src by int (e.g. seq[0])
            size = len(self.children)
            # Valid positions are -size .. size - 1, like a list.
            if not -size <= idx < size:
                raise IndexError(f"Sequence index '{idx}' out of range")
            if idx < 0:
                idx += size
            return {
                "child": self.children[idx],
                "sequences": self.sequences[idx],
            }

        # TODO: Should it support slices (e.g. [2:4])?
        raise TypeError(
            f"Sequence indices must be integers or strings, not {type(idx).__name__}"
        )

    def set_sequence(self, new_sequence):
        """Overwrite this sequence in place with the contents of another.

        The ``children`` and ``sequences`` lists are shared with
        ``new_sequence``, not copied.

        Args:
            new_sequence: The ``Sequence`` whose id, text, children and
                sequences are taken over.
        """
        self.id = new_sequence.id
        self.text = new_sequence.text
        self.children = new_sequence.children
        self.sequences = new_sequence.sequences

    def get_depth(self):
        """Not implemented yet; currently returns ``None``."""
        pass  # TODO
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment