getItem test fixed

parent e146fd66
@@ -6,6 +6,7 @@ authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@r
 [tool.poetry.dependencies]
 python = "^3.8"
+nltk = "^3.7"

 [tool.poetry.dev-dependencies]
 pytest = "^5.2"
...
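The new `nltk` dependency provides the tokenizers imported in the source file below; `WhitespaceTokenizer` is the default the constructor falls back to. A minimal sketch of its behaviour (assuming `nltk` is installed, e.g. via `poetry install`):

```python
from nltk.tokenize import WhitespaceTokenizer

# Splits on runs of whitespace only, leaving punctuation attached to words,
# which matches the old str.split(" ") behaviour for simple inputs.
tokens = WhitespaceTokenizer().tokenize("Lorem ipsum dolor sit amet")
print(tokens)  # ['Lorem', 'ipsum', 'dolor', 'sit', 'amet']
```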
@@ -83,45 +83,42 @@ def test_len(sequence, expected):
     [
         pytest.param(
             Sequence("string", "Lorem ipsum"),
-            [Sequence() for _ in range(2)]
+            [Sequence("token",tkn) for tkn in "Lorem ipsum".split(" ")]
         ),
         pytest.param(
             Sequence("text", "tests/data/doc_1.txt"),
-            {
-                "child": [("string", "Lorem ipsum dolor sit amet"), ("string", "Nam lectus turpis")],
-                "sequence": [Sequence() for _ in range(2)]
-            }
+            [Sequence("token",tkn) for tkn in "Lorem ipsum".split(" ")]
         ),
         pytest.param(
             Sequence("directory","tests/data" ),
-            2
+            [Sequence().initFromDocument("tests/data/doc_1.txt","tokens","token")]
         )
     ]
 )
 def test_iter(sequence, expected):
     assert iter(sequence).__next__() == expected

 @pytest.mark.parametrize(
     "sequence, expected",
     [
         pytest.param(
             Sequence("string", "Lorem ipsum dolor sit amet"),
-            [Sequence() for _ in range(5)]
+            Sequence("string", "Lorem ipsum dolor sit amet").children["tokens"]
         ),
         pytest.param(
             Sequence("text", "tests/data/doc_1.txt"),
-            [Sequence() for _ in range(8)]
+            Sequence("text", "tests/data/doc_1.txt").children["tokens"]
         ),
         pytest.param(
             Sequence("directory","tests/data" ),
-            2
+            Sequence("directory","tests/data" ).children["files"]
         )
     ]
 )
 def test_getitem(sequence, expected):
     assert sequence[0] == expected

 @pytest.mark.parametrize(
     "sequence, expected",
     [
...
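The rewritten expectations build real `Sequence` objects on both sides of each assertion, so `test_iter` and `test_getitem` only pass if sequences compare by value; that is what the `__eq__` added in the source file below provides. A minimal sketch of the comparison the tests now rely on (the test module's own import of `Sequence` is assumed, since its path lies outside the shown hunks):

```python
# Both sides tokenize "Lorem ipsum" with the default WhitespaceTokenizer,
# so the element-wise comparison of the two lists holds via Sequence.__eq__.
left = Sequence("string", "Lorem ipsum")[0]  # children stored under the first key, "tokens"
right = [Sequence("token", tkn) for tkn in "Lorem ipsum".split(" ")]
assert left == right
```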
 import os
 from typing import Optional
+from nltk.tokenize import TreebankWordTokenizer
+from nltk.tokenize import WhitespaceTokenizer
+from nltk.tokenize import SpaceTokenizer
+from nltk.tokenize import WordPunctTokenizer
-class SequenceIterator: #TODO documentar
+class SequenceIterator:
     def __init__(self, children):
         """
         Creates a sequenceIterator from a Sequence.
@@ -47,7 +51,7 @@ class Sequence:
         text: ...
         sequences: ...
     """
-    def __init__(self, format: Optional[str] = None, src: Optional[object] = None):
+    def __init__(self, format: Optional[str] = None, src: Optional[object] = None, tokenizer: Optional[object] = None ):
         """Creates a sequence from an input object.

         Args:
@@ -61,6 +65,8 @@ class Sequence:
             raise ValueError(
                 f"{format} is not a valid format. Valid formats: {_VALID_FORMATS}"
             )
+        if tokenizer == None:
+            tokenizer = WhitespaceTokenizer()

         self.format = format
         self.children = {}
@@ -70,13 +76,13 @@ class Sequence:
                 raise ValueError(f"{src} is not an instance of token")
             self.metadata["text"] = src
         if format == "string":
-            self.initFromString(src,"tokens","token")
+            self.initFromString(src,"tokens","token",tokenizer)
         if format == "text":
-            self.initFromDocument(src,"tokens","token")
+            self.initFromDocument(src,"tokens","token", tokenizer)
         if format == "directory":
-            self.initFromDirectory(src,"directory","files")
+            self.initFromDirectory(src,"directory","files",tokenizer)

-    def initFromDirectory(self, directory, labelDirectory, labelFile ): #TODO Inicializador por defecto para un directorio
+    def initFromDirectory(self, directory, labelDirectory, labelFile, tokenizer):
         '''
         Initialize a Sequence from a directory
         Args:
@@ -101,12 +107,12 @@ class Sequence:
             else:
                 self.metadata["directoriesPath"].append(directory+"/"+file)
                 if labelDirectory in self.children:
-                    self.children[labelDirectory].append(Sequence("directory", directory+"/"+file ))
+                    self.children[labelDirectory].append(Sequence("directory", directory+"/"+file,tokenizer ))
                 else:
-                    self.children[labelDirectory]= [Sequence("directory", directory+"/"+file)]
+                    self.children[labelDirectory]= [Sequence("directory", directory+"/"+file, tokenizer)]

-    def initFromDocument(self, documentPath, labelSubSequence, formatSubsequence):
+    def initFromDocument(self, documentPath, labelSubSequence, formatSubsequence, tokenizer):
         '''
         Initialize a Sequence from a document
         Args:
@@ -117,10 +123,10 @@ class Sequence:
         self.format = "text"
         with open(documentPath, "r") as f:
             txt = f.read()
-        self.children[labelSubSequence] = [Sequence(formatSubsequence,token_src) for token_src in txt.split(" ")]
+        self.children[labelSubSequence] = [Sequence(formatSubsequence,token_src) for token_src in tokenizer.tokenize(txt)]
         self.metadata["text"] = txt

-    def initFromString(self, srcString, labelSubSequence, formatSubsequence):
+    def initFromString(self, srcString, labelSubSequence, formatSubsequence, tokenizer):
         '''
         Initialize a Sequence from a string
         Args:
@@ -133,7 +139,7 @@ class Sequence:
         if not isinstance(srcString, str):
             raise ValueError(f"{srcString} is not an instance of string")
         self.format = "string"
-        self.children[labelSubSequence]= [Sequence(formatSubsequence,token_src) for token_src in srcString.split(" ")]
+        self.children[labelSubSequence]= [Sequence(formatSubsequence,token_src) for token_src in tokenizer.tokenize(srcString)]
         self.metadata["text"]= srcString

@@ -180,8 +186,15 @@ class Sequence:
         '''
         return SequenceIterator(list(self.children.values()))

-    def __getitem__(self, idx): #TODO Documentacion
+    def __getitem__(self, idx):
+        '''
+        Get the value of a key from the dictionary of children
+        Args:
+            idx: a string that represent the key of the children dictionary
+                or an integer that represent the position of the key in children dictionary keys
+        Returns:
+            A List of Sequences
+        '''
         if isinstance(idx, str): # Get src by string (e.g. seq["doc1"])
             if self.children:
                 if idx in self.children:
@@ -198,6 +211,12 @@ class Sequence:
         else: # TODO: Should it support slices (e.g. [2:4])?
             raise ValueError(f"Sequence id '{idx}' not found in {self.children}")

+    def __eq__(self, other):
+        if self.format == other.format and self.metadata == other.metadata and self.children == other.children:
+            return True
+        else:
+            return False
+
     def depth(self,diccionaryList: Optional[list] = None):
         '''
...
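A minimal usage sketch of the new `tokenizer` parameter (again assuming `Sequence` is importable; passing no tokenizer falls back to `WhitespaceTokenizer`):

```python
from nltk.tokenize import WordPunctTokenizer

# Default: whitespace tokenization keeps punctuation attached to words.
seq_default = Sequence("string", "Lorem, ipsum!")
print(len(seq_default["tokens"]))  # 2 tokens ('Lorem,' and 'ipsum!')

# Explicit tokenizer: WordPunctTokenizer splits punctuation into separate tokens.
seq_punct = Sequence("string", "Lorem, ipsum!", WordPunctTokenizer())
print(len(seq_punct["tokens"]))    # 4 tokens ('Lorem', ',', 'ipsum', '!')
```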