Jaime Collado / textflow
Commit be5dfe7d authored Apr 25, 2022 by Estrella Vallecillo
getItem test fixed
parent e146fd66
Showing 4 changed files with 41 additions and 24 deletions
poetry.lock
pyproject.toml
tests/test_sequence.py
textflow/Sequence.py
poetry.lock
View file @ be5dfe7d
This diff is collapsed.
pyproject.toml
View file @ be5dfe7d
@@ -6,6 +6,7 @@ authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@r
 [tool.poetry.dependencies]
 python = "^3.8"
+nltk = "^3.7"
 [tool.poetry.dev-dependencies]
 pytest = "^5.2"
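For context (not part of the commit): the nltk dependency added above provides the tokenizers that textflow/Sequence.py now imports. A minimal sketch of their behaviour, assuming a standard NLTK installation:

from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer

text = "Lorem ipsum dolor sit amet."
# WhitespaceTokenizer is the default that Sequence falls back to; it keeps "amet." intact.
print(WhitespaceTokenizer().tokenize(text))  # ['Lorem', 'ipsum', 'dolor', 'sit', 'amet.']
# WordPunctTokenizer splits punctuation into its own token.
print(WordPunctTokenizer().tokenize(text))   # ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', '.']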
tests/test_sequence.py
View file @ be5dfe7d
@@ -83,45 +83,42 @@ def test_len(sequence, expected):
     [
         pytest.param(
             Sequence("string", "Lorem ipsum"),
-            [Sequence() for _ in range(2)]
+            [Sequence("token", tkn) for tkn in "Lorem ipsum".split(" ")]
         ),
         pytest.param(
             Sequence("text", "tests/data/doc_1.txt"),
-            {
-                "child": [("string", "Lorem ipsum dolor sit amet"), ("string", "Nam lectus turpis")],
-                "sequence": [Sequence() for _ in range(2)]
-            }
+            [Sequence("token", tkn) for tkn in "Lorem ipsum".split(" ")]
         ),
         pytest.param(
             Sequence("directory", "tests/data"),
-            2
+            [Sequence().initFromDocument("tests/data/doc_1.txt", "tokens", "token")]
         )
     ]
 )
 def test_iter(sequence, expected):
     assert iter(sequence).__next__() == expected


 @pytest.mark.parametrize(
     "sequence, expected",
     [
         pytest.param(
             Sequence("string", "Lorem ipsum dolor sit amet"),
-            [Sequence() for _ in range(5)]
+            Sequence("string", "Lorem ipsum dolor sit amet").children["tokens"]
         ),
         pytest.param(
             Sequence("text", "tests/data/doc_1.txt"),
-            [Sequence() for _ in range(8)]
+            Sequence("text", "tests/data/doc_1.txt").children["tokens"]
         ),
         pytest.param(
             Sequence("directory", "tests/data"),
-            2
+            Sequence("directory", "tests/data").children["files"]
         )
     ]
 )
 def test_getitem(sequence, expected):
     assert sequence[0] == expected


 @pytest.mark.parametrize(
     "sequence, expected",
     [
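A hedged reading aid (not part of the diff): with the updated parameters, test_getitem now checks that positional indexing on a Sequence returns the corresponding children list. Assuming the import path textflow.Sequence, and that string indexing returns self.children[idx] as the new __getitem__ docstring describes:

from textflow.Sequence import Sequence  # assumed import path

seq = Sequence("string", "Lorem ipsum dolor sit amet")
# Positional access: index 0 resolves to the first entry of the children dict.
assert seq[0] == seq.children["tokens"]
# Access by label, per the new __getitem__ docstring.
assert seq["tokens"] == seq.children["tokens"]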
textflow/Sequence.py
View file @ be5dfe7d
 import os
 from typing import Optional
+from nltk.tokenize import TreebankWordTokenizer
+from nltk.tokenize import WhitespaceTokenizer
+from nltk.tokenize import SpaceTokenizer
+from nltk.tokenize import WordPunctTokenizer

-class SequenceIterator: #TODO document
+class SequenceIterator:
     def __init__(self, children):
         """
         Creates a sequenceIterator from a Sequence.
@@ -47,7 +51,7 @@ class Sequence:
         text: ...
         sequences: ...
     """
-    def __init__(self, format: Optional[str] = None, src: Optional[object] = None):
+    def __init__(self, format: Optional[str] = None, src: Optional[object] = None, tokenizer: Optional[object] = None):
         """Creates a sequence from an input object.
         Args:
@@ -61,6 +65,8 @@ class Sequence:
             raise ValueError(
                 f"{format} is not a valid format. Valid formats: {_VALID_FORMATS}"
             )
+        if tokenizer == None:
+            tokenizer = WhitespaceTokenizer()
         self.format = format
         self.children = {}
@@ -70,13 +76,13 @@ class Sequence:
                 raise ValueError(f"{src} is not an instance of token")
             self.metadata["text"] = src
         if format == "string":
-            self.initFromString(src, "tokens", "token")
+            self.initFromString(src, "tokens", "token", tokenizer)
         if format == "text":
-            self.initFromDocument(src, "tokens", "token")
+            self.initFromDocument(src, "tokens", "token", tokenizer)
         if format == "directory":
-            self.initFromDirectory(src, "directory", "files")
+            self.initFromDirectory(src, "directory", "files", tokenizer)

-    def initFromDirectory(self, directory, labelDirectory, labelFile): #TODO Default initializer for a directory
+    def initFromDirectory(self, directory, labelDirectory, labelFile, tokenizer):
         '''
         Initialize a Sequence from a directory
         Args:
@@ -101,12 +107,12 @@ class Sequence:
             else:
                 self.metadata["directoriesPath"].append(directory+"/"+file)
                 if labelDirectory in self.children:
-                    self.children[labelDirectory].append(Sequence("directory", directory+"/"+file))
+                    self.children[labelDirectory].append(Sequence("directory", directory+"/"+file, tokenizer))
                 else:
-                    self.children[labelDirectory] = [Sequence("directory", directory+"/"+file)]
+                    self.children[labelDirectory] = [Sequence("directory", directory+"/"+file, tokenizer)]

-    def initFromDocument(self, documentPath, labelSubSequence, formatSubsequence):
+    def initFromDocument(self, documentPath, labelSubSequence, formatSubsequence, tokenizer):
         '''
         Initialize a Sequence from a document
         Args:
@@ -117,10 +123,10 @@ class Sequence:
         self.format = "text"
         with open(documentPath, "r") as f:
             txt = f.read()
-        self.children[labelSubSequence] = [Sequence(formatSubsequence, token_src) for token_src in txt.split(" ")]
+        self.children[labelSubSequence] = [Sequence(formatSubsequence, token_src) for token_src in tokenizer.tokenize(txt)]
         self.metadata["text"] = txt

-    def initFromString(self, srcString, labelSubSequence, formatSubsequence):
+    def initFromString(self, srcString, labelSubSequence, formatSubsequence, tokenizer):
         '''
         Initialize a Sequence from a string
         Args:
@@ -133,7 +139,7 @@ class Sequence:
         if not isinstance(srcString, str):
             raise ValueError(f"{srcString} is not an instance of string")
         self.format = "string"
-        self.children[labelSubSequence] = [Sequence(formatSubsequence, token_src) for token_src in srcString.split(" ")]
+        self.children[labelSubSequence] = [Sequence(formatSubsequence, token_src) for token_src in tokenizer.tokenize(srcString)]
         self.metadata["text"] = srcString
@@ -180,8 +186,15 @@ class Sequence:
         '''
         return SequenceIterator(list(self.children.values()))

-    def __getitem__(self, idx): #TODO documentation
+    def __getitem__(self, idx):
+        '''
+        Get the value of a key from the dictionary of children
+
+        Args:
+            idx: a string that represent the key of the children dictionary
+                or an integer that represent the position of the key in children dictionary keys
+
+        Returns:
+            A List of Sequences
+        '''
         if isinstance(idx, str): # Get src by string (e.g. seq["doc1"])
             if self.children:
                 if idx in self.children:
@@ -198,6 +211,12 @@ class Sequence:
             else: # TODO: Should it support slices (e.g. [2:4])?
                 raise ValueError(f"Sequence id '{idx}' not found in {self.children}")

+    def __eq__(self, other):
+        if self.format == other.format and self.metadata == other.metadata and self.children == other.children:
+            return True
+        else:
+            return False
+
     def depth(self, diccionaryList: Optional[list] = None):
         '''
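A usage sketch of the new optional tokenizer parameter (illustrative only, not part of the commit): any object exposing tokenize(text) should work, and the constructor falls back to WhitespaceTokenizer when the argument is omitted. The import path is assumed from the textflow/Sequence.py module layout.

from nltk.tokenize import WordPunctTokenizer
from textflow.Sequence import Sequence  # assumed import path

# Default: whitespace tokenization, so "amet." stays one token (5 children).
default_seq = Sequence("string", "Lorem ipsum dolor sit amet.")
print(len(default_seq.children["tokens"]))  # 5

# Explicit tokenizer: WordPunctTokenizer splits the period off (6 children).
punct_seq = Sequence("string", "Lorem ipsum dolor sit amet.", WordPunctTokenizer())
print(len(punct_seq.children["tokens"]))    # 6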