Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Jaime Collado
/
textflow
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
1
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
be5dfe7d
authored
Apr 25, 2022
by
Estrella Vallecillo
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
getItem test fixed
parent
e146fd66
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
41 additions
and
24 deletions
poetry.lock
pyproject.toml
tests/test_sequence.py
textflow/Sequence.py
poetry.lock
View file @
be5dfe7d
This diff is collapsed.
Click to expand it.
pyproject.toml
View file @
be5dfe7d
...
...
@@ -6,6 +6,7 @@ authors = ["Jaime Collado <jcollado@ujaen.es>", "Estrella Vallecillo <mevr0003@r
[tool.poetry.dependencies]
python
=
"^3.8"
nltk
=
"^3.7"
[tool.poetry.dev-dependencies]
pytest
=
"^5.2"
...
...
tests/test_sequence.py
View file @
be5dfe7d
...
...
@@ -83,45 +83,42 @@ def test_len(sequence, expected):
[
pytest
.
param
(
Sequence
(
"string"
,
"Lorem ipsum"
),
[
Sequence
(
)
for
_
in
range
(
2
)]
[
Sequence
(
"token"
,
tkn
)
for
tkn
in
"Lorem ipsum"
.
split
(
" "
)]
),
pytest
.
param
(
Sequence
(
"text"
,
"tests/data/doc_1.txt"
),
{
"child"
:
[(
"string"
,
"Lorem ipsum dolor sit amet"
),
(
"string"
,
"Nam lectus turpis"
)],
"sequence"
:
[
Sequence
()
for
_
in
range
(
2
)]
}
[
Sequence
(
"token"
,
tkn
)
for
tkn
in
"Lorem ipsum"
.
split
(
" "
)]
),
pytest
.
param
(
Sequence
(
"directory"
,
"tests/data"
),
2
[
Sequence
()
.
initFromDocument
(
"tests/data/doc_1.txt"
,
"tokens"
,
"token"
)]
)
]
)
def
test_iter
(
sequence
,
expected
):
assert
iter
(
sequence
)
.
__next__
()
==
expected
@pytest.mark.parametrize
(
"sequence, expected"
,
[
pytest
.
param
(
Sequence
(
"string"
,
"Lorem ipsum dolor sit amet"
),
[
Sequence
()
for
_
in
range
(
5
)
]
Sequence
(
"string"
,
"Lorem ipsum dolor sit amet"
)
.
children
[
"tokens"
]
),
pytest
.
param
(
Sequence
(
"text"
,
"tests/data/doc_1.txt"
),
[
Sequence
()
for
_
in
range
(
8
)
]
Sequence
(
"text"
,
"tests/data/doc_1.txt"
)
.
children
[
"tokens"
]
),
pytest
.
param
(
Sequence
(
"directory"
,
"tests/data"
),
2
Sequence
(
"directory"
,
"tests/data"
)
.
children
[
"files"
]
)
]
)
def
test_getitem
(
sequence
,
expected
):
assert
sequence
[
0
]
==
expected
@pytest.mark.parametrize
(
"sequence, expected"
,
[
...
...
textflow/Sequence.py
View file @
be5dfe7d
import
os
from
typing
import
Optional
from
nltk.tokenize
import
TreebankWordTokenizer
from
nltk.tokenize
import
WhitespaceTokenizer
from
nltk.tokenize
import
SpaceTokenizer
from
nltk.tokenize
import
WordPunctTokenizer
class
SequenceIterator
:
#TODO documentar
class
SequenceIterator
:
def
__init__
(
self
,
children
):
"""
Creates a sequenceIterator from a Sequence.
...
...
@@ -47,7 +51,7 @@ class Sequence:
text: ...
sequences: ...
"""
def
__init__
(
self
,
format
:
Optional
[
str
]
=
None
,
src
:
Optional
[
object
]
=
None
):
def
__init__
(
self
,
format
:
Optional
[
str
]
=
None
,
src
:
Optional
[
object
]
=
None
,
tokenizer
:
Optional
[
object
]
=
None
):
"""Creates a sequence from an input object.
Args:
...
...
@@ -61,6 +65,8 @@ class Sequence:
raise
ValueError
(
f
"{format} is not a valid format. Valid formats: {_VALID_FORMATS}"
)
if
tokenizer
==
None
:
tokenizer
=
WhitespaceTokenizer
()
self
.
format
=
format
self
.
children
=
{}
...
...
@@ -70,13 +76,13 @@ class Sequence:
raise
ValueError
(
f
"{src} is not an instance of token"
)
self
.
metadata
[
"text"
]
=
src
if
format
==
"string"
:
self
.
initFromString
(
src
,
"tokens"
,
"token"
)
self
.
initFromString
(
src
,
"tokens"
,
"token"
,
tokenizer
)
if
format
==
"text"
:
self
.
initFromDocument
(
src
,
"tokens"
,
"token"
)
self
.
initFromDocument
(
src
,
"tokens"
,
"token"
,
tokenizer
)
if
format
==
"directory"
:
self
.
initFromDirectory
(
src
,
"directory"
,
"files"
)
self
.
initFromDirectory
(
src
,
"directory"
,
"files"
,
tokenizer
)
def
initFromDirectory
(
self
,
directory
,
labelDirectory
,
labelFile
):
#TODO Inicializador por defecto para un directorio
def
initFromDirectory
(
self
,
directory
,
labelDirectory
,
labelFile
,
tokenizer
):
'''
Initialize a Sequence from a directory
Args:
...
...
@@ -101,12 +107,12 @@ class Sequence:
else
:
self
.
metadata
[
"directoriesPath"
]
.
append
(
directory
+
"/"
+
file
)
if
labelDirectory
in
self
.
children
:
self
.
children
[
labelDirectory
]
.
append
(
Sequence
(
"directory"
,
directory
+
"/"
+
file
))
self
.
children
[
labelDirectory
]
.
append
(
Sequence
(
"directory"
,
directory
+
"/"
+
file
,
tokenizer
))
else
:
self
.
children
[
labelDirectory
]
=
[
Sequence
(
"directory"
,
directory
+
"/"
+
file
)]
self
.
children
[
labelDirectory
]
=
[
Sequence
(
"directory"
,
directory
+
"/"
+
file
,
tokenizer
)]
def
initFromDocument
(
self
,
documentPath
,
labelSubSequence
,
formatSubsequence
):
def
initFromDocument
(
self
,
documentPath
,
labelSubSequence
,
formatSubsequence
,
tokenizer
):
'''
Initialize a Sequence from a document
Args:
...
...
@@ -117,10 +123,10 @@ class Sequence:
self
.
format
=
"text"
with
open
(
documentPath
,
"r"
)
as
f
:
txt
=
f
.
read
()
self
.
children
[
labelSubSequence
]
=
[
Sequence
(
formatSubsequence
,
token_src
)
for
token_src
in
t
xt
.
split
(
" "
)]
self
.
children
[
labelSubSequence
]
=
[
Sequence
(
formatSubsequence
,
token_src
)
for
token_src
in
t
okenizer
.
tokenize
(
txt
)]
self
.
metadata
[
"text"
]
=
txt
def
initFromString
(
self
,
srcString
,
labelSubSequence
,
formatSubsequence
):
def
initFromString
(
self
,
srcString
,
labelSubSequence
,
formatSubsequence
,
tokenizer
):
'''
Initialize a Sequence from a string
Args:
...
...
@@ -133,7 +139,7 @@ class Sequence:
if
not
isinstance
(
srcString
,
str
):
raise
ValueError
(
f
"{srcString} is not an instance of string"
)
self
.
format
=
"string"
self
.
children
[
labelSubSequence
]
=
[
Sequence
(
formatSubsequence
,
token_src
)
for
token_src
in
srcString
.
split
(
" "
)]
self
.
children
[
labelSubSequence
]
=
[
Sequence
(
formatSubsequence
,
token_src
)
for
token_src
in
tokenizer
.
tokenize
(
srcString
)]
self
.
metadata
[
"text"
]
=
srcString
...
...
@@ -180,8 +186,15 @@ class Sequence:
'''
return
SequenceIterator
(
list
(
self
.
children
.
values
()))
def
__getitem__
(
self
,
idx
):
#TODO Documentacion
def
__getitem__
(
self
,
idx
):
'''
Get the value of a key from the dictionary of children
Args:
idx: a string that represent the key of the children dictionary
or an integer that represent the position of the key in children dictionary keys
Returns:
A List of Sequences
'''
if
isinstance
(
idx
,
str
):
# Get src by string (e.g. seq["doc1"])
if
self
.
children
:
if
idx
in
self
.
children
:
...
...
@@ -198,6 +211,12 @@ class Sequence:
else
:
# TODO: Should it support slices (e.g. [2:4])?
raise
ValueError
(
f
"Sequence id '{idx}' not found in {self.children}"
)
def
__eq__
(
self
,
other
):
if
self
.
format
==
other
.
format
and
self
.
metadata
==
other
.
metadata
and
self
.
children
==
other
.
children
:
return
True
else
:
return
False
def
depth
(
self
,
diccionaryList
:
Optional
[
list
]
=
None
):
'''
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment