Skip to content
This repository has been archived by the owner on Jul 25, 2024. It is now read-only.

Commit

Permalink
Refer #24. Bio converter updated to base itself on text instead of ent…
Browse files Browse the repository at this point in the history
…ities. Real world example / test added.
  • Loading branch information
Alex Hebing committed Jun 18, 2019
1 parent 09f85ff commit 953edf9
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 31 deletions.
65 changes: 41 additions & 24 deletions helpers/bio_converter.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,58 @@


def convert_to_bio(text, entities):
    """Tag every whitespace-separated word of ``text`` as entity or non-entity.

    The text is walked word by word. For each word a 1-based character
    position is determined and compared with the span of the entity that is
    currently being looked for. Words inside that span are tagged through
    ``tag_entity``, all other words through ``tag_non_entities``.

    Args:
        text: The text the entity positions refer to.
        entities: Sequence of dicts with the keys 'pos', 'ne' and 'type'.
            Entities are consumed strictly in the given order, so they are
            assumed to be sorted by 'pos' (TODO confirm against the caller).

    Returns:
        A list with one "<word> <tag>" string per word of ``text``.
    """
    bio_tagged = []
    current_entity_index = 0
    search_from = 0

    for word in text.split():
        # Look the word up in the text itself rather than adding
        # len(word) + 1 per word: that shortcut assumes exactly one separator
        # character between words and drifts as soon as the text contains
        # leading or repeated whitespace. The first match at or after the end
        # of the previous word is always the word itself, because only
        # whitespace lies in between.
        word_start = text.index(word, search_from)
        search_from = word_start + len(word)

        # Positions are 1-based, i.e. the first character of the text is 1.
        text_index = word_start + 1

        current_entity = None
        if current_entity_index < len(entities):
            current_entity = entities[current_entity_index]

        if current_entity is not None and is_part_of_entity(text_index, current_entity):
            tag_entity(bio_tagged, current_entity, word)

            # Only move on to the next entity once its last word was seen.
            if ends_entity(word, current_entity):
                current_entity_index += 1
        else:
            tag_non_entities(bio_tagged, word)

    return bio_tagged


def is_part_of_entity(text_index, entity):
    """Return True when ``text_index`` lies within the span of ``entity``.

    The span starts at ``entity['pos']`` and ends at
    ``entity['pos'] + len(entity['ne'])``; both bounds are inclusive.

    Args:
        text_index: Character position of the word that is being checked.
        entity: Dict with at least the keys 'pos' (start position) and
            'ne' (the entity text).

    Returns:
        bool: Whether the position falls inside the entity span.
    """
    start = entity['pos']
    return start <= text_index <= start + len(entity['ne'])

def ends_entity(word, entity):
    """Return True when the text of ``entity`` ends with ``word``.

    This is a plain suffix comparison on ``entity['ne']``; the position of
    the word in the text is not taken into account.

    Args:
        word: The word that was just tagged.
        entity: Dict with at least the key 'ne' (the entity text).

    Returns:
        bool: Whether ``word`` is a suffix of the entity text.
    """
    return entity['ne'].endswith(word)


def tag_non_entities(bio_tagged, text):
    """Append a "<word> O" entry to ``bio_tagged`` for every word in ``text``.

    Args:
        bio_tagged: List that collects the tagged words; modified in place.
        text: A single word or a longer piece of text without entities.
    """
    # split() without an argument splits on any run of whitespace and never
    # yields empty strings, so repeated, leading or trailing whitespace does
    # not produce empty " O" entries and no token can contain a line break.
    bio_tagged.extend("{} O".format(word) for word in text.split())

def tag_entity(bio_tagged, entity, word):
    """Append ``word`` to ``bio_tagged``, labelled with the tag of ``entity``.

    The entity type is translated with ``translate_to_bio`` and the result is
    written behind the word as-is, separated by a single space.

    Args:
        bio_tagged: List that collects the tagged words; modified in place.
        entity: Dict with at least the key 'type'.
        word: The word from the text that belongs to the entity.
    """
    tag = translate_to_bio(entity['type'])
    bio_tagged.append("{} {}".format(word, tag))


def translate_to_bio(entity_type):
Expand Down
38 changes: 31 additions & 7 deletions helpers/test_bio_converter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os, json
from bio_converter import convert_to_bio

def test_convert_to_bio_one_entity():
Expand All @@ -15,7 +16,7 @@ def test_convert_to_bio_one_entity():
}]

bio = convert_to_bio(text, entities)
expected = ['Simple O', 'line O', 'with O', 'one O', 'Entity B-LOC']
expected = ['Simple O', 'line O', 'with O', 'one O', 'Entity LOC']
assert bio == expected

def test_convert_to_bio_one_entity_two_words():
Expand All @@ -33,7 +34,7 @@ def test_convert_to_bio_one_entity_two_words():
}]

bio = convert_to_bio(text, entities)
expected = ['Simple O', 'line O', 'with O', 'one O', 'TWO B-LOC', 'WORDS I-LOC']
expected = ['Simple O', 'line O', 'with O', 'one O', 'TWO LOC', 'WORDS LOC']
assert bio == expected


Expand Down Expand Up @@ -63,8 +64,8 @@ def test_convert_to_bio_two_entities():

bio = convert_to_bio(text, entities)
expected = [
'Simple O', 'line O', 'with O', 'one O', 'Entity B-LOC', 'in O', 'the O', 'middle O',
'and O', 'ANOTHER B-PER', 'ONE I-PER', 'much O', 'further O', 'on. O']
'Simple O', 'line O', 'with O', 'one O', 'Entity LOC', 'in O', 'the O', 'middle O',
'and O', 'ANOTHER PER', 'ONE PER', 'much O', 'further O', 'on. O']

assert bio == expected

Expand All @@ -83,8 +84,31 @@ def test_convert_to_bio_one_entity_three_words():
}]

bio = convert_to_bio(text, entities)
expected = ['Simple O', 'ENTITY B-ORG', 'THREE I-ORG', 'WORDS I-ORG', 'and O', 'more O', 'words O']
expected = ['Simple O', 'ENTITY ORG', 'THREE ORG', 'WORDS ORG', 'and O', 'more O', 'words O']

print(bio)
print(expected)
assert bio == expected

def test_convert_to_bio_real_example(tmpdir):
    """Convert a real-world text/entities pair and compare with a stored result.

    The text and its entities are read from the 'test_files' folder next to
    this module. The converter output is written to a temporary file, one
    entry per line, and read back, so that it can be compared line by line
    with the expected '.bio' file from the same folder.
    """
    test_files_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test_files')
    # All three fixture files share this base name and differ only in suffix.
    base_name = 'urn=ddd_000010470_mpeg21_p002_alto.alto.xml'

    with open(os.path.join(test_files_folder, base_name + '.txt'), 'r') as fh:
        text = fh.read()

    with open(os.path.join(test_files_folder, base_name + '.entities.json'), 'r') as fh:
        entities = json.load(fh)['entities']

    bio = convert_to_bio(text, entities)

    # Round-trip through a file so both sides of the comparison are produced
    # by readlines() and therefore carry identical line endings.
    temp_file = str(tmpdir.join('tempout.bio'))
    with open(temp_file, 'w') as fh:
        fh.writelines("%s\n" % line for line in bio)

    with open(temp_file, 'r') as fh:
        actual = fh.readlines()

    with open(os.path.join(test_files_folder, base_name + '.bio'), 'r') as fh:
        expected = fh.readlines()

    assert actual == expected

0 comments on commit 953edf9

Please sign in to comment.