def convert_to_bio(text, entities):
    """Tag every whitespace-separated word of ``text`` as entity or non-entity.

    Each output item is the string ``"<word> <tag>"``. Words that belong to an
    entity get the tag produced by ``tag_entity``; every other word gets ``O``.

    Args:
        text: The raw text to tag.
        entities: Sequence of dicts with the keys ``'pos'`` (character position
            of the entity in ``text``), ``'ne'`` (the entity's surface text)
            and ``'type'``. The list is walked front to back exactly once, so
            it is assumed to be sorted by ``'pos'`` -- TODO confirm with the
            producer of the entity files.

    Returns:
        A list with one ``"<word> <tag>"`` string per word, in text order.
    """
    bio_tagged = []

    for word, entity in iter_word_entities(text, entities):
        if entity is None:
            tag_non_entities(bio_tagged, word)
        else:
            tag_entity(bio_tagged, entity, word)

    return bio_tagged


def iter_word_positions(text):
    """Yield ``(position, word)`` for each whitespace-separated word of ``text``.

    ``position`` is the real character offset of the word, counted from 1 for
    the first character of ``text``. Looking the word up in the text (instead
    of adding ``len(word) + 1`` per word) keeps positions correct when words
    are separated by more than one whitespace character.
    """
    search_from = 0
    for word in text.split():
        # split() yields the words in order, so each one is found at or after
        # the end of the previous one.
        start = text.index(word, search_from)
        search_from = start + len(word)
        yield start + 1, word


def iter_word_entities(text, entities):
    """Pair every word of ``text`` with the entity it belongs to, or ``None``.

    Args:
        text: The raw text.
        entities: Entity dicts as described in ``convert_to_bio``.

    Yields:
        ``(word, entity)`` tuples in text order; ``entity`` is ``None`` for
        words outside every entity.
    """
    current_entity_index = 0

    for text_index, word in iter_word_positions(text):
        # Skip entities that lie completely behind the current word. Without
        # this, one entity that never matches a word would block all the
        # entities after it.
        while (current_entity_index < len(entities)
               and _entity_end(entities[current_entity_index]) < text_index):
            current_entity_index += 1

        if current_entity_index == len(entities):
            yield word, None
            continue

        entity = entities[current_entity_index]
        if not is_part_of_entity(text_index, entity):
            yield word, None
            continue

        yield word, entity

        # The entity is finished once this word reaches its last character.
        # Comparing positions rather than text keeps this correct for repeated
        # words ("A B A") and for punctuation glued to the last word.
        if text_index + len(word) >= _entity_end(entity):
            current_entity_index += 1


def _entity_end(entity):
    """Return ``entity['pos'] + len(entity['ne'])``, the entity's end bound."""
    return entity['pos'] + len(entity['ne'])


def is_part_of_entity(text_index, entity):
    """Return True if a word starting at ``text_index`` belongs to ``entity``.

    Both bounds are inclusive: ``pos <= text_index <= pos + len(ne)``.
    """
    return entity['pos'] <= text_index <= entity['pos'] + len(entity['ne'])


def ends_entity(word, entity):
    """Return True if the entity text ends with ``word``.

    This is a pure suffix test on ``entity['ne']``. ``convert_to_bio`` no
    longer uses it (it decides by position instead); the function is kept so
    that existing callers keep working.
    """
    return entity['ne'].endswith(word)


def tag_non_entities(bio_tagged, text):
    """Append ``"<word> O"`` to ``bio_tagged`` for each piece of ``text``.

    ``text`` is split on single space characters only, so consecutive spaces
    produce empty pieces.
    """
    for word in text.split(' '):
        bio_tagged.append("{} O".format(word))


def tag_entity(bio_tagged, entity, word):
    """Append ``"<word> <tag>"`` to ``bio_tagged`` for one word of ``entity``.

    The tag is whatever ``translate_to_bio`` returns for ``entity['type']``;
    no ``B-``/``I-`` prefix is added.
    """
    tag = translate_to_bio(entity['type'])
    bio_tagged.append("{} {}".format(word, tag))