Refer #24. Bio converter updated to base itself on text instead of entities. Real world example / test added.
UUDigitalHumanitieslab · Jun 18, 2019 · 953edf9 · 953edf9
1 parent 09f85ff
commit 953edf9
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 31 deletions.
diff --git a/helpers/bio_converter.py b/helpers/bio_converter.py
@@ -1,41 +1,58 @@
 
 
 def convert_to_bio(text, entities):
+    text_index = 1
+    current_entity_index = 0
+
     bio_tagged = []
 
-    prev_entity_end = 0
+    for word in text.split():
+        if current_entity_index < len(entities) and is_part_of_entity(text_index, entities[current_entity_index]):
+            tag_entity(bio_tagged, entities[current_entity_index], word)
+
+            if (ends_entity(word, entities[current_entity_index])):
+                current_entity_index = current_entity_index + 1
+        else:
+            tag_non_entities(bio_tagged, word)
+
+        text_index = text_index + len(word) + 1
+
+    # prev_entity_end = 0
+
+    # for entity in entities:
+    #     index = entity['pos']
+    #     entity_to_end = len(text) - index + 1
+
+    #     text_in_between_entities = text[prev_entity_end:-entity_to_end]
+    #     prev_entity_end = index + len(entity['ne'])
+
+    #     tag_non_entities(bio_tagged, text_in_between_entities)
+    #     tag_entity(bio_tagged, entity)
 
-    for entity in entities:
-        index = entity['pos']
-        entity_to_end = len(text) - index + 1
-
-        text_in_between_entities = text[prev_entity_end:-entity_to_end]        
-        prev_entity_end = index + len(entity['ne'])
+    # text_after_last_entity = text[prev_entity_end:]
 
-        tag_non_entities(bio_tagged, text_in_between_entities)
-        tag_entity(bio_tagged, entity)    
-
-    text_after_last_entity = text[prev_entity_end:]
-    tag_non_entities(bio_tagged, text_after_last_entity)
+    # print(len(bio_tagged))
+
+    # tag_non_entities(bio_tagged, text_after_last_entity)
 
     return bio_tagged
 
+
+def is_part_of_entity(text_index, entity):
+    return entity['pos'] <= text_index and text_index <= entity['pos'] + len(entity['ne'])
+
+def ends_entity(word, entity):
+    return entity['ne'].endswith(word)
+
+
 def tag_non_entities(bio_tagged, text):
-    for word in text.split():
+    for word in text.split(' '):
         bio_tagged.append("{} O".format(word))
 
-def tag_entity(bio_tagged, entity):
+
+def tag_entity(bio_tagged, entity, word):
     tag = translate_to_bio(entity['type'])
-    entity_text = entity['ne'].split()
-
-    if len(entity_text) > 1:
-        for i in range(len(entity_text)):            
-            if i == 0:
-                bio_tagged.append("{} B-{}".format(entity_text[i], tag))
-            else:
-                bio_tagged.append("{} I-{}".format(entity_text[i], tag))
-    else:
-        bio_tagged.append("{} B-{}".format(entity['ne'], tag))
+    bio_tagged.append("{} {}".format(word, tag))
 
 
 def translate_to_bio(entity_type):

diff --git a/helpers/test_bio_converter.py b/helpers/test_bio_converter.py
@@ -1,3 +1,4 @@
+import os, json
 from bio_converter import convert_to_bio
 
 def test_convert_to_bio_one_entity():
@@ -15,7 +16,7 @@ def test_convert_to_bio_one_entity():
     }]
 
     bio = convert_to_bio(text, entities)
-    expected = ['Simple O', 'line O', 'with O', 'one O', 'Entity B-LOC']
+    expected = ['Simple O', 'line O', 'with O', 'one O', 'Entity LOC']
     assert bio == expected
 
 def test_convert_to_bio_one_entity_two_words():
@@ -33,7 +34,7 @@ def test_convert_to_bio_one_entity_two_words():
     }]
 
     bio = convert_to_bio(text, entities)
-    expected = ['Simple O', 'line O', 'with O', 'one O', 'TWO B-LOC', 'WORDS I-LOC']
+    expected = ['Simple O', 'line O', 'with O', 'one O', 'TWO LOC', 'WORDS LOC']
     assert bio == expected
 
 
@@ -63,8 +64,8 @@ def test_convert_to_bio_two_entities():
 
     bio = convert_to_bio(text, entities)
     expected = [
-        'Simple O', 'line O', 'with O', 'one O', 'Entity B-LOC', 'in O', 'the O', 'middle O', 
-        'and O', 'ANOTHER B-PER', 'ONE I-PER', 'much O', 'further O', 'on. O']
+        'Simple O', 'line O', 'with O', 'one O', 'Entity LOC', 'in O', 'the O', 'middle O', 
+        'and O', 'ANOTHER PER', 'ONE PER', 'much O', 'further O', 'on. O']
 
     assert bio == expected
 
@@ -83,8 +84,31 @@ def test_convert_to_bio_one_entity_three_words():
     }]
 
     bio = convert_to_bio(text, entities)
-    expected = ['Simple O', 'ENTITY B-ORG', 'THREE I-ORG', 'WORDS I-ORG', 'and O', 'more O', 'words O']
+    expected = ['Simple O', 'ENTITY ORG', 'THREE ORG', 'WORDS ORG', 'and O', 'more O', 'words O']
 
-    print(bio)
-    print(expected)
     assert bio == expected
+
+def test_convert_to_bio_real_example(tmpdir):
+    test_files_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test_files')
+
+    with open(os.path.join(test_files_folder, 'urn=ddd_000010470_mpeg21_p002_alto.alto.xml.txt'), 'r') as fh:
+        text = fh.read()
+
+    with open(os.path.join(test_files_folder, 'urn=ddd_000010470_mpeg21_p002_alto.alto.xml.entities.json'), 'r') as fh:
+        entities_full = json.load(fh)
+        entities = entities_full['entities']
+
+    bio = convert_to_bio(text, entities)
+
+    temp_file = tmpdir.join('tempout.bio')
+    with open(temp_file, 'w') as fh:
+        for line in bio:
+            fh.write("%s\n" % line)
+
+    with open(temp_file, 'r') as fh:
+        actual = fh.readlines()
+
+    with open(os.path.join(test_files_folder, 'urn=ddd_000010470_mpeg21_p002_alto.alto.xml.bio'), 'r') as fh:
+        expected = fh.readlines()
+
+    assert actual == expected