Skip to content
This repository has been archived by the owner on Jul 25, 2024. It is now read-only.

Commit

Permalink
#22. Some further tests added
Browse files Browse the repository at this point in the history
  • Loading branch information
Alex Hebing committed Jun 3, 2019
1 parent 51f9075 commit 31b67e2
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 7 deletions.
13 changes: 10 additions & 3 deletions parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def parseArguments(sysArgs):
parser.add_argument(
'--route_to_content',
dest='route',
required=True,
help="""The route (i.e. path) to the node the textual content needs to be extracted from.
To extract the content from all 'content' nodes that are direct children of 'parent',
provide 'parent#content'. If the content is in an attribute, you can do:
Expand All @@ -85,14 +86,16 @@ def parseArguments(sysArgs):

return parsedArgs

def fatal(message):
    """Report an unrecoverable error and terminate the program.

    The message is written to standard error so that it cannot be mixed up
    with regular output on standard out.

    Args:
        message: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    print(message, file=sys.stderr)
    sys.exit(1)

###
# Do the work
###
def main(sysArgs):
args = parseArguments(sysArgs)

print(args)

for folder, subs, files in os.walk(args.root_dir):
for filename in files:
if filename.endswith(args.extension):
Expand All @@ -118,7 +121,11 @@ def collect_text(xml, route_to_text):
skip_first = xml.tag == route_to_text.split('#')[0]

route = parse_route(route_to_text, skip_first)
nodes_with_text = xml.findall(route['query'])

try:
nodes_with_text = xml.findall(route['query'])
except SyntaxError:
fatal("Your route contains invalid syntax. Please review it and try again.")

text = []

Expand Down
49 changes: 45 additions & 4 deletions parser/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,25 @@ def test_collect_text_multiple_text_attributes():
assert actual == "TEXT TEXT2"


def test_collect_text_complex_structure():
    """Route 'content' yields the text of both nested <content> nodes, space-joined."""
    document = """<root>
<parent>
<child>
<grandchild>
<content>Text</content>
</grandchild>
</child>
<anotherchild>
<grandchild>
<content>Text2</content>
</grandchild>
</anotherchild>
</parent>
</root>"""
    tree = ET.fromstring(document)
    # The two <content> nodes live under different branches of <parent>.
    assert collect_text(tree, 'content') == "Text Text2"


def test_collect_text_complex_structure_wildcard():
xml = ET.fromstring("""<root>
<parent>
Expand All @@ -92,6 +111,25 @@ def test_collect_text_complex_structure_wildcard():
actual = collect_text(xml, 'parent#*#contentnode[content]')
assert actual == "TEXT TEXT2"


def test_collect_text_nonexisting_in_route():
    """A route naming a tag that is absent from the document yields an empty string."""
    markup = '<root><child><contentnode>TEXT</contentnode></child></root>'
    tree = ET.fromstring(markup)
    collected = collect_text(tree, 'nonexisting')
    assert collected == ""


def test_collect_text_nonsense_in_route():
    """The route '@#!' makes collect_text terminate via SystemExit with status 1."""
    markup = '<root><child><contentnode>TEXT</contentnode></child></root>'
    tree = ET.fromstring(markup)

    with pytest.raises(SystemExit) as exit_info:
        collect_text(tree, '@#!')

    # Both the exception class and its exit status are checked.
    assert exit_info.type is SystemExit
    assert exit_info.value.code == 1


@pytest.mark.xfail(raises=ET.ParseError)
def test_collect_text_html_character():
'''
Expand All @@ -105,23 +143,26 @@ def test_collect_text_html_character():


def test_extract_text_europeana_one_textline():
    """Extracting the 'content' attribute of ALTO 'string' nodes yields the single text line."""
    xml_path = os.path.join(
        basepath(), "test_files/europeana_one_textline.xml")
    # Single, well-formed call: the route selects the namespaced 'string'
    # elements and reads their 'content' attribute.
    actual = extract_text(
        xml_path, '{http://schema.ccs-gmbh.com/ALTO}string[content]')
    assert actual == "Indien men Italië in zijn geheel kon neutraliseren ,"


def test_extract_text_europeana_one_textbox():
    """Text extracted from the textbox XML fixture equals the stored expected text file."""
    # NOTE(review): the fixture is assumed to be UTF-8 encoded - confirm.
    # An explicit encoding keeps the comparison independent of the locale
    # default used by open().
    with open(os.path.join(basepath(), "test_files/europeana_one_textbox.txt"),
              "r", encoding="utf-8") as txt:
        expected = txt.read()
    actual = extract_text(os.path.join(basepath(), "test_files/europeana_one_textbox.xml"),
                          '{http://schema.ccs-gmbh.com/ALTO}string[content]')
    assert actual == expected


def test_extract_text_icabish():
    """Route 'TEXT' applied to the icab-ish fixture yields its single sentence."""
    # Run the extraction once; a second identical call adds nothing.
    actual = extract_text(os.path.join(
        basepath(), "test_files/icab-ish.xml"), 'TEXT')
    assert actual == "Some text to test"


def basepath():
    """Return the absolute, symlink-resolved directory containing this file."""
    return os.path.dirname(os.path.realpath(__file__))

0 comments on commit 31b67e2

Please sign in to comment.