diff --git a/parser/parser.py b/parser/parser.py index 7f78e79..f713e97 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -66,6 +66,7 @@ def parseArguments(sysArgs): parser.add_argument( '--route_to_content', dest='route', + required=True, help="""The route (i.e. path) to the node the textual content needs to be extracted from. To extract the content from all 'content' nodes that are direct children of 'parent', provide 'parent#content'. If the content is in an attribute, you can do: @@ -85,14 +86,16 @@ def parseArguments(sysArgs): return parsedArgs +def fatal(message): + print(message) + sys.exit(1) + ### # Do the work ### def main(sysArgs): args = parseArguments(sysArgs) - print(args) - for folder, subs, files in os.walk(args.root_dir): for filename in files: if filename.endswith(args.extension): @@ -118,7 +121,11 @@ def collect_text(xml, route_to_text): skip_first = xml.tag == route_to_text.split('#')[0] route = parse_route(route_to_text, skip_first) - nodes_with_text = xml.findall(route['query']) + + try: + nodes_with_text = xml.findall(route['query']) + except SyntaxError: + fatal("Your route contains invalid syntax. Please review it and try again.") text = [] diff --git a/parser/test_parser.py b/parser/test_parser.py index 3ebada2..842ab47 100644 --- a/parser/test_parser.py +++ b/parser/test_parser.py @@ -78,6 +78,25 @@ def test_collect_text_multiple_text_attributes(): assert actual == "TEXT TEXT2" +def test_collect_text_complex_structure(): + xml = ET.fromstring(""" + + + + Text + + + + + Text2 + + + + """) + actual = collect_text(xml, 'content') + assert actual == "Text Text2" + + def test_collect_text_complex_structure_wildcard(): xml = ET.fromstring(""" @@ -92,6 +111,25 @@ def test_collect_text_complex_structure_wildcard(): actual = collect_text(xml, 'parent#*#contentnode[content]') assert actual == "TEXT TEXT2" + +def test_collect_text_nonexisting_in_route(): + xml = ET.fromstring( + 'TEXT') + actual = collect_text(xml, 'nonexisting') + assert actual == "" + + +def test_collect_text_nonsense_in_route(): + xml = ET.fromstring( + 'TEXT') + + with pytest.raises(SystemExit) as pytest_wrapped_e: + collect_text(xml, '@#!') + + assert pytest_wrapped_e.type == SystemExit + assert pytest_wrapped_e.value.code == 1 + + @pytest.mark.xfail(raises=ET.ParseError) def test_collect_text_html_character(): ''' @@ -105,23 +143,26 @@ def test_collect_text_html_character(): def test_extract_text_europeana_one_textline(): - actual = extract_text(os.path.join(basepath(),"test_files/europeana_one_textline.xml"), + actual = extract_text(os.path.join(basepath(), "test_files/europeana_one_textline.xml"), '{http://schema.ccs-gmbh.com/ALTO}string[content]') assert actual == "Indien men Italiƫ in zijn geheel kon neutraliseren ," def test_extract_text_europeana_one_textbox(): print(os.path.join(basepath(), "test_files/europeana_one_textbox.txt")) - + with open(os.path.join(basepath(), "test_files/europeana_one_textbox.txt"), "r") as txt: expected = txt.read() actual = extract_text(os.path.join(basepath(), "test_files/europeana_one_textbox.xml"), '{http://schema.ccs-gmbh.com/ALTO}string[content]') assert actual == expected + def test_extract_text_icabish(): - actual = extract_text(os.path.join(basepath(),"test_files/icab-ish.xml"), 'TEXT') + actual = extract_text(os.path.join( + basepath(), "test_files/icab-ish.xml"), 'TEXT') assert actual == "Some text to test" + def basepath(): - return os.path.dirname(os.path.realpath(__file__)) \ No newline at end of file + return os.path.dirname(os.path.realpath(__file__))