diff --git a/parser/README.md b/parser/README.md index c38a696..a69916d 100644 --- a/parser/README.md +++ b/parser/README.md @@ -1,10 +1,6 @@ # `parser.py` -`parser.py` is part of [a pipeline of scripts](../README.md). As it name suggests, it can be used to parse XML files into plain .txt files that contain the text content from the desired elements. In essence it is a wrapper around Python's [ElementTree XML API](https://docs.python.org/2/library/xml.etree.elementtree.html), allowing for some very basic text extraction options. - -## HTML entities - -`parser.py`, before loading the XML from the files into ElementTree to start looking for the texts you need, unescapes any HTML characters that might be in the document. For example, if the document contains the text 'Italië', this will be converted to 'ItaliĆ«' before the XML is loaded. This is dome to prevent parsing errors from ElementTree. +`parser.py` is part of [a pipeline of scripts](../README.md). As its name suggests, it can be used to parse XML and HTML files into plain .txt files that contain the text content from the desired elements. In essence it is a wrapper around [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/), allowing for some very basic text extraction options. Under the hood it utilizes BeautifulSoup's HTML parser, making it a very generous (XML) parser that doesn't care about namespaces and allows for HTML entities in the (XML) documents. ## Command line arguments @@ -123,19 +119,6 @@ Consider the following XML: To extract the content here, you could use a wildcard: `child#*#content`. -#### Xml namespaces - -Beware of xml namespaces! If a namespace applies to the tag you want to extract, you have add it in front of the tagname. For example: - -```xml - - - - - -``` - -If you simply provide `child[content]` the script will NOT find anything! Since the parent declares a namespace, all children belong to this namespace too. 
Therefore, you'll need to provide: `{http://any.namespace.you/need}tagname`. #### Valid routes diff --git a/parser/parser.py b/parser/parser.py index f713e97..9cec024 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1,12 +1,14 @@ import argparse import os import sys -import xml.etree.ElementTree as ET -import html + +from bs4 import BeautifulSoup ### # Input validation ### + + def route(input): parts = input.split('#') for index in range(len(parts)): @@ -15,11 +17,11 @@ def route(input): if not part: raise argparse.ArgumentTypeError( - "route '{}' may not contain empty elements.".format(input)) + "route_to_content '{}' may not contain empty elements.".format(input)) if not is_last and '[' in part: raise argparse.ArgumentTypeError( - "route '{}' may only contain an attribute in the last element.".format(input)) + "route_to_content '{}' may only contain an attribute in the last element.".format(input)) return input @@ -40,7 +42,7 @@ def extension(input): def parseArguments(sysArgs): parser = argparse.ArgumentParser( - description='Extract text only from I-CAB corpus. Produces txt files with the plain text.') + description='Extract textual content from XML or HTML to plain text files.') parser.add_argument( '--dir', @@ -67,18 +69,15 @@ def parseArguments(sysArgs): '--route_to_content', dest='route', required=True, - help="""The route (i.e. path) to the node the textual content needs to be extracted from. - To extract the content from all 'content' nodes that are direct children of 'parent', - provide 'parent#content'. If the content is in an attribute, you can do: + help="""The route (i.e. path) to the node the textual content needs to be extracted from. + To extract the content from all 'content' nodes that are direct children of 'parent', + provide 'parent#content'. If the content is in an attribute, you can do: 'tagname[attributename]' (note that these are only allowed in the last element of the route). A route cannot contain empty elements (i.e. 
## is not allowed), but you can provide wildcards. If, for example, you need to extract text from a node 'text' that lives in 'sibling1' and 'sibling2', - which are both direct children of 'parent', you can provide 'parent#*#text'. - - Beware of namespaces! If a namespace applies to the tag you want to extract, you should add - it to the tagname. For example: '{http://any.namespace.you/need}tagname'. - + which are both direct children of 'parent', you can provide 'parent#*#text'. + More info and examples in the README""", type=route) @@ -86,6 +85,7 @@ def parseArguments(sysArgs): return parsedArgs + def fatal(message): print(message) sys.exit(1) @@ -93,66 +93,78 @@ def fatal(message): ### # Do the work ### + + def main(sysArgs): args = parseArguments(sysArgs) for folder, subs, files in os.walk(args.root_dir): for filename in files: if filename.endswith(args.extension): - text = extract_text(os.path.join(folder, filename), args.route) new_name = filename.replace(args.extension, '.txt') - print("extracting '{}' to '{}'".format(os.path.join( - folder, filename), os.path.join(args.output_folder, new_name))) - write_to_file(args.output_folder, new_name, text) + + if not os.path.exists(os.path.join(args.output_folder, new_name)): + print("Processing '{}'".format(filename)) + text = extract_text(os.path.join(folder, filename), args.route) + write_to_file(args.output_folder, new_name, text) def extract_text(file_path, route_to_text): - # first unescape HTML if it is present - with open(file_path, 'r') as file: - xml = file.read() - html_unescaped = html.unescape(xml) - root = ET.fromstring(html_unescaped) + ''' + Extract the desired text content from a file containing HTML or XML. 
+ ''' + try: + with open(file_path, 'r') as file: + soup = BeautifulSoup(file, features="html.parser") + except UnicodeDecodeError as e: + print("Error when decoding '{}' More info: {}".format(file_path, e)) + exit(1) - return collect_text(root, route_to_text) + return collect_text(soup, route_to_text) -def collect_text(xml, route_to_text): - # if the route contains the root tag we need to skip it - skip_first = xml.tag == route_to_text.split('#')[0] +def collect_text(soup, route_to_text): + ''' + Collect the text content from a BeautifulSoup instance based on a route. + ''' + parsed_route = parse_route(route_to_text) - route = parse_route(route_to_text, skip_first) - try: - nodes_with_text = xml.findall(route['query']) + elements = soup.select(parsed_route['query']) except SyntaxError: fatal("Your route contains invalid syntax. Please review it and try again.") + return get_text(elements, parsed_route['attr']) + + +def get_text(elements, attribute): + ''' + Get the text from a set of HTML or XML elements. + + Keyword arguments: + elements -- The set of elements containg the text content. + attribute -- The name of the attribute (of an element) that contains the text content. + Can be 'None' if the text is not in an attribute. + ''' text = [] - for node in nodes_with_text: - if not route['attr'] is None: - text.append(node.attrib[route['attr']]) + for elem in elements: + if not attribute is None: + text.append(elem.attrs[attribute]) else: - text.append(node.text) + text.append(elem.text) return ' '.join(text) -def parse_route(route, skip_first): +def parse_route(route): ''' - Parses a route of format 'node#childnode#childnode[attribute]' - into XPATH query - - actual = xml.findall(".//child/contentnode[@content]") - actual = xml.findall(".//child/contentnode") - + Parses a route of format 'node#childnode#subchildnode[attribute]' + into a CSS Selector (e.g. 
'node childnode subchildnode[attribute]') ''' tags = route.split('#') - if skip_first: - tags.pop(0) - - xpath_query = "./" + query = None attribute = None for i in range(len(tags)): @@ -162,12 +174,15 @@ def parse_route(route, skip_first): tag = tag_attribute[0] attribute = tag_attribute[1][:-1] - xpath_query = "{}/{}".format(xpath_query, tag) + if query is None: + query = tag + else: + query = "{} {}".format(query, tag) if not attribute is None: - xpath_query = "{}[@{}]".format(xpath_query, attribute) + query = "{}[{}]".format(query, attribute) - return {'query': xpath_query, 'attr': attribute} + return {'query': query, 'attr': attribute} def write_to_file(folder, filename, text): diff --git a/parser/test_parser.py b/parser/test_parser.py index 842ab47..63e7d7b 100644 --- a/parser/test_parser.py +++ b/parser/test_parser.py @@ -1,85 +1,88 @@ import os import pytest -import xml.etree.ElementTree as ET +from bs4 import BeautifulSoup from parser import collect_text, parse_route, extract_text def test_parse_route_attribute(): route = 'root#child#contentnode[content]' - actual = parse_route(route, True) + actual = parse_route(route) assert actual == {'attr': 'content', - 'query': './/child/contentnode[@content]'} + 'query': 'root child contentnode[content]'} def test_parse_route_no_attribute(): route = 'root#child#contentnode' - actual = parse_route(route, True) - assert actual == {'attr': None, 'query': './/child/contentnode'} - - -def test_parse_route_no_skip(): - route = 'child#anotherchild#contentnode' - actual = parse_route(route, False) - assert actual == {'attr': None, - 'query': './/child/anotherchild/contentnode'} + actual = parse_route(route) + assert actual == {'attr': None, 'query': 'root child contentnode'} def test_parse_route_single_tag(): route = 'contentnode' - actual = parse_route(route, False) - assert actual == {'attr': None, 'query': './/contentnode'} + actual = parse_route(route) + assert actual == {'attr': None, 'query': 'contentnode'} def 
test_parse_route_wildcard(): route = 'child#*#contentnode' - actual = parse_route(route, False) - assert actual == {'attr': None, 'query': './/child/*/contentnode'} + actual = parse_route(route) + assert actual == {'attr': None, 'query': 'child * contentnode'} def test_collect_text_single_full_route_no_attribute(): - xml = ET.fromstring( - 'TEXT') - actual = collect_text(xml, 'root#child#contentnode') + soup = BeautifulSoup( + 'TEXT', features="html.parser") + actual = collect_text(soup, 'root#child#contentnode') assert actual == "TEXT" def test_collect_text_single_no_full_route_no_attribute(): - xml = ET.fromstring( - 'TEXT') - actual = collect_text(xml, 'contentnode') + soup = BeautifulSoup( + 'TEXT', features="html.parser") + actual = collect_text(soup, 'contentnode') assert actual == "TEXT" def test_collect_text_single_full_route_attribute(): - xml = ET.fromstring( - '') - actual = collect_text(xml, 'root#child#contentnode[content]') + soup = BeautifulSoup( + '', features="html.parser") + actual = collect_text(soup, 'root#child#contentnode[content]') assert actual == "TEXT" def test_collect_text_single_no_full_route_attribute(): - xml = ET.fromstring( - '') - actual = collect_text(xml, 'contentnode[content]') + soup = BeautifulSoup( + '', features="html.parser") + actual = collect_text(soup, 'contentnode[content]') assert actual == "TEXT" def test_collect_text_multiple_text_nodes(): - xml = ET.fromstring( - 'TEXTTEXT2') - actual = collect_text(xml, 'contentnode') + soup = BeautifulSoup( + """ + + TEXT + TEXT2 + + """, features="html.parser") + actual = collect_text(soup, 'contentnode') assert actual == "TEXT TEXT2" def test_collect_text_multiple_text_attributes(): - xml = ET.fromstring( - '') - actual = collect_text(xml, 'contentnode[content]') + soup = BeautifulSoup( + """ + + + + + """, features="html.parser") + actual = collect_text(soup, 'contentnode[content]') assert actual == "TEXT TEXT2" def test_collect_text_complex_structure(): - xml = 
ET.fromstring(""" + soup = BeautifulSoup(""" @@ -92,13 +95,13 @@ def test_collect_text_complex_structure(): - """) - actual = collect_text(xml, 'content') + """, features="html.parser") + actual = collect_text(soup, 'content') assert actual == "Text Text2" def test_collect_text_complex_structure_wildcard(): - xml = ET.fromstring(""" + soup = BeautifulSoup(""" @@ -107,44 +110,32 @@ def test_collect_text_complex_structure_wildcard(): - """) - actual = collect_text(xml, 'parent#*#contentnode[content]') + """, features="html.parser") + actual = collect_text(soup, 'parent#*#contentnode[content]') assert actual == "TEXT TEXT2" def test_collect_text_nonexisting_in_route(): - xml = ET.fromstring( - 'TEXT') - actual = collect_text(xml, 'nonexisting') + soup = BeautifulSoup( + 'TEXT', features="html.parser") + actual = collect_text(soup, 'nonexisting') assert actual == "" def test_collect_text_nonsense_in_route(): - xml = ET.fromstring( - 'TEXT') - + soup = BeautifulSoup( + 'TEXT', features="html.parser") + with pytest.raises(SystemExit) as pytest_wrapped_e: - collect_text(xml, '@#!') + collect_text(soup, '@#!') assert pytest_wrapped_e.type == SystemExit assert pytest_wrapped_e.value.code == 1 -@pytest.mark.xfail(raises=ET.ParseError) -def test_collect_text_html_character(): - ''' - This test exists to prove that ElementTree cannnot handle the presence of HTML characters - in the XML. This is why `extract_text` unescapes these first. 
- ''' - - xml = ET.fromstring( - '') - collect_text(xml, 'contentnode[content]') - - def test_extract_text_europeana_one_textline(): actual = extract_text(os.path.join(basepath(), "test_files/europeana_one_textline.xml"), - '{http://schema.ccs-gmbh.com/ALTO}string[content]') + 'string[content]') assert actual == "Indien men ItaliĆ« in zijn geheel kon neutraliseren ," @@ -154,9 +145,9 @@ def test_extract_text_europeana_one_textbox(): with open(os.path.join(basepath(), "test_files/europeana_one_textbox.txt"), "r") as txt: expected = txt.read() actual = extract_text(os.path.join(basepath(), "test_files/europeana_one_textbox.xml"), - '{http://schema.ccs-gmbh.com/ALTO}string[content]') + 'string[content]') assert actual == expected - + def test_extract_text_icabish(): actual = extract_text(os.path.join( diff --git a/requirements.in b/requirements.in index a2f9044..816984a 100644 --- a/requirements.in +++ b/requirements.in @@ -2,4 +2,5 @@ Jinja2 geocoder requests argparse -pytest \ No newline at end of file +pytest +beautifulsoup4 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index cb5912f..ace6ff6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ argparse==1.4.0 atomicwrites==1.3.0 # via pytest attrs==19.1.0 # via pytest +beautifulsoup4==4.7.1 certifi==2019.3.9 # via requests chardet==3.0.4 # via requests click==7.0 # via geocoder @@ -28,6 +29,7 @@ ratelim==0.1.6 # via geocoder requests==2.21.0 scandir==1.10.0 # via pathlib2 six==1.12.0 # via geocoder, packaging, pathlib2, pytest +soupsieve==1.9.1 # via beautifulsoup4 urllib3==1.24.3 # via requests wcwidth==0.1.7 # via pytest zipp==0.5.1 # via importlib-metadata