Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for file-like object parsing #82

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions xbrl/helper/xml_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
It is used by the different parsing modules.
"""
import xml.etree.ElementTree as ET
from io import StringIO
from io import StringIO, IOBase


def parse_file(file: str or StringIO) -> ET.ElementTree:
def parse_file(file: str or IOBase or StringIO) -> ET.ElementTree:
"""
Parses a file, returns the Root element with an attribute 'ns_map' containing the prefix - namespaces map
:param file: either the file path (str) or a file-like object
Expand All @@ -18,6 +18,10 @@ def parse_file(file: str or StringIO) -> ET.ElementTree:
root = None
ns_map = []

    # Reset the file pointer to the beginning in case the file was already read from
if isinstance(file, IOBase):
file.seek(0, 0)

for event, elem in ET.iterparse(file, events):
if event == "start-ns":
ns_map.append(elem)
Expand Down
80 changes: 50 additions & 30 deletions xbrl/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import re
import abc
import logging
from io import StringIO, BytesIO
from io import StringIO, BytesIO, IOBase
from typing import List
import xml.etree.ElementTree as ET
from datetime import date, datetime
Expand Down Expand Up @@ -246,21 +246,23 @@ class XbrlInstance(abc.ABC):
"""
    Class representing an XBRL instance file
"""

def __init__(self, url: str, taxonomy: TaxonomySchema, facts: List[AbstractFact], context_map: dict,
def __init__(self, url: str or IOBase or StringIO, taxonomy: TaxonomySchema, facts: List[AbstractFact], context_map: dict,
unit_map: dict) -> None:
"""
:param taxonomy: taxonomy file that the instance file references (via link:schemaRef)
:param facts: array of all facts that the instance contains
"""
self.taxonomy: TaxonomySchema = taxonomy
self.facts: List[AbstractFact] = facts
self.instance_url: str = url
self.instance_url: str or IOBase or StringIO = url
self.context_map: dict = context_map
self.unit_map: dict = unit_map

def __str__(self) -> str:
file_name: str = self.instance_url.split('/')[-1]
if isinstance(self.instance_url, str):
file_name: str = self.instance_url.split('/')[-1]
elif isinstance(self.instance_url, IOBase):
file_name: str = self.instance_url.__str__()
return "{} with {} facts".format(file_name, len(self.facts))


Expand All @@ -279,7 +281,7 @@ def parse_xbrl_url(instance_url: str, cache: HttpCache) -> XbrlInstance:
return parse_xbrl(instance_path, cache, instance_url)


def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None = None) -> XbrlInstance:
def parse_xbrl(instance_path: str or IOBase or StringIO, cache: HttpCache, instance_url: str or None = None) -> XbrlInstance:
"""
    Parses an instance file with its taxonomy
:param instance_path: url to the instance file (on the internet)
Expand All @@ -293,6 +295,7 @@ def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None =
# get the link to the taxonomy schema and parse it
schema_ref: ET.Element = root.find(LINK_NS + 'schemaRef')
schema_uri: str = schema_ref.attrib[XLINK_NS + 'href']

# check if the schema uri is relative or absolute
    # submissions from the SEC normally have their own schema files, whereas submissions from the UK have absolute schemas
if schema_uri.startswith('http'):
Expand All @@ -302,10 +305,12 @@ def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None =
# fetch the taxonomy extension schema from remote by reconstructing the url
schema_url = resolve_uri(instance_url, schema_uri)
taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache)
else:
elif isinstance(instance_path, str):
# try to find the taxonomy extension schema file locally because no full url can be constructed
schema_path = resolve_uri(instance_path, schema_uri)
taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache)
elif isinstance(instance_path, IOBase):
taxonomy: TaxonomySchema = parse_taxonomy(instance_path, cache)

# parse contexts and units
context_dir = _parse_context_elements(root.findall('xbrli:context', NAME_SPACES), root.attrib['ns_map'], taxonomy,
Expand All @@ -331,22 +336,24 @@ def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None =
taxonomy_ns = taxonomy_ns.replace('{', '')
# get the concept object from the taxonomy
tax = taxonomy.get_taxonomy(taxonomy_ns)
if tax is None: tax = _load_common_taxonomy(cache, taxonomy_ns, taxonomy)

concept: Concept = tax.concepts[tax.name_id_map[concept_name]]
context: AbstractContext = context_dir[fact_elem.attrib['contextRef'].strip()]

if 'unitRef' in fact_elem.attrib:
# the fact is a numerical fact
# get the unit
unit: AbstractUnit = unit_dir[fact_elem.attrib['unitRef'].strip()]
decimals_text: str = str(fact_elem.attrib['decimals']).strip()
decimals: int = None if decimals_text.lower() == 'inf' else int(decimals_text)
fact = NumericFact(concept, context, float(fact_elem.text), unit, decimals)
else:
# the fact is probably a text fact
fact = TextFact(concept, context, fact_elem.text.strip())
facts.append(fact)
if tax is None:
tax = _load_common_taxonomy(cache, taxonomy_ns, taxonomy)

if concept_name in tax.name_id_map:
concept: Concept = tax.concepts[tax.name_id_map[concept_name]]
context: AbstractContext = context_dir[fact_elem.attrib['contextRef'].strip()]

if 'unitRef' in fact_elem.attrib:
# the fact is a numerical fact
# get the unit
unit: AbstractUnit = unit_dir[fact_elem.attrib['unitRef'].strip()]
decimals_text: str = str(fact_elem.attrib['decimals']).strip()
decimals: int = None if decimals_text.lower() == 'inf' else int(decimals_text)
fact = NumericFact(concept, context, float(fact_elem.text), unit, decimals)
else:
# the fact is probably a text fact
fact = TextFact(concept, context, fact_elem.text.strip())
facts.append(fact)

return XbrlInstance(instance_url if instance_url else instance_path, taxonomy, facts, context_dir, unit_dir)

Expand All @@ -366,7 +373,7 @@ def parse_ixbrl_url(instance_url: str, cache: HttpCache) -> XbrlInstance:
return parse_ixbrl(instance_path, cache, instance_url)


def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None = None, encoding=None) -> XbrlInstance:
def parse_ixbrl(instance_path: str or IOBase or StringIO, cache: HttpCache, instance_url: str or None = None, encoding=None) -> XbrlInstance:
"""
    Parses an inline XBRL (iXBRL) instance file.
:param instance_path: path to the submission you want to parse
Expand All @@ -385,7 +392,10 @@ def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None
=> in the XBRL-parse function root is ET.Element, here just an instance of ElementTree class!
"""

instance_file = open(instance_path, "r", encoding=encoding)
if isinstance(instance_path, str):
instance_file = open(instance_path, "r", encoding=encoding)
elif isinstance(instance_path, IOBase):
instance_file = instance_path
contents = instance_file.read()
pattern = r'<[ ]*script.*?\/[ ]*script[ ]*>'
contents = re.sub(pattern, '', contents, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))
Expand All @@ -395,6 +405,7 @@ def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None
# get the link to the taxonomy schema and parse it
schema_ref: ET.Element = root.find('.//{}schemaRef'.format(LINK_NS))
schema_uri: str = schema_ref.attrib[XLINK_NS + 'href']

# check if the schema uri is relative or absolute
    # submissions from the SEC normally have their own schema files, whereas submissions from the UK have absolute schemas
if schema_uri.startswith('http'):
Expand All @@ -404,10 +415,12 @@ def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None
# fetch the taxonomy extension schema from remote by reconstructing the url
schema_url = resolve_uri(instance_url, schema_uri)
taxonomy: TaxonomySchema = parse_taxonomy_url(schema_url, cache)
else:
elif isinstance(instance_path, str):
# try to find the taxonomy extension schema file locally because no full url can be constructed
schema_path = resolve_uri(instance_path, schema_uri)
taxonomy: TaxonomySchema = parse_taxonomy(schema_path, cache)
elif isinstance(instance_path, IOBase):
taxonomy: TaxonomySchema = parse_taxonomy(instance_path, cache)

# get all contexts and units
xbrl_resources: ET.Element = root.find('.//ix:resources', ns_map)
Expand Down Expand Up @@ -581,11 +594,12 @@ def _parse_context_elements(context_elements: List[ET.Element], ns_map: dict, ta
if member_tax is None:
# try to subsequently load the taxonomy
member_tax = _load_common_taxonomy(cache, ns_map[member_prefix], taxonomy)
dimension_concept: Concept = dimension_tax.concepts[dimension_tax.name_id_map[dimension_concept_name]]
member_concept: Concept = member_tax.concepts[member_tax.name_id_map[member_concept_name]]
if dimension_concept_name in dimension_tax.name_id_map and member_concept_name in member_tax.name_id_map:
dimension_concept: Concept = dimension_tax.concepts[dimension_tax.name_id_map[dimension_concept_name]]
member_concept: Concept = member_tax.concepts[member_tax.name_id_map[member_concept_name]]

# add the explicit member to the context
context.segments.append(ExplicitMember(dimension_concept, member_concept))
# add the explicit member to the context
context.segments.append(ExplicitMember(dimension_concept, member_concept))

context_dict[context_id] = context
return context_dict
Expand Down Expand Up @@ -676,9 +690,15 @@ def parse_instance_locally(self, path: str, instance_url: str or None = None) ->
instance document was downloaded, the parser can fetch relative imports using this base url
:return:
"""

if path.split('.')[-1] == 'xml' or path.split('.')[-1] == 'xbrl':
return parse_xbrl(path, self.cache, instance_url)
return parse_ixbrl(path, self.cache, instance_url)

def parse_file_obj(self, file_obj, instance_url: str or None = None, is_xbrl: bool = True):
if is_xbrl is True:
return parse_xbrl(file_obj, self.cache, instance_url)
return parse_ixbrl(file_obj, self.cache, instance_url)

def __str__(self) -> str:
return 'XbrlParser with cache dir at {}'.format(self.cache.cache_dir)
15 changes: 9 additions & 6 deletions xbrl/linkbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""
import abc
import os
from io import StringIO, IOBase
from typing import List
import xml.etree.ElementTree as ET
from abc import ABC
Expand Down Expand Up @@ -428,7 +429,7 @@ def parse_linkbase_url(linkbase_url: str, linkbase_type: LinkbaseType, cache: Ht
return parse_linkbase(linkbase_path, linkbase_type, linkbase_url)


def parse_linkbase(linkbase_path: str, linkbase_type: LinkbaseType, linkbase_url: str or None = None) -> Linkbase:
def parse_linkbase(linkbase_path: str or IOBase or StringIO, linkbase_type: LinkbaseType, linkbase_url: str or None = None) -> Linkbase:
"""
Parses a linkbase and returns a Linkbase object containing all
locators, arcs and links of the linkbase in a hierarchical order (a Tree)
Expand All @@ -440,10 +441,11 @@ def parse_linkbase(linkbase_path: str, linkbase_type: LinkbaseType, linkbase_url
the url has to be set so that the parser can connect the locator with concept from the taxonomy
:return:
"""
if linkbase_path.startswith('http'): raise XbrlParseException(
'This function only parses locally saved linkbases. Please use parse_linkbase_url to parse remote linkbases')
if not os.path.exists(linkbase_path):
raise LinkbaseNotFoundException(f"Could not find linkbase at {linkbase_path}")
if isinstance(linkbase_path, str):
if linkbase_path.startswith('http'): raise XbrlParseException(
'This function only parses locally saved linkbases. Please use parse_linkbase_url to parse remote linkbases')
if not os.path.exists(linkbase_path):
raise LinkbaseNotFoundException(f"Could not find linkbase at {linkbase_path}")

root: ET.Element = ET.parse(linkbase_path).getroot()
# store the role refs in a dictionary, with the role uri as key.
Expand Down Expand Up @@ -490,7 +492,8 @@ def parse_linkbase(linkbase_path: str, linkbase_type: LinkbaseType, linkbase_url
if not locator_href.startswith('http'):
# resolve the path
# todo, try to get the URL here, instead of the path!!!
locator_href = resolve_uri(linkbase_url if linkbase_url else linkbase_path, locator_href)
if linkbase_url or isinstance(linkbase_path, str):
locator_href = resolve_uri(linkbase_url if linkbase_url else linkbase_path, locator_href)
locator_map[loc_label] = Locator(locator_href, loc_label)

# Performance: extract the labels in advance. The label name (xlink:label) is the key and the value is
Expand Down
42 changes: 31 additions & 11 deletions xbrl/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,17 @@
"""
import logging
import os
from io import StringIO, IOBase
from typing import List
import xml.etree.ElementTree as ET

from functools import lru_cache
from urllib.parse import unquote

from xbrl import XbrlParseException, TaxonomyNotFound
from xbrl.cache import HttpCache
from xbrl.helper.uri_helper import resolve_uri, compare_uri
from xbrl.helper.xml_parser import parse_file
from xbrl.linkbase import Linkbase, ExtendedLink, LinkbaseType, parse_linkbase, parse_linkbase_url, Label

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -432,7 +435,7 @@ class TaxonomySchema:
This parser will not parse all Schemas and imports, only what is necessary.
"""

def __init__(self, schema_url: str, namespace: str):
def __init__(self, schema_url: str or IOBase or StringIO, namespace: str):
"""
The imports array stores an array of all Schemas that are imported.
The current Taxonomy Schema can override the extended schemas in the following way:
Expand Down Expand Up @@ -472,7 +475,7 @@ def get_taxonomy(self, url: str):
:return either a TaxonomySchema obj or None
:return:
"""
if compare_uri(self.namespace, url) or compare_uri(self.schema_url, url):
if (self.namespace is not None and compare_uri(self.namespace, url)) or (isinstance(self.schema_url, str) and compare_uri(self.schema_url, url)):
return self

for imported_tax in self.imports:
Expand Down Expand Up @@ -513,7 +516,7 @@ def parse_taxonomy_url(schema_url: str, cache: HttpCache) -> TaxonomySchema:
return parse_taxonomy(schema_path, cache, schema_url)


def parse_taxonomy(schema_path: str, cache: HttpCache, schema_url: str or None = None) -> TaxonomySchema:
def parse_taxonomy(schema_path: str or IOBase or StringIO, cache: HttpCache, schema_url: str or None = None) -> TaxonomySchema:
"""
Parses a taxonomy schema file.
    :param schema_path: local path to the schema file, or a file-like object containing the schema
Expand All @@ -522,15 +525,28 @@ def parse_taxonomy(schema_path: str, cache: HttpCache, schema_url: str or None =
imported schemas from the remote location. If this url is None, the script will try to find those resources locally.
:return:
"""
if schema_path.startswith('http'): raise XbrlParseException(
'This function only parses locally saved taxonomies. Please use parse_taxonomy_url to parse remote taxonomy schemas')
if not os.path.exists(schema_path):
raise TaxonomyNotFound(f"Could not find taxonomy schema at {schema_path}")

if isinstance(schema_path, str):
if schema_path.startswith('http'): raise XbrlParseException(
'This function only parses locally saved taxonomies. Please use parse_taxonomy_url to parse remote taxonomy schemas')
if not os.path.exists(schema_path):
raise TaxonomyNotFound(f"Could not find taxonomy schema at {schema_path}")

# Get the local absolute path to the schema file (and download it if it is not yet cached)
root: ET.Element = ET.parse(schema_path).getroot()
if isinstance(schema_path, str):
root: ET.Element = ET.parse(schema_path).getroot()
elif isinstance(schema_path, IOBase):
root: ET.Element = parse_file(schema_path).getroot()
# get the target namespace of the taxonomy
target_ns = root.attrib['targetNamespace']

if 'targetNamespace' in root.attrib:
target_ns = root.attrib['targetNamespace']
else:
schema_ref: ET.Element = root.find(LINK_NS + 'schemaRef')
schema_uri: str = schema_ref.attrib[XLINK_NS + 'href']
ticker = schema_uri.split("-")[0]
target_ns = root.attrib['ns_map'][ticker]

taxonomy: TaxonomySchema = TaxonomySchema(schema_url if schema_url else schema_path, target_ns)

import_elements: List[ET.Element] = root.findall('xsd:import', NAME_SPACES)
Expand All @@ -546,10 +562,12 @@ def parse_taxonomy(schema_path: str, cache: HttpCache, schema_url: str or None =
# fetch the schema file from remote by reconstructing the full url
import_url = resolve_uri(schema_url, import_uri)
taxonomy.imports.append(parse_taxonomy_url(import_url, cache))
else:
elif isinstance(schema_path, str):
# We have to try to fetch the linkbase locally because no full url can be constructed
import_path = resolve_uri(schema_path, import_uri)
taxonomy.imports.append(parse_taxonomy(import_path, cache))
elif isinstance(schema_path, IOBase):
taxonomy.imports.append(parse_taxonomy(schema_path, cache))

role_type_elements: List[ET.Element] = root.findall('xsd:annotation/xsd:appinfo/link:roleType', NAME_SPACES)
# parse ELR's
Expand Down Expand Up @@ -597,10 +615,12 @@ def parse_taxonomy(schema_path: str, cache: HttpCache, schema_url: str or None =
# fetch the linkbase from remote by reconstructing the full URL
linkbase_url = resolve_uri(schema_url, linkbase_uri)
linkbase: Linkbase = parse_linkbase_url(linkbase_url, linkbase_type, cache)
else:
elif isinstance(schema_path, str):
# We have to try to fetch the linkbase locally because no full url can be constructed
linkbase_path = resolve_uri(schema_path, linkbase_uri)
linkbase: Linkbase = parse_linkbase(linkbase_path, linkbase_type)
elif isinstance(schema_path, IOBase):
linkbase: Linkbase = parse_linkbase(schema_path, linkbase_type)

# add the linkbase to the taxonomy
if linkbase_type == LinkbaseType.DEFINITION:
Expand Down