From e2349d98fa81ec62a1b605380e98b63ee5d17080 Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Fri, 22 Oct 2021 14:29:32 +0200 Subject: [PATCH 1/9] CSW harvester OutputSchema config support #258 --- ckanext/spatial/harvesters/csw.py | 5 +++-- ckanext/spatial/lib/csw_client.py | 24 +++++++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ckanext/spatial/harvesters/csw.py b/ckanext/spatial/harvesters/csw.py index c27824be..bc647553 100644 --- a/ckanext/spatial/harvesters/csw.py +++ b/ckanext/spatial/harvesters/csw.py @@ -159,10 +159,11 @@ def fetch_stage(self,harvest_object): self._save_object_error('Error contacting the CSW server: %s' % e, harvest_object) return False - + + namespace = self.source_config.get('output_schema',self.output_schema()) identifier = harvest_object.guid try: - record = self.csw.getrecordbyid([identifier], outputschema=self.output_schema()) + record = self.csw.getrecordbyid([identifier], outputschema=namespace) except Exception as e: self._save_object_error('Error getting the CSW record with GUID %s' % identifier, harvest_object) return False diff --git a/ckanext/spatial/lib/csw_client.py b/ckanext/spatial/lib/csw_client.py index 0ac075e5..c10a3569 100644 --- a/ckanext/spatial/lib/csw_client.py +++ b/ckanext/spatial/lib/csw_client.py @@ -70,6 +70,23 @@ class CswService(OwsService): def __init__(self, endpoint=None): super(CswService, self).__init__(endpoint) self.sortby = SortBy([SortProperty('dc:identifier')]) + # check capabilities + _cap = self.getcapabilities(endpoint)['response'] + self.capabilities=etree.ElementTree(etree.fromstring(_cap)) + + def _get_output_schemas(self, operation): + _cap_ns = self.capabilities.getroot().nsmap + _ows_ns = _cap_ns.get('ows') + if not _ows_ns: + raise CswError('Bad getcapabilities response: OWS namespace not found '+str(_cap_ns)) + _op=self.capabilities.find("//{}Operation[@name='{}']".format(_ows_ns,operation)) + 
_schemas=_op.find("{}Parameter[@name='outputSchema']".format(_ows_ns)) + _values = map(lambda v: v.text, _schemas.findall("{}Value".format(_ows_ns))) + output_schemas={} + for key, value in _schemas.nsmap.items(): + if value in _values: + output_schemas.update({key:value}) + return output_schemas def getrecords(self, qtype=None, keywords=[], typenames="csw:Record", esn="brief", @@ -156,9 +173,14 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief", def getrecordbyid(self, ids=[], esn="full", outputschema="gmd", **kw): from owslib.csw import namespaces csw = self._ows(**kw) + + output_schemas=self._get_output_schemas('GetRecordById') + if not output_schemas[outputschema]: + raise CswError('Output schema not supported by target server: '+str(output_schemas)) + kwa = { "esn": esn, - "outputschema": namespaces[outputschema], + "outputschema": output_schemas[outputschema], } # Ordinary Python version's don't support the metadata argument log.info('Making CSW request: getrecordbyid %r %r', ids, kwa) From 33d9b70e9ded47f3870465876d75850da203ca3e Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Fri, 22 Oct 2021 15:15:36 +0200 Subject: [PATCH 2/9] fix special character formatting --- ckanext/spatial/lib/csw_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/spatial/lib/csw_client.py b/ckanext/spatial/lib/csw_client.py index c10a3569..d65e9e9e 100644 --- a/ckanext/spatial/lib/csw_client.py +++ b/ckanext/spatial/lib/csw_client.py @@ -79,9 +79,9 @@ def _get_output_schemas(self, operation): _ows_ns = _cap_ns.get('ows') if not _ows_ns: raise CswError('Bad getcapabilities response: OWS namespace not found '+str(_cap_ns)) - _op=self.capabilities.find("//{}Operation[@name='{}']".format(_ows_ns,operation)) - _schemas=_op.find("{}Parameter[@name='outputSchema']".format(_ows_ns)) - _values = map(lambda v: v.text, _schemas.findall("{}Value".format(_ows_ns))) + 
_op=self.capabilities.find("//{{{}}}Operation[@name='{}']".format(_ows_ns,operation)) + _schemas=_op.find("{{{}}}Parameter[@name='outputSchema']".format(_ows_ns)) + _values = map(lambda v: v.text, _schemas.findall("{{{}}}Value".format(_ows_ns))) output_schemas={} for key, value in _schemas.nsmap.items(): if value in _values: From 855ab7dda4530ffc4114d6f3dd21d36f8051c0a3 Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Fri, 22 Oct 2021 18:31:07 +0200 Subject: [PATCH 3/9] preload config to read namespace if necessary. Generalize metadata namespace and tag name so we can now harvest any kind of metadata (if validation is provided or ignored) #209 #210 #219 #258 --- ckanext/spatial/harvesters/csw.py | 2 ++ ckanext/spatial/lib/csw_client.py | 13 ++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ckanext/spatial/harvesters/csw.py b/ckanext/spatial/harvesters/csw.py index bc647553..2b218624 100644 --- a/ckanext/spatial/harvesters/csw.py +++ b/ckanext/spatial/harvesters/csw.py @@ -160,6 +160,8 @@ def fetch_stage(self,harvest_object): harvest_object) return False + if not self.source_config: + self._set_source_config(harvest_object.source.config) namespace = self.source_config.get('output_schema',self.output_schema()) identifier = harvest_object.guid try: diff --git a/ckanext/spatial/lib/csw_client.py b/ckanext/spatial/lib/csw_client.py index d65e9e9e..cb84845f 100644 --- a/ckanext/spatial/lib/csw_client.py +++ b/ckanext/spatial/lib/csw_client.py @@ -190,14 +190,17 @@ def getrecordbyid(self, ids=[], esn="full", outputschema="gmd", **kw): csw.exceptionreport.exceptions #log.error(err) raise CswError(err) - if not csw.records: + elif csw.records: + record = self._xmd(list(csw.records.values())[0]) + elif csw.response: + record = self._xmd(etree.fromstring(csw.response)) + else: return - record = self._xmd(list(csw.records.values())[0]) ## strip off the enclosing results container, we only want the metadata - #md = 
csw._exml.find("/gmd:MD_Metadata")#, namespaces=namespaces) - # Ordinary Python version's don't support the metadata argument - md = csw._exml.find("/{http://www.isotc211.org/2005/gmd}MD_Metadata") + # '/{schema}*' expression should be safe enough and is able to match the + # desired schema followed by both MD_Metadata or MI_Metadata (iso19115[-2]) + md = csw._exml.find("/{{{schema}}}*".format(schema=output_schemas[outputschema])) mdtree = etree.ElementTree(md) try: record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=str) From bf2d1d6223e9a5a22ff56a08d987263b778ae0c0 Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Sat, 23 Oct 2021 00:07:20 +0200 Subject: [PATCH 4/9] lets reload config at each run (the config can be different for each run) --- ckanext/spatial/harvesters/csw.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ckanext/spatial/harvesters/csw.py b/ckanext/spatial/harvesters/csw.py index 2b218624..d698253d 100644 --- a/ckanext/spatial/harvesters/csw.py +++ b/ckanext/spatial/harvesters/csw.py @@ -160,8 +160,9 @@ def fetch_stage(self,harvest_object): harvest_object) return False - if not self.source_config: - self._set_source_config(harvest_object.source.config) + # load config + self._set_source_config(harvest_object.source.config) + # get output_schema from config namespace = self.source_config.get('output_schema',self.output_schema()) identifier = harvest_object.guid try: From fbd5d1c3bc34ddba8a9ed225a13d0dfb5a645a8b Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Sat, 23 Oct 2021 11:06:00 +0200 Subject: [PATCH 5/9] improve checks and comments --- ckanext/spatial/lib/csw_client.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ckanext/spatial/lib/csw_client.py b/ckanext/spatial/lib/csw_client.py index cb84845f..1d8b181f 100644 --- a/ckanext/spatial/lib/csw_client.py +++ b/ckanext/spatial/lib/csw_client.py @@ -171,12 +171,13 @@ def getidentifiers(self, qtype=None, 
typenames="csw:Record", esn="brief", kwa["startposition"] = startposition def getrecordbyid(self, ids=[], esn="full", outputschema="gmd", **kw): - from owslib.csw import namespaces + csw = self._ows(**kw) + # fetch target csw server capabilities for requested output schema output_schemas=self._get_output_schemas('GetRecordById') - if not output_schemas[outputschema]: - raise CswError('Output schema not supported by target server: '+str(output_schemas)) + if not output_schemas.get(outputschema): + raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas)) kwa = { "esn": esn, From 9d3637884879db0208a5a215d4383bd6461a409e Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Sat, 23 Oct 2021 22:41:40 +0200 Subject: [PATCH 6/9] namespace from getcapabilities also for getrecords --- ckanext/spatial/lib/csw_client.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ckanext/spatial/lib/csw_client.py b/ckanext/spatial/lib/csw_client.py index 1d8b181f..c6302a56 100644 --- a/ckanext/spatial/lib/csw_client.py +++ b/ckanext/spatial/lib/csw_client.py @@ -91,10 +91,15 @@ def _get_output_schemas(self, operation): def getrecords(self, qtype=None, keywords=[], typenames="csw:Record", esn="brief", skip=0, count=10, outputschema="gmd", **kw): - from owslib.csw import namespaces + constraints = [] csw = self._ows(**kw) + # fetch target csw server capabilities for requested output schema + output_schemas=self._get_output_schemas('GetRecords') + if not output_schemas.get(outputschema): + raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas)) + if qtype is not None: constraints.append(PropertyIsEqualTo("dc:type", qtype)) @@ -104,7 +109,7 @@ def getrecords(self, qtype=None, keywords=[], "esn": esn, "startposition": skip, "maxrecords": count, - "outputschema": namespaces[outputschema], + "outputschema": output_schemas[outputschema], "sortby": self.sortby } log.info('Making CSW 
request: getrecords2 %r', kwa) @@ -119,10 +124,15 @@ def getrecords(self, qtype=None, keywords=[], def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief", keywords=[], limit=None, page=10, outputschema="gmd", startposition=0, cql=None, **kw): - from owslib.csw import namespaces + constraints = [] csw = self._ows(**kw) + # fetch target csw server capabilities for requested output schema + output_schemas=self._get_output_schemas('GetRecords') + if not output_schemas.get(outputschema): + raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas)) + if qtype is not None: constraints.append(PropertyIsEqualTo("dc:type", qtype)) @@ -132,7 +142,7 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief", "esn": esn, "startposition": startposition, "maxrecords": page, - "outputschema": namespaces[outputschema], + "outputschema": output_schemas[outputschema], "cql": cql, "sortby": self.sortby } @@ -146,7 +156,6 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief", err = 'Error getting identifiers: %r' % \ csw.exceptionreport.exceptions #log.error(err) - raise CswError(err) if matches == 0: matches = csw.results['matches'] From 4fb17ccad638b9963ac29e5c7c1bba2a68d50773 Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Thu, 28 Oct 2021 18:52:45 +0200 Subject: [PATCH 7/9] documentation, pep8 and improve are requested --- ckanext/spatial/lib/csw_client.py | 26 +++++++++++++++----------- doc/harvesters.rst | 2 ++ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/ckanext/spatial/lib/csw_client.py b/ckanext/spatial/lib/csw_client.py index c6302a56..8ac6abc4 100644 --- a/ckanext/spatial/lib/csw_client.py +++ b/ckanext/spatial/lib/csw_client.py @@ -72,20 +72,24 @@ def __init__(self, endpoint=None): self.sortby = SortBy([SortProperty('dc:identifier')]) # check capabilities _cap = self.getcapabilities(endpoint)['response'] - 
self.capabilities=etree.ElementTree(etree.fromstring(_cap)) + self.capabilities = etree.ElementTree(etree.fromstring(_cap)) + self.output_schemas = { + 'GetRecords': self._get_output_schemas('GetRecords'), + 'GetRecordById': self._get_output_schemas('GetRecordById'), + } def _get_output_schemas(self, operation): _cap_ns = self.capabilities.getroot().nsmap _ows_ns = _cap_ns.get('ows') if not _ows_ns: - raise CswError('Bad getcapabilities response: OWS namespace not found '+str(_cap_ns)) - _op=self.capabilities.find("//{{{}}}Operation[@name='{}']".format(_ows_ns,operation)) - _schemas=_op.find("{{{}}}Parameter[@name='outputSchema']".format(_ows_ns)) + raise CswError('Bad getcapabilities response: OWS namespace not found ' + str(_cap_ns)) + _op = self.capabilities.find("//{{{}}}Operation[@name='{}']".format(_ows_ns, operation)) + _schemas = _op.find("{{{}}}Parameter[@name='outputSchema']".format(_ows_ns)) _values = map(lambda v: v.text, _schemas.findall("{{{}}}Value".format(_ows_ns))) - output_schemas={} + output_schemas = {} for key, value in _schemas.nsmap.items(): if value in _values: - output_schemas.update({key:value}) + output_schemas.update({key : value}) return output_schemas def getrecords(self, qtype=None, keywords=[], @@ -95,8 +99,8 @@ def getrecords(self, qtype=None, keywords=[], constraints = [] csw = self._ows(**kw) - # fetch target csw server capabilities for requested output schema - output_schemas=self._get_output_schemas('GetRecords') + # check target csw server capabilities for requested output schema + output_schemas = self.output_schemas['GetRecords'] if not output_schemas.get(outputschema): raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas)) @@ -128,8 +132,8 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief", constraints = [] csw = self._ows(**kw) - # fetch target csw server capabilities for requested output schema - output_schemas=self._get_output_schemas('GetRecords') + # check 
target csw server capabilities for requested output schema + output_schemas = self.output_schemas['GetRecords'] if not output_schemas.get(outputschema): raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas)) @@ -184,7 +188,7 @@ def getrecordbyid(self, ids=[], esn="full", outputschema="gmd", **kw): csw = self._ows(**kw) # fetch target csw server capabilities for requested output schema - output_schemas=self._get_output_schemas('GetRecordById') + output_schemas=output_schemas = self.output_schemas['GetRecordById'] if not output_schemas.get(outputschema): raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas)) diff --git a/doc/harvesters.rst b/doc/harvesters.rst index c785b036..dcc24a50 100644 --- a/doc/harvesters.rst +++ b/doc/harvesters.rst @@ -75,7 +75,9 @@ The currently supported configuration options are: and spaces replaced with dashes. Setting this option to False gives the same effect as leaving it unset. * ``validator_profiles``: A list of string that specifies a list of validators that will be applied to the current harvester, overriding the global ones defined by the 'ckan.spatial.validator.profiles' option. +* ``output_schema``: the namespace prefix (for example ``gmd``) of the schema to request as outputSchema_ in CSW requests; it has to be one of the output schemas advertised in the server's GetCapabilities response +.. 
_outputSchema: https://docs.opengeospatial.org/is/12-176r7/12-176r7.html#72 Customizing the harvesters -------------------------- From f91f516d12c15c155a94dd0fd9c344e00c1ab96d Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Thu, 28 Oct 2021 21:35:10 +0200 Subject: [PATCH 8/9] fallback to default schema --- ckanext/spatial/harvesters/csw.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ckanext/spatial/harvesters/csw.py b/ckanext/spatial/harvesters/csw.py index d698253d..70aefead 100644 --- a/ckanext/spatial/harvesters/csw.py +++ b/ckanext/spatial/harvesters/csw.py @@ -163,14 +163,22 @@ def fetch_stage(self,harvest_object): # load config self._set_source_config(harvest_object.source.config) # get output_schema from config - namespace = self.source_config.get('output_schema',self.output_schema()) + output_schema = self.source_config.get('output_schema',self.output_schema()) identifier = harvest_object.guid try: - record = self.csw.getrecordbyid([identifier], outputschema=namespace) + record = self.csw.getrecordbyid([identifier], outputschema=output_schema) except Exception as e: - self._save_object_error('Error getting the CSW record with GUID %s' % identifier, harvest_object) - return False - + try: + log.warn('Unable to fetch GUID {} with output schema: {}'.format(identifier, output_schema)) + if output_schema == self.output_schema(): + raise e + log.info('Fetching GUID {} with output schema: {}'.format(identifier, self.output_schema())) + # retry with default output schema + record = self.csw.getrecordbyid([identifier], outputschema=self.output_schema()) + except Exception as e: + self._save_object_error('Error getting the CSW record with GUID {}'.format(identifier), harvest_object) + return False + if record is None: self._save_object_error('Empty record for GUID %s' % identifier, harvest_object) From b82eb89244ddfc640be8811572e730c2ea3b94a9 Mon Sep 17 00:00:00 2001 From: carlo cancellieri Date: Thu, 28 Oct 2021 
21:37:07 +0200 Subject: [PATCH 9/9] fallback to default schema --- ckanext/spatial/harvesters/csw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/spatial/harvesters/csw.py b/ckanext/spatial/harvesters/csw.py index 70aefead..dd1bdf05 100644 --- a/ckanext/spatial/harvesters/csw.py +++ b/ckanext/spatial/harvesters/csw.py @@ -163,7 +163,7 @@ def fetch_stage(self,harvest_object): # load config self._set_source_config(harvest_object.source.config) # get output_schema from config - output_schema = self.source_config.get('output_schema',self.output_schema()) + output_schema = self.source_config.get('output_schema', self.output_schema()) identifier = harvest_object.guid try: record = self.csw.getrecordbyid([identifier], outputschema=output_schema)