From f02e61b4bad762703ea443c363f172a16d3bac52 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Fri, 3 Jan 2025 12:02:06 -0500 Subject: [PATCH] Upgrade validation methods (#1911) * reconcile io path validator differences * make io and path output the same * update tests for io validation * add test that io and path output is same * update tests for io output with status * add validate helper function and fix status tracking * update arguments and status checks * fix test comparing io and path outputs * update error message for io inputs * add tests to compare all io and path outputs * remove script for testing * add initial json export option * update validator inputs and outputs, remove hdf5io references * update tests for non status output * add tests for json validator output * move pynwb.validate into validation module * update json report * add validation entry point * separate cli and validation function files * update validation tutorial * move get_backend to pynwb init * update example validation for new io behavior * update ruff ignores * fix test comments * update CHANGELOG * add tests for _get_backend * update backend imports for optional zarr * fix test name * update test filename * close io after validation * fix test assertion * add condition for ros3 validation * Update CHANGELOG.md * Apply suggestions from code review Co-authored-by: Ryan Ly * update cli args and docs * fix formatting * update json-file-path argname * update extension tutorial * Apply suggestions from code review Co-authored-by: Ryan Ly * warn for positional args in validation --------- Co-authored-by: Ryan Ly --- CHANGELOG.md | 10 + docs/gallery/general/extensions.py | 5 +- docs/source/validation.rst | 28 ++- pyproject.toml | 5 +- src/pynwb/__init__.py | 23 ++- src/pynwb/testing/testh5io.py | 8 +- src/pynwb/validate.py | 252 ----------------------- src/pynwb/validation.py | 198 ++++++++++++++++++ src/pynwb/validation_cli.py | 84 ++++++++ test.py | 14 +- tests/integration/ros3/test_ros3.py | 7 +- tests/integration/utils/test_io_utils.py | 27 ++- tests/validation/test_validate.py | 146 +++++++------ 13 files changed, 465 insertions(+), 342 deletions(-) delete mode 100644 src/pynwb/validate.py create mode 100644 src/pynwb/validation.py create mode 100644 src/pynwb/validation_cli.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 503df39e6..b416b5025 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ ## PyNWB 3.0.0 (Upcoming) +### Breaking changes +- The validation methods have been updated with multiple breaking changes. @stephprince [#1911](https://github.com/NeurodataWithoutBorders/pynwb/pull/1911) + - The behavior of `pynwb.validate(io=...)` now matches the behavior of `pynwb.validate(path=...)`. In previous pynwb versions, `pynwb.validate(io=...)` did not use the cached namespaces during validation. To obtain the same behavior as in previous versions, you can update the function call to `pynwb.validate(io=..., use_cached_namespaces=False)` + - `pynwb.validate` will return only a list of validation errors instead of a tuple: (list of validation_errors, status code) + - The validate module has been renamed to `validation.py`. The validate method can be + imported using `import pynwb; pynwb.validate` or `from pynwb import validate` + ### Deprecations - The following deprecated classes will now raise errors when creating new instances of these classes: ``ClusteringWaveforms``, ``Clustering``, ``SweepTable``. Reading files using these data types will continue to be supported. - The following methods and arguments have been deprecated: @@ -30,6 +37,9 @@ - Added support for `model_number`, `model_name`, and `serial_number` fields to `Device`. @stephprince [#1997](https://github.com/NeurodataWithoutBorders/pynwb/pull/1997) - Deprecated `EventWaveform` neurodata type. @rly [#1940](https://github.com/NeurodataWithoutBorders/pynwb/pull/1940) - Deprecated `ImageMaskSeries` neurodata type. @rly [#1941](https://github.com/NeurodataWithoutBorders/pynwb/pull/1941) +- Added enhancements to the validation CLI. @stephprince [#1911](https://github.com/NeurodataWithoutBorders/pynwb/pull/1911) + - Added an entry point for the validation module. You can now use `pynwb-validate "file.nwb"`. + - Added the `--json-outpath-path` CLI argument to output validation results in a machine readable format. - Removed python 3.8 support, added python 3.13 support. @stephprince [#2007](https://github.com/NeurodataWithoutBorders/pynwb/pull/2007) - Added warnings when using positional arguments in `Container` constructor methods. Positional arguments will raise errors in the next major release. @stephprince [#1972](https://github.com/NeurodataWithoutBorders/pynwb/pull/1972) diff --git a/docs/gallery/general/extensions.py b/docs/gallery/general/extensions.py index ddf9159c7..3e41e7a91 100644 --- a/docs/gallery/general/extensions.py +++ b/docs/gallery/general/extensions.py @@ -37,8 +37,7 @@ ns_builder = NWBNamespaceBuilder( "Extension for use in my Lab", "mylab", version="0.1.0" ) - -ns_builder.include_type("ElectricalSeries", namespace="core") +ns_builder.include_namespace("core") ext = NWBGroupSpec( "A custom ElectricalSeries for my lab", @@ -264,7 +263,7 @@ def __init__(self, **kwargs): ext_source = name + ".extensions.yaml" ns_builder = NWBNamespaceBuilder(name + " extensions", name, version="0.1.0") -ns_builder.include_type("NWBDataInterface", namespace="core") +ns_builder.include_namespace("core") potato = NWBGroupSpec( neurodata_type_def="Potato", diff --git a/docs/source/validation.rst b/docs/source/validation.rst index 73c138127..6e3ac7d2d 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -3,12 +3,21 @@ Validating NWB files ==================== +.. note:: + + The pynwb validation CLI checks for structural compliance of NWB files with the NWB schema. + It is recommended to use the `NWBInspector CLI `_ + for more comprehensive validation of both structural compliance with the NWB schema and + compliance of data with NWB best practices. The NWBInspector runs both PyNWB validation as + described here and additional data checks. + + Validating NWB files is handled by a command-line tool available in :py:mod:`~pynwb`. The validator can be invoked like so: .. code-block:: bash - python -m pynwb.validate test.nwb + pynwb-validate test.nwb If the file contains no NWB extensions, then this command will validate the file ``test.nwb`` against the *core* NWB specification. On success, the output will be: @@ -29,7 +38,7 @@ within the ``test.nwb`` file. .. code-block:: bash - python -m pynwb.validate -n ndx-my-extension test.nwb + pynwb-validate -n ndx-my-extension test.nwb To validate against the version of the **core** NWB specification that is included with the installed version of PyNWB, use the ``--no-cached-namespace`` flag. This can be useful in validating files against newer or older versions @@ -37,27 +46,28 @@ of the **core** NWB specification that are installed with newer or older version .. code-block:: bash - python -m pynwb.validate --no-cached-namespace test.nwb + pynwb-validate --no-cached-namespace test.nwb .. Last updated 8/13/2021 .. code-block:: text - $python -m pynwb.validate --help - usage: validate.py [-h] [-n NS] [-lns] [--cached-namespace | --no-cached-namespace] paths [paths ...] + $pynwb-validate --help + usage: pynwb-validate [-h] [-lns] [-n NS] [--json-output-path JSON_OUTPUT_PATH] [--no-cached-namespace] paths [paths ...] Validate an NWB file positional arguments: paths NWB file paths - optional arguments: + options: -h, --help show this help message and exit - -n NS, --ns NS the namespace to validate against -lns, --list-namespaces List the available namespaces and exit. - --cached-namespace Use the cached namespace (default). + -n NS, --ns NS the namespace to validate against + --json-output-path JSON_OUTPUT_PATH + Write json output to this location. --no-cached-namespace - Don't use the cached namespace. + Use the namespaces installed by PyNWB (true) or use the cached namespaces (false; default). If --ns is not specified, validate against all namespaces in the NWB file. diff --git a/pyproject.toml b/pyproject.toml index 122fbd05b..755d3384e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,9 @@ dynamic = ["version"] # the build backend will compute the version dynamically f "Homepage" = "https://github.com/NeurodataWithoutBorders/pynwb" "Bug Tracker" = "https://github.com/NeurodataWithoutBorders/pynwb/issues" +[project.scripts] +pynwb-validate = "pynwb.validation_cli:validation_cli" + [tool.hatch.version] source = "vcs" @@ -111,7 +114,7 @@ line-length = 120 "docs/gallery/*" = ["E402", "T201"] "src/*/__init__.py" = ["F401"] "src/pynwb/_version.py" = ["T201"] -"src/pynwb/validate.py" = ["T201"] +"src/pynwb/validation_cli.py" = ["T201"] "scripts/*" = ["T201"] # "test_gallery.py" = ["T201"] # Uncomment when test_gallery.py is created diff --git a/src/pynwb/__init__.py b/src/pynwb/__init__.py index 5f8fec157..409c28c34 100644 --- a/src/pynwb/__init__.py +++ b/src/pynwb/__init__.py @@ -23,7 +23,7 @@ CORE_NAMESPACE = 'core' from .spec import NWBDatasetSpec, NWBGroupSpec, NWBNamespace # noqa E402 -from .validate import validate # noqa: F401, E402 +from .validation import validate # noqa: F401, E402 try: # see https://effigies.gitlab.io/posts/python-packaging-2023/ @@ -344,6 +344,27 @@ def get_sum(self, a, b): return __TYPE_MAP.get_dt_container_cls(neurodata_type, namespace) +@docval({'name': 'path', 'type': str, 'doc': 'Path to the NWB file which can be an HDF5 file or a Zarr store.'}, + {"name": "method", "type": str, "doc": "the method to use when opening the file", 'default': None}, + is_method=False) +def _get_backend(path: str, method: str = None): + if method == "ros3": + return NWBHDF5IO # TODO - add additional conditions for other streaming methods + + try: + from hdmf_zarr import NWBZarrIO + backend_io_classes = [NWBHDF5IO, NWBZarrIO] + except ImportError: + backend_io_classes = [NWBHDF5IO] + + backend_options = [b for b in backend_io_classes if b.can_read(path=path)] + if len(backend_options) == 0: + raise ValueError(f"Could not find an IO to read the file '{path}'. If you are trying to read " + f"a Zarr file, make sure you have hdmf-zarr installed.") + else: + return backend_options[0] + + class NWBHDF5IO(_HDF5IO): @staticmethod diff --git a/src/pynwb/testing/testh5io.py b/src/pynwb/testing/testh5io.py index 7234e79f5..265cd5944 100644 --- a/src/pynwb/testing/testh5io.py +++ b/src/pynwb/testing/testh5io.py @@ -163,12 +163,12 @@ def getContainer(self, nwbfile): def validate(self): """ Validate the created files """ if os.path.exists(self.filename): - errors, _ = pynwb_validate(paths=[self.filename]) + errors = pynwb_validate(path=self.filename) if errors: raise Exception("\n".join(errors)) if os.path.exists(self.export_filename): - errors, _ = pynwb_validate(paths=[self.export_filename]) + errors = pynwb_validate(path=self.export_filename) if errors: raise Exception("\n".join(errors)) @@ -366,11 +366,11 @@ def roundtripExportContainer(self, cache_spec=True): def validate(self): """Validate the created files.""" if os.path.exists(self.filename): - errors, _ = pynwb_validate(paths=[self.filename]) + errors = pynwb_validate(path=self.filename) if errors: raise Exception("\n".join(errors)) if os.path.exists(self.export_filename): - errors, _ = pynwb_validate(paths=[self.export_filename]) + errors = pynwb_validate(path=self.export_filename) if errors: raise Exception("\n".join(errors)) diff --git a/src/pynwb/validate.py b/src/pynwb/validate.py deleted file mode 100644 index 880f860a6..000000000 --- a/src/pynwb/validate.py +++ /dev/null @@ -1,252 +0,0 @@ -"""Command line tool to Validate an NWB file against a namespace.""" -import sys -from argparse import ArgumentParser -from typing import Tuple, List, Dict, Optional - -from hdmf.spec import NamespaceCatalog -from hdmf.build import BuildManager -from hdmf.build import TypeMap as TypeMap -from hdmf.utils import docval, getargs -from hdmf.backends.io import HDMFIO -from hdmf.validate import ValidatorMap - -from pynwb import CORE_NAMESPACE -from pynwb.spec import NWBDatasetSpec, NWBGroupSpec, NWBNamespace - - -def _print_errors(validation_errors: list): - if validation_errors: - print(" - found the following errors:", file=sys.stderr) - for err in validation_errors: - print(str(err), file=sys.stderr) - else: - print(" - no errors found.") - - -def _validate_helper(io: HDMFIO, namespace: str = CORE_NAMESPACE) -> list: - builder = io.read_builder() - validator = ValidatorMap(io.manager.namespace_catalog.get_namespace(name=namespace)) - return validator.validate(builder) - - -def get_cached_namespaces_to_validate( - path: str, driver: Optional[str] = None, aws_region: Optional[str] = None, -) -> Tuple[List[str], BuildManager, Dict[str, str]]: - """ - Determine the most specific namespace(s) that are cached in the given NWBFile that can be used for validation. - - Example - ------- - The following example illustrates how we can use this function to validate against namespaces - cached in a file. This is useful, e.g., when a file was created using an extension - - .. code-block:: python - - from pynwb import validate - from pynwb.validate import get_cached_namespaces_to_validate - path = "my_nwb_file.nwb" - validate_namespaces, manager, cached_namespaces = get_cached_namespaces_to_validate(path) - with NWBHDF5IO(path, "r", manager=manager) as reader: - errors = [] - for ns in validate_namespaces: - errors += validate(io=reader, namespace=ns) - - :param path: Path for the NWB file - :return: Tuple with: - - List of strings with the most specific namespace(s) to use for validation. - - BuildManager object for opening the file for validation - - Dict with the full result from NWBHDF5IO.load_namespaces - """ - from . import NWBHDF5IO # TODO: modularize to avoid circular import - - catalog = NamespaceCatalog( - group_spec_cls=NWBGroupSpec, dataset_spec_cls=NWBDatasetSpec, spec_namespace_cls=NWBNamespace - ) - namespace_dependencies = NWBHDF5IO.load_namespaces( - namespace_catalog=catalog, - path=path, - driver=driver, - aws_region=aws_region - ) - - # Determine which namespaces are the most specific (i.e. extensions) and validate against those - candidate_namespaces = set(namespace_dependencies.keys()) - for namespace_dependency in namespace_dependencies: - candidate_namespaces -= namespace_dependencies[namespace_dependency].keys() - - # TODO: remove this workaround for issue https://github.com/NeurodataWithoutBorders/pynwb/issues/1357 - candidate_namespaces.discard("hdmf-experimental") # remove validation of hdmf-experimental for now - cached_namespaces = sorted(candidate_namespaces) - - if len(cached_namespaces) > 0: - type_map = TypeMap(namespaces=catalog) - manager = BuildManager(type_map=type_map) - else: - manager = None - - return cached_namespaces, manager, namespace_dependencies - - -@docval( - { - "name": "io", - "type": HDMFIO, - "doc": "An open IO to an NWB file.", - "default": None, - }, # For back-compatability - { - "name": "namespace", - "type": str, - "doc": "A specific namespace to validate against.", - "default": None, - }, # Argument order is for back-compatability - { - "name": "paths", - "type": list, - "doc": "List of NWB file paths.", - "default": None, - }, - { - "name": "use_cached_namespaces", - "type": bool, - "doc": "Whether to use namespaces cached within the file for validation.", - "default": True, - }, - { - "name": "verbose", - "type": bool, - "doc": "Whether or not to print messages to stdout.", - "default": False, - }, - { - "name": "driver", - "type": str, - "doc": "Driver for h5py to use when opening the HDF5 file.", - "default": None, - }, - returns="Validation errors in the file.", - rtype=(list, (list, bool)), - is_method=False, -) -def validate(**kwargs): - """Validate NWB file(s) against a namespace or its cached namespaces. - - NOTE: If an io object is provided and no namespace name is specified, then the file will be validated - against the core namespace, even if use_cached_namespaces is True. - """ - from . import NWBHDF5IO # TODO: modularize to avoid circular import - - io, paths, use_cached_namespaces, namespace, verbose, driver = getargs( - "io", "paths", "use_cached_namespaces", "namespace", "verbose", "driver", kwargs - ) - assert io != paths, "Both 'io' and 'paths' were specified! Please choose only one." - - if io is not None: - validation_errors = _validate_helper(io=io, namespace=namespace or CORE_NAMESPACE) - return validation_errors - - status = 0 - validation_errors = list() - for path in paths: - namespaces_to_validate = [] - namespace_message = "PyNWB namespace information" - io_kwargs = dict(path=path, mode="r", driver=driver) - - if use_cached_namespaces: - cached_namespaces, manager, namespace_dependencies = get_cached_namespaces_to_validate( - path=path, driver=driver - ) - io_kwargs.update(manager=manager) - - if any(cached_namespaces): - namespaces_to_validate = cached_namespaces - namespace_message = "cached namespace information" - else: - namespaces_to_validate = [CORE_NAMESPACE] - if verbose: - print( - f"The file {path} has no cached namespace information. Falling back to {namespace_message}.", - file=sys.stderr, - ) - else: - io_kwargs.update(load_namespaces=False) - namespaces_to_validate = [CORE_NAMESPACE] - - if namespace is not None: - if namespace in namespaces_to_validate: - namespaces_to_validate = [namespace] - elif use_cached_namespaces and namespace in namespace_dependencies: # validating against a dependency - for namespace_dependency in namespace_dependencies: - if namespace in namespace_dependencies[namespace_dependency]: - status = 1 - print( - f"The namespace '{namespace}' is included by the namespace " - f"'{namespace_dependency}'. Please validate against that namespace instead.", - file=sys.stderr, - ) - else: - status = 1 - print( - f"The namespace '{namespace}' could not be found in {namespace_message} as only " - f"{namespaces_to_validate} is present.", - file=sys.stderr, - ) - - if status == 1: - continue - - with NWBHDF5IO(**io_kwargs) as io: - for validation_namespace in namespaces_to_validate: - if verbose: - print(f"Validating {path} against {namespace_message} using namespace '{validation_namespace}'.") - validation_errors += _validate_helper(io=io, namespace=validation_namespace) - return validation_errors, status - - -def validate_cli(): - """CLI wrapper around pynwb.validate.""" - parser = ArgumentParser( - description="Validate an NWB file", - epilog="If --ns is not specified, validate against all namespaces in the NWB file.", - ) - - # Special arg specific to CLI - parser.add_argument( - "-lns", - "--list-namespaces", - dest="list_namespaces", - action="store_true", - help="List the available namespaces and exit.", - ) - - # Common args to the API validate - parser.add_argument("paths", type=str, nargs="+", help="NWB file paths") - parser.add_argument("-n", "--ns", type=str, help="the namespace to validate against") - feature_parser = parser.add_mutually_exclusive_group(required=False) - feature_parser.add_argument( - "--no-cached-namespace", - dest="no_cached_namespace", - action="store_true", - help="Use the PyNWB loaded namespace (true) or use the cached namespace (false; default).", - ) - parser.set_defaults(no_cached_namespace=False) - args = parser.parse_args() - status = 0 - - if args.list_namespaces: - for path in args.paths: - cached_namespaces, _, _ = get_cached_namespaces_to_validate(path=path) - print("\n".join(cached_namespaces)) - else: - validation_errors, validation_status = validate( - paths=args.paths, use_cached_namespaces=not args.no_cached_namespace, namespace=args.ns, verbose=True - ) - if not validation_status: - _print_errors(validation_errors=validation_errors) - status = status or validation_status or (validation_errors is not None and len(validation_errors) > 0) - - sys.exit(status) - - -if __name__ == "__main__": # pragma: no cover - validate_cli() diff --git a/src/pynwb/validation.py b/src/pynwb/validation.py new file mode 100644 index 000000000..d2fb5cb17 --- /dev/null +++ b/src/pynwb/validation.py @@ -0,0 +1,198 @@ +"""Module to validate an NWB file against a namespace.""" +from typing import Tuple, List, Dict, Optional +from pathlib import Path +from warnings import warn + +from hdmf.spec import NamespaceCatalog +from hdmf.build import BuildManager, TypeMap +from hdmf.utils import docval, getargs, AllowPositional +from hdmf.backends.io import HDMFIO +from hdmf.validate import ValidatorMap + +from pynwb import CORE_NAMESPACE +from pynwb.spec import NWBDatasetSpec, NWBGroupSpec, NWBNamespace + + +def _validate_helper(io: HDMFIO, namespace: str = CORE_NAMESPACE) -> list: + builder = io.read_builder() + validator = ValidatorMap(io.manager.namespace_catalog.get_namespace(name=namespace)) + return validator.validate(builder) + + +def get_cached_namespaces_to_validate(path: Optional[str] = None, + driver: Optional[str] = None, + aws_region: Optional[str] = None, + io: Optional[HDMFIO] = None +) -> Tuple[List[str], BuildManager, Dict[str, str]]: + """ + Determine the most specific namespace(s) that are cached in the given NWBFile that can be used for validation. + + Example + ------- + The following example illustrates how we can use this function to validate against namespaces + cached in a file. This is useful, e.g., when a file was created using an extension + + .. code-block:: python + + from pynwb import validate + from pynwb.validation import get_cached_namespaces_to_validate + path = "my_nwb_file.nwb" + validate_namespaces, manager, cached_namespaces = get_cached_namespaces_to_validate(path) + with NWBHDF5IO(path, "r", manager=manager) as reader: + errors = [] + for ns in validate_namespaces: + errors += validate(io=reader, namespace=ns) + + :param path: Path for the NWB file + :return: Tuple with: + - List of strings with the most specific namespace(s) to use for validation. + - BuildManager object for opening the file for validation + - Dict with the full result from NWBHDF5IO.load_namespaces + """ + + catalog = NamespaceCatalog( + group_spec_cls=NWBGroupSpec, dataset_spec_cls=NWBDatasetSpec, spec_namespace_cls=NWBNamespace + ) + + if io is not None: + # TODO update HDF5IO to have .file property to make consistent with ZarrIO + # then update input arguments here + namespace_dependencies = io.load_namespaces(namespace_catalog=catalog, + file=io._file) + else: + from pynwb import _get_backend + backend_io = _get_backend(path, method=driver) + namespace_dependencies = backend_io.load_namespaces(namespace_catalog=catalog, + path=path, + driver=driver, + aws_region=aws_region) + + # Determine which namespaces are the most specific (i.e. extensions) and validate against those + candidate_namespaces = set(namespace_dependencies.keys()) + for namespace_dependency in namespace_dependencies: + candidate_namespaces -= namespace_dependencies[namespace_dependency].keys() + + # TODO: remove this workaround for issue https://github.com/NeurodataWithoutBorders/pynwb/issues/1357 + candidate_namespaces.discard("hdmf-experimental") # remove validation of hdmf-experimental for now + cached_namespaces = sorted(candidate_namespaces) + + if len(cached_namespaces) > 0: + type_map = TypeMap(namespaces=catalog) + manager = BuildManager(type_map=type_map) + else: + manager = None + + return cached_namespaces, manager, namespace_dependencies + +@docval( + { + "name": "io", + "type": HDMFIO, + "doc": "An open IO to an NWB file.", + "default": None, + }, # For back-compatability + { + "name": "namespace", + "type": str, + "doc": "A specific namespace to validate against.", + "default": None, + }, # Argument order is for back-compatability + { + "name": "path", + "type": (str, Path), + "doc": "NWB file path.", + "default": None, + }, + { + "name": "use_cached_namespaces", + "type": bool, + "doc": "Whether to use namespaces cached within the file for validation.", + "default": True, + }, + { + "name": "verbose", + "type": bool, + "doc": "Whether or not to print messages to stdout.", + "default": False, + }, + { + "name": "driver", + "type": str, + "doc": "Driver for h5py to use when opening the HDF5 file.", + "default": None, + }, + returns="Validation errors in the file.", + rtype=list, + is_method=False, + allow_positional=AllowPositional.WARNING, +) +def validate(**kwargs): + """Validate NWB file(s) against a namespace or its cached namespaces. + + Note: this function checks for compliance with the NWB schema. + It is recommended to use the NWBInspector for more comprehensive validation of both + compliance with the schema and compliance of data with NWB best practices. + """ + + io, path, use_cached_namespaces, namespace, verbose, driver = getargs( + "io", "path", "use_cached_namespaces", "namespace", "verbose", "driver", kwargs + ) + assert io != path, "Both 'io' and 'path' were specified! Please choose only one." + path = str(path) if isinstance(path, Path) else path + + # get namespaces to validate + namespace_message = "PyNWB namespace information" + io_kwargs = dict(path=path, mode="r", driver=driver) + + if use_cached_namespaces: + cached_namespaces, manager, namespace_dependencies = get_cached_namespaces_to_validate(path=path, + driver=driver, + io=io) + io_kwargs.update(manager=manager) + + if any(cached_namespaces): + namespaces_to_validate = cached_namespaces + namespace_message = "cached namespace information" + else: + namespaces_to_validate = [CORE_NAMESPACE] + if verbose: + warn(f"The file {f'{path} ' if path is not None else ''}has no cached namespace information. " + f"Falling back to {namespace_message}.", UserWarning) + else: + io_kwargs.update(load_namespaces=False) + namespaces_to_validate = [CORE_NAMESPACE] + + # get io object if not provided + if path is not None: + from pynwb import _get_backend + backend_io = _get_backend(path, method=driver) + io = backend_io(**io_kwargs) + + # check namespaces are accurate + if namespace is not None: + if namespace in namespaces_to_validate: + namespaces_to_validate = [namespace] + elif use_cached_namespaces and namespace in namespace_dependencies: # validating against a dependency + for namespace_dependency in namespace_dependencies: + if namespace in namespace_dependencies[namespace_dependency]: + raise ValueError( + f"The namespace '{namespace}' is included by the namespace " + f"'{namespace_dependency}'. Please validate against that namespace instead.") + else: + raise ValueError( + f"The namespace '{namespace}' could not be found in {namespace_message} as only " + f"{namespaces_to_validate} is present.",) + + # validate against namespaces + validation_errors = [] + for validation_namespace in namespaces_to_validate: + if verbose: + print(f"Validating {f'{path} ' if path is not None else ''}against " # noqa: T201 + f"{namespace_message} using namespace '{validation_namespace}'.") + validation_errors += _validate_helper(io=io, namespace=validation_namespace) + + if path is not None: + io.close() # close the io object if it was created within this function, otherwise leave as is + + return validation_errors + diff --git a/src/pynwb/validation_cli.py b/src/pynwb/validation_cli.py new file mode 100644 index 000000000..ad774f25b --- /dev/null +++ b/src/pynwb/validation_cli.py @@ -0,0 +1,84 @@ +"""Command line tool to Validate an NWB file against a namespace.""" +import json +import sys +from argparse import ArgumentParser +from pathlib import Path + +from pynwb.validation import validate, get_cached_namespaces_to_validate + + +def _print_errors(validation_errors: list): + if validation_errors: + print(" - found the following errors:", file=sys.stderr) + for err in validation_errors: + print(str(err), file=sys.stderr) + else: + print(" - no errors found.") + + +def validation_cli(): + """CLI wrapper around pynwb.validate. + + Note: this CLI wrapper checks for compliance with the NWB schema. + It is recommended to use the NWBInspector CLI for more comprehensive validation of both + compliance with the schema and compliance of data with NWB best practices. + """ + parser = ArgumentParser( + description="Validate an NWB file", + epilog="If --ns is not specified, validate against all namespaces in the NWB file.", + ) + + # Special arg specific to CLI + parser.add_argument( + "-lns", + "--list-namespaces", + dest="list_namespaces", + action="store_true", + help="List the available namespaces and exit.", + ) + + # Common args to the API validate + parser.add_argument("paths", type=str, nargs="+", help="NWB file paths") + parser.add_argument("-n", "--ns", type=str, help="the namespace to validate against") + parser.add_argument("--json-output-path", dest="json_output_path", type=str, + help="Write json output to this location.") + feature_parser = parser.add_mutually_exclusive_group(required=False) + feature_parser.add_argument( + "--no-cached-namespace", + dest="no_cached_namespace", + action="store_true", + help="Use the namespaces installed by PyNWB (true) or use the cached namespaces (false; default).", + ) + parser.set_defaults(no_cached_namespace=False) + args = parser.parse_args() + + status = 0 + for path in args.paths: + if args.list_namespaces: + cached_namespaces, _, _ = get_cached_namespaces_to_validate(path=path) + print("\n".join(cached_namespaces)) + else: + validation_errors = [] + try: + val_errors = validate( + path=path, use_cached_namespaces=not args.no_cached_namespace, namespace=args.ns, verbose=True, + ) + _print_errors(validation_errors=val_errors) + status = status or int(val_errors is not None and len(val_errors) > 0) + validation_errors.append(val_errors) + except ValueError as e: + print(e, file=sys.stderr) + status = 1 + + # write output to json file + if args.json_output_path is not None: + with open(args.json_output_path, "w") as f: + json_report = {'exitcode': status, 'errors': [str(e) for e in validation_errors]} + json.dump(obj=json_report, fp=f) + print(f"Report saved to {str(Path(args.json_output_path).absolute())}!") + + sys.exit(status) + + +if __name__ == "__main__": # pragma: no cover + validation_cli() diff --git a/test.py b/test.py index de3c2c762..a93f6237d 100644 --- a/test.py +++ b/test.py @@ -156,7 +156,7 @@ def validate_nwbs(): examples_nwbs = [x for x in examples_nwbs if not x.startswith('sub-')] import pynwb - from pynwb.validate import get_cached_namespaces_to_validate + from pynwb.validation import validate, get_cached_namespaces_to_validate for nwb in examples_nwbs: try: @@ -168,7 +168,8 @@ def validate_nwbs(): is_family_nwb_file = False try: with pynwb.NWBHDF5IO(nwb, mode='r') as io: - errors = pynwb.validate(io) + errors = validate(io, use_cached_namespaces=False) + errors.extend(validate(io, use_cached_namespaces=True)) except OSError as e: # if the file was created with the family driver, need to use the family driver to open it if 'family driver should be used' in str(e): @@ -178,7 +179,8 @@ def validate_nwbs(): memb_size = 1024**2 # note: the memb_size must be the same as the one used to create the file with h5py.File(filename_pattern, mode='r', driver='family', memb_size=memb_size) as f: with pynwb.NWBHDF5IO(file=f, manager=None, mode='r') as io: - errors = pynwb.validate(io) + errors = validate(io, use_cached_namespaces=False) + errors.extend(validate(io, use_cached_namespaces=True)) else: raise e @@ -201,14 +203,14 @@ def validate_nwbs(): ERRORS += 1 cmds = [] - cmds += [["python", "-m", "pynwb.validate", nwb]] - cmds += [["python", "-m", "pynwb.validate", "--no-cached-namespace", nwb]] + cmds += [["pynwb-validate", nwb]] + cmds += [["pynwb-validate", "--no-cached-namespace", nwb]] for ns in namespaces: # for some reason, this logging command is necessary to correctly printing the namespace in the # next logging command logging.info("Namespace found: %s" % ns) - cmds += [["python", "-m", "pynwb.validate", "--ns", ns, nwb]] + cmds += [["pynwb-validate", "--ns", ns, nwb]] for cmd in cmds: logging.info("Validating with \"%s\"." % (" ".join(cmd[:-1]))) diff --git a/tests/integration/ros3/test_ros3.py b/tests/integration/ros3/test_ros3.py index 2571e6199..347b070e4 100644 --- a/tests/integration/ros3/test_ros3.py +++ b/tests/integration/ros3/test_ros3.py @@ -1,6 +1,6 @@ from pynwb import NWBHDF5IO from pynwb import validate -from pynwb.validate import get_cached_namespaces_to_validate +from pynwb.validation import get_cached_namespaces_to_validate from pynwb.testing import TestCase import urllib.request import h5py @@ -90,10 +90,9 @@ def test_dandi_get_cached_namespaces(self): ) self.assertCountEqual(first=found_namespaces, second=expected_namespaces) - self.assertDictEqual(d1=expected_namespace_dependencies, d2=expected_namespace_dependencies) + self.assertDictEqual(d1=found_namespace_dependencies, d2=expected_namespace_dependencies) def test_dandi_validate(self): - result, status = validate(paths=[self.s3_test_path], driver="ros3") + result = validate(path=self.s3_test_path, driver="ros3") assert result == [] - assert status == 0 diff --git a/tests/integration/utils/test_io_utils.py b/tests/integration/utils/test_io_utils.py index 73732924f..7f36e5474 100644 --- a/tests/integration/utils/test_io_utils.py +++ b/tests/integration/utils/test_io_utils.py @@ -1,10 +1,13 @@ """Tests related to pynwb.io.utils.""" import pytest +from datetime import datetime +from dateutil.tz import tzutc + from hdmf.build import GroupBuilder from pynwb.io.utils import get_nwb_version -from pynwb.testing import TestCase - +from pynwb.testing import TestCase, remove_test_file +from pynwb import NWBFile, NWBHDF5IO, _get_backend class TestGetNWBVersion(TestCase): @@ -53,3 +56,23 @@ def test_get_nwb_version_20b(self): builder1.set_attribute(name="nwb_version", value="2.0b") assert get_nwb_version(builder1) == (2, 0, 0) assert get_nwb_version(builder1, include_prerelease=True) == (2, 0, 0, "b") + +class TestGetNWBBackend(TestCase): + def setUp(self): + self.nwbfile = NWBFile(session_description='a test NWB File', + identifier='TEST123', + session_start_time=datetime(1970, 1, 1, 12, tzinfo=tzutc())) + self.hdf5_path = "test_pynwb_nwb_backend.nwb" + with NWBHDF5IO(self.hdf5_path, 'w') as io: + io.write(self.nwbfile) + + def tearDown(self): + remove_test_file(self.hdf5_path) + + def test_get_backend_invalid_file(self): + with self.assertRaises(ValueError): + _get_backend('not_a_file.nwb') + + def test_get_backend_HDF5(self): + backend_io = _get_backend(self.hdf5_path) + self.assertEqual(backend_io, NWBHDF5IO) \ No newline at end of file diff --git a/tests/validation/test_validate.py b/tests/validation/test_validate.py index c2829ee1f..8b2c34888 100644 --- a/tests/validation/test_validate.py +++ b/tests/validation/test_validate.py @@ -1,5 +1,6 @@ import subprocess import re +import os import sys from unittest.mock import patch from io import StringIO @@ -23,7 +24,7 @@ # combine the individual coverage reports into one .coverage file. def run_coverage(extra_args: list[str]): return subprocess.run( - [sys.executable, "-m", "coverage", "run", "-p", "-m", "pynwb.validate"] + [sys.executable, "-m", "coverage", "run", "-p", "-m", "pynwb.validation_cli"] + extra_args, capture_output=True ) @@ -54,13 +55,15 @@ def test_validate_file_no_cache_bad_ns(self): """Test that validating a file with no cached spec against a specified, unknown namespace fails.""" result = run_coverage(["tests/back_compat/1.0.2_nwbfile.nwb", "--ns", "notfound"]) - stderr_regex = re.compile( + stderr_regex_1 = re.compile( r"The file tests/back_compat/1\.0\.2_nwbfile\.nwb has no cached namespace information\. " - r"Falling back to PyNWB namespace information\.\s*" + r"Falling back to PyNWB namespace information\.\s*") + stderr_regex_2 = re.compile( r"The namespace 'notfound' could not be found in PyNWB namespace information as only " - r"\['core'\] is present\.\s*" - ) - self.assertRegex(result.stderr.decode('utf-8'), stderr_regex) + r"\['core'\] is present\.\s*") + + self.assertRegex(result.stderr.decode('utf-8'), stderr_regex_1) + self.assertRegex(result.stderr.decode('utf-8'), stderr_regex_2) self.assertEqual(result.stdout.decode('utf-8'), '') @@ -174,6 +177,22 @@ def test_validate_file_list_namespaces_extension(self): stdout_regex = re.compile(r"ndx-testextension\s*") self.assertRegex(result.stdout.decode('utf-8'), stdout_regex) + def test_validate_file_json_output(self): + """Test that validating a file with the json flag outputs a json file.""" + json_path = "test_validation.json" + run_coverage(["tests/back_compat/1.0.2_str_experimenter.nwb", "--no-cached-namespace", + "--json-output-path", json_path]) + self.assertTrue(os.path.exists(json_path)) + os.remove(json_path) + + def test_validation_entry_point(self): + """Test that using the validation entry point successfully executes the validate CLI.""" + json_path = "test_validation_entry_point.json" + subprocess.run(["pynwb-validate", "tests/back_compat/1.0.2_str_experimenter.nwb", + "--json-output-path", json_path]) + self.assertTrue(os.path.exists(json_path)) + os.remove(json_path) + class TestValidateFunction(TestCase): @@ -199,9 +218,11 @@ def test_validate_io_no_cache(self): def test_validate_io_no_cache_bad_ns(self): """Test that validating a file with no cached spec against a specified, unknown namespace fails.""" - with self.get_io('tests/back_compat/1.0.2_nwbfile.nwb') as io: - with self.assertRaisesWith(KeyError, "\"'notfound' not a namespace\""): - validate(io, 'notfound') + expected_error = ("The namespace 'notfound' could not be found in PyNWB namespace information as only " + "['core'] is present.") + with self.assertRaisesWith(ValueError, expected_error): + with self.get_io('tests/back_compat/1.0.2_nwbfile.nwb') as io: + validate(io=io, namespace='notfound') def test_validate_io_cached(self): """Test that validating a file with cached spec against its cached namespace succeeds.""" @@ -221,75 +242,80 @@ def test_validate_io_cached_extension_pass_ns(self): errors = validate(io, 'ndx-testextension') self.assertEqual(errors, []) - def test_validate_io_cached_core_with_io(self): - """ - For back-compatability, test that validating a file with cached extension spec against the core - namespace succeeds when using the `io` + `namespace` keywords. - """ - with self.get_io(path='tests/back_compat/2.1.0_nwbfile_with_extension.nwb') as io: - results = validate(io=io, namespace="core") - self.assertEqual(results, []) - def test_validate_file_cached_extension(self): """ Test that validating a file with cached extension spec against the core namespace raises an error with the new CLI-mimicing paths keyword. """ nwbfile_path = "tests/back_compat/2.1.0_nwbfile_with_extension.nwb" - with patch("sys.stderr", new=StringIO()) as fake_err: - with patch("sys.stdout", new=StringIO()) as fake_out: - results, status = validate(paths=[nwbfile_path], namespace="core", verbose=True) - self.assertEqual(results, []) - self.assertEqual(status, 1) - self.assertEqual( - fake_err.getvalue(), - ( - "The namespace 'core' is included by the namespace 'ndx-testextension'. " - "Please validate against that namespace instead.\n" - ) - ) - self.assertEqual(fake_out.getvalue(), "") + expected_error = ("The namespace 'core' is included by the namespace 'ndx-testextension'. " + "Please validate against that namespace instead.") + with self.assertRaisesWith(ValueError, expected_error): + validate(path=nwbfile_path, namespace="core", verbose=True) def test_validate_file_cached_core(self): """ Test that validating a file with cached core spec with verbose=False. """ nwbfile_path = "tests/back_compat/1.1.2_nwbfile.nwb" - with patch("sys.stderr", new=StringIO()) as fake_err: - with patch("sys.stdout", new=StringIO()) as fake_out: - results, status = validate(paths=[nwbfile_path], namespace="core") - self.assertEqual(results, []) - self.assertEqual(status, 0) - self.assertEqual(fake_err.getvalue(), "") - self.assertEqual(fake_out.getvalue(), "") + results = validate(path=nwbfile_path, namespace="core") + self.assertEqual(results, []) def test_validate_file_cached_no_cache_bad_ns(self): """ Test that validating a file with no cached namespace, a namespace that is not found, and verbose=False. """ nwbfile_path = "tests/back_compat/1.0.2_nwbfile.nwb" - with patch("sys.stderr", new=StringIO()) as fake_err: - with patch("sys.stdout", new=StringIO()) as fake_out: - results, status = validate(paths=[nwbfile_path], namespace="notfound") - self.assertEqual(results, []) - self.assertEqual(status, 1) - stderr_regex = ( - r"The namespace 'notfound' could not be found in PyNWB namespace information as only " - r"\['core'\] is present.\n" - ) - self.assertRegex(fake_err.getvalue(), stderr_regex) - self.assertEqual(fake_out.getvalue(), "") - - def test_validate_io_cached_bad_ns(self): - """Test that validating a file with cached spec against a specified, unknown namespace fails.""" - with self.get_io('tests/back_compat/1.1.2_nwbfile.nwb') as io: - with self.assertRaisesWith(KeyError, "\"'notfound' not a namespace\""): - validate(io, 'notfound') + expected_error = ("The namespace 'notfound' could not be found in PyNWB namespace information as only " + "['core'] is present.") + with self.assertRaisesWith(ValueError, expected_error): + validate(path=nwbfile_path, namespace="notfound") def test_validate_io_cached_hdmf_common(self): """Test that validating a file with cached spec against the hdmf-common namespace fails.""" - with self.get_io('tests/back_compat/1.1.2_nwbfile.nwb') as io: - # TODO this error should not be different from the error when using the validate script above - msg = "builder must have data type defined with attribute 'data_type'" - with self.assertRaisesWith(ValueError, msg): - validate(io, 'hdmf-common') + expected_error = ("The namespace 'hdmf-common' is included by the namespace 'core'. " + "Please validate against that namespace instead.") + with self.assertRaisesWith(ValueError, expected_error): + with self.get_io(path='tests/back_compat/1.1.2_nwbfile.nwb') as io: + validate(io=io, namespace="hdmf-common", verbose=True) + + def test_validate_io_and_path_same(self): + """Test that validating a file with an io object and a path return the same results.""" + tests = [('tests/back_compat/1.0.2_nwbfile.nwb', None), + ('tests/back_compat/1.1.2_nwbfile.nwb', None), + ('tests/back_compat/1.1.2_nwbfile.nwb', 'core'), + ('tests/back_compat/2.1.0_nwbfile_with_extension.nwb', None), + ('tests/back_compat/2.1.0_nwbfile_with_extension.nwb', 'ndx-testextension'),] + + tests_with_error = [('tests/back_compat/1.0.2_nwbfile.nwb', 'notfound'), + ('tests/back_compat/1.1.2_nwbfile.nwb', 'notfound'), + ('tests/back_compat/1.1.2_nwbfile.nwb', 'hdmf-common'), + ('tests/back_compat/2.1.0_nwbfile_with_extension.nwb', 'core'),] + + # paths that cause no errors + for path, ns in tests: + with patch("sys.stdout", new=StringIO()) as fake_out: + with self.get_io(path=path) as io: + results_io = validate(io=io, namespace=ns, verbose=True) + out_io = fake_out.getvalue() + + with patch("sys.stdout", new=StringIO()) as fake_out: + results_path = validate(path=path, namespace=ns, verbose=True) + out_path = fake_out.getvalue() + + # remove path from error messages since it will not be included in io outputs + out_path = out_path.replace(f'{path} ', '') + self.assertEqual(results_io, results_path) + self.assertEqual(out_io, out_path) + + # paths that return errors + for path, ns in tests_with_error: + with self.assertRaises(ValueError) as e_io: + with self.get_io(path=path) as io: + results_io = validate(io=io, namespace=ns, verbose=True) + + with self.assertRaises(ValueError) as e_path: + results_path = validate(path=path, namespace=ns, verbose=True) + + # remove path from error messages since it will not be included in io outputs + self.assertEqual(str(e_io.exception), str(e_path.exception))