From ac3938b80d571e40eb9ba10f7e0c7a8f4285c4c3 Mon Sep 17 00:00:00 2001 From: Fabian Zills <46721498+PythonFZ@users.noreply.github.com> Date: Tue, 19 Dec 2023 14:34:23 +0100 Subject: [PATCH] add `auto_remove` and fix `NodeNotAvailable` error (#751) * add `auto_remove` and fix `NodeNotAvailable` error * bugfix + indent --- tests/integration/test_external.py | 4 +-- tests/integration/test_from_rev.py | 10 +++---- tests/integration/test_fs_patch.py | 2 +- tests/integration/test_project.py | 26 +++++++++++++++++ zntrack/core/node.py | 6 ++++ zntrack/project/zntrack_project.py | 46 +++++++++++++++++++++++++++--- 6 files changed, 82 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_external.py b/tests/integration/test_external.py index 456856f7..ed8a66aa 100644 --- a/tests/integration/test_external.py +++ b/tests/integration/test_external.py @@ -21,7 +21,7 @@ def test_external(proj_path): node = zntrack.from_rev( "HelloWorld", remote="https://github.com/PythonFZ/ZnTrackExamples.git", - rev="fbb6ada", + rev="890c714", ) with zntrack.Project() as proj: @@ -37,7 +37,7 @@ def test_external_grp(proj_path): node = zntrack.from_rev( "HelloWorld", remote="https://github.com/PythonFZ/ZnTrackExamples.git", - rev="fbb6ada", + rev="890c714", ) proj = zntrack.Project() diff --git a/tests/integration/test_from_rev.py b/tests/integration/test_from_rev.py index 20f5abb8..4f9acd54 100644 --- a/tests/integration/test_from_rev.py +++ b/tests/integration/test_from_rev.py @@ -29,12 +29,12 @@ def test_import_from_remote(proj_path): node = zntrack.from_rev( "HelloWorld", remote="https://github.com/PythonFZ/ZnTrackExamples.git", - rev="fbb6ada", + rev="890c714", ) assert node.max_number == 512 assert node.random_number == 123 assert node.name == "HelloWorld" - assert node.state.rev == "fbb6ada" + assert node.state.rev == "890c714" assert node.state.remote == "https://github.com/PythonFZ/ZnTrackExamples.git" assert node.state.results == NodeStatusResults.AVAILABLE assert node.uuid == uuid.UUID("1d2d5eef-c42b-4ff4-aa1f-837638fdf090") @@ -44,13 +44,13 @@ def test_connect_from_remote(proj_path): node_a = zntrack.from_rev( "HelloWorld", remote="https://github.com/PythonFZ/ZnTrackExamples.git", - rev="fbb6ada", + rev="890c714", ) node_b = zntrack.from_rev( "HelloWorld", remote="https://github.com/PythonFZ/ZnTrackExamples.git", - rev="35d35ff", + rev="369fe8f", ) assert node_a.random_number == 123 @@ -82,7 +82,7 @@ def test_two_nodes_connect_external(proj_path): node_a = zntrack.from_rev( "HelloWorld", remote="https://github.com/PythonFZ/ZnTrackExamples.git", - rev="fbb6ada", + rev="890c714", ) with zntrack.Project(automatic_node_names=True) as project: diff --git a/tests/integration/test_fs_patch.py b/tests/integration/test_fs_patch.py index fe140f4c..2cf0b572 100644 --- a/tests/integration/test_fs_patch.py +++ b/tests/integration/test_fs_patch.py @@ -38,7 +38,7 @@ def test_patch_list(proj_path): node = zntrack.from_rev( "HelloWorld", remote="https://github.com/PythonFZ/ZnTrackExamples.git", - rev="fbb6ada", + rev="890c714", ) def func(self, path): diff --git a/tests/integration/test_project.py b/tests/integration/test_project.py index 568f3ec2..5d2b8e2b 100644 --- a/tests/integration/test_project.py +++ b/tests/integration/test_project.py @@ -481,3 +481,29 @@ def test_git_only_repo(proj_path, git_only_repo): else: # check if node-meta.json is not in the repo index assert ("nodes/ParamsToOuts/node-meta.json", 0) not in repo.index.entries.keys() + + +def test_auto_remove(proj_path): + with zntrack.Project(automatic_node_names=True) as project: + n1 = zntrack.examples.ParamsToOuts(params="Lorem Ipsum") + n2 = zntrack.examples.ParamsToOuts(params="Dolor Sit") + + project.run() + + n1 = zntrack.examples.ParamsToOuts.from_rev(n1.name) + n2 = zntrack.examples.ParamsToOuts.from_rev(n2.name) + assert n1.outs == "Lorem Ipsum" + assert n2.outs == "Dolor Sit" + + repo = git.Repo() + repo.git.add(".") + repo.index.commit("initial commit") + + with zntrack.Project(automatic_node_names=True) as project: + n1 = zntrack.examples.ParamsToOuts(params="Hello World") + + project.run(auto_remove=True) + + n1 = zntrack.examples.ParamsToOuts.from_rev(n1.name) + with pytest.raises(zntrack.exceptions.NodeNotAvailableError): + n2 = zntrack.examples.ParamsToOuts.from_rev(n2.name) diff --git a/zntrack/core/node.py b/zntrack/core/node.py index 93f5a782..f382ece9 100644 --- a/zntrack/core/node.py +++ b/zntrack/core/node.py @@ -326,6 +326,12 @@ def load(self, lazy: bool = None, results: bool = True) -> None: self._uuid = uuid.UUID(node_meta["uuid"]) self.state.results = NodeStatusResults.AVAILABLE # TODO: documentation about _post_init and _post_load_ and when they are called + + zntrack_config = json.loads(self.state.fs.read_text(config.files.zntrack)) + + if self.name not in zntrack_config: + raise exceptions.NodeNotAvailableError(self) + self._post_load_() @classmethod diff --git a/zntrack/project/zntrack_project.py b/zntrack/project/zntrack_project.py index bffddbf7..597ad85e 100644 --- a/zntrack/project/zntrack_project.py +++ b/zntrack/project/zntrack_project.py @@ -21,6 +21,7 @@ from zntrack import exceptions from zntrack.core.node import Node, get_dvc_cmd from zntrack.utils import NodeName, config, run_dvc_cmd +from zntrack.utils.cli import get_groups log = logging.getLogger(__name__) @@ -171,6 +172,38 @@ def group(self, *names: typing.List[str]): if not node._external_: node.__dict__["nwd"] = grp.nwd / node._name_.get_name_without_groups() + def auto_remove(self, remove_empty_dirs=True): + """Remove all nodes from 'dvc.yaml' that are not in the graph.""" + _, dvc_node_names = get_groups(None, None) + graph_node_names = [self.graph.nodes[x]["value"].name for x in self.graph.nodes] + + nodes_to_remove = [] + + for node_name in dvc_node_names: + if node_name not in graph_node_names: + if "+" in node_name: + # currently there is no way to remove the zntrack.deps Nodes correctly + # so we check for the parent node, if that is not available, we remove + # the node + continue + else: + nodes_to_remove.append(node_name) + + if len(nodes_to_remove): + zntrack_config = json.loads(config.files.zntrack.read_text()) + + for node_name in tqdm.tqdm(nodes_to_remove): + run_dvc_cmd(["remove", node_name, "--outs"]) + _ = zntrack_config.pop(node_name, None) + + config.files.zntrack.write_text(json.dumps(zntrack_config, indent=4)) + + if remove_empty_dirs: + # remove all empty directories inside "nodes" + for path in pathlib.Path("nodes").glob("**/*"): + if path.is_dir() and not any(path.iterdir()): + path.rmdir() + def run( self, eager=False, @@ -179,6 +212,7 @@ def run( save: bool = True, environment: dict = None, nodes: list = None, + auto_remove: bool = False, ): """Run the Project Graph. @@ -200,6 +234,9 @@ def run( A dictionary of environment variables for all nodes. nodes : list, default = None A list of node names to run. If None, run all nodes. + auto_remove : bool, default = False + If True, remove all nodes from 'dvc.yaml' that are not in the graph. + This is the same as calling 'project.auto_remove()' """ if not save and not eager: raise ValueError("Save can only be false if eager is True") @@ -258,11 +295,12 @@ def run( self.repro() # TODO should we load the nodes here? Maybe, if lazy loading is implemented. - def build( - self, environment: dict = None, optional: dict = None, nodes: list = None - ) -> None: + if auto_remove: + self.auto_remove() + + def build(self, **kwargs) -> None: """Build the project graph without running it.""" - self.run(repro=False, environment=environment, optional=optional, nodes=nodes) + self.run(repro=False, **kwargs) def repro(self) -> None: """Run dvc repro."""