From 87124a396a3d8277ea53b62712c70cb7356b94d9 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 7 Jun 2024 09:34:34 +0200 Subject: [PATCH 01/16] Don't copy from_work_dir outputs to purged path in shell fragment --- lib/galaxy/jobs/command_factory.py | 6 +++++- test/unit/app/jobs/test_command_factory.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/galaxy/jobs/command_factory.py b/lib/galaxy/jobs/command_factory.py index 1c2a67da86ad..bda5c81ce845 100644 --- a/lib/galaxy/jobs/command_factory.py +++ b/lib/galaxy/jobs/command_factory.py @@ -293,7 +293,11 @@ def __copy_if_exists_command(work_dir_output): source_file, destination = work_dir_output if "?" in source_file or "*" in source_file: source_file = source_file.replace("*", '"*"').replace("?", '"?"') - return f'\nif [ -f "{source_file}" ] ; then cp "{source_file}" "{destination}" ; fi' + # Check if source and destination exist. + # Users can purge outputs before the job completes, + # in that case we don't want to copy the output to a purged path. + # Static, non work_dir_output files are handled in job_finish code. + return f'\nif [ -f "{source_file}" -a -f "{destination}" ] ; then cp "{source_file}" "{destination}" ; fi' class CommandsBuilder: diff --git a/test/unit/app/jobs/test_command_factory.py b/test/unit/app/jobs/test_command_factory.py index 48c097a37139..e185904e94e6 100644 --- a/test/unit/app/jobs/test_command_factory.py +++ b/test/unit/app/jobs/test_command_factory.py @@ -21,7 +21,7 @@ TEST_FILES_PATH = "file_path" TEE_REDIRECT = '> "$__out" 2> "$__err"' RETURN_CODE_CAPTURE = "; return_code=$?; echo $return_code > galaxy_1.ec" -CP_WORK_DIR_OUTPUTS = '; \nif [ -f "foo" ] ; then cp "foo" "bar" ; fi' +CP_WORK_DIR_OUTPUTS = '; \nif [ -f "foo" -a -f "bar" ] ; then cp "foo" "bar" ; fi' class TestCommandFactory(TestCase): @@ -105,7 +105,7 @@ def test_workdir_outputs_with_glob(self): self.workdir_outputs = [("foo*bar", "foo_x_bar")] self._assert_command_is( self._surround_command( - MOCK_COMMAND_LINE, '; \nif [ -f "foo"*"bar" ] ; then cp "foo"*"bar" "foo_x_bar" ; fi' + MOCK_COMMAND_LINE, '; \nif [ -f "foo"*"bar" -a -f "foo_x_bar" ] ; then cp "foo"*"bar" "foo_x_bar" ; fi' ) ) From c3a5d61d61861dc479a805a6c603ae5ff3d89c2f Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 7 Jun 2024 09:35:19 +0200 Subject: [PATCH 02/16] Don't include purged outputs in from_work_dir outputs --- lib/galaxy/jobs/runners/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/galaxy/jobs/runners/__init__.py b/lib/galaxy/jobs/runners/__init__.py index 58485fc955ec..d15aa558648d 100644 --- a/lib/galaxy/jobs/runners/__init__.py +++ b/lib/galaxy/jobs/runners/__init__.py @@ -380,6 +380,13 @@ def get_work_dir_outputs( job_tool = job_wrapper.tool for joda, dataset in self._walk_dataset_outputs(job): if joda and job_tool: + if dataset.dataset.purged: + log.info( + "Output dataset %s for job %s purged before job completed, skipping output collection.", + joda.name, + job.id, + ) + continue hda_tool_output = job_tool.find_output_def(joda.name) if hda_tool_output and hda_tool_output.from_work_dir: # Copy from working dir to HDA. From a1d41ab0d6e71d217f832d27bb1fe86d89012708 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 7 Jun 2024 09:35:58 +0200 Subject: [PATCH 03/16] Raise appropriate exception if require path parameter is missing --- lib/galaxy/webapps/galaxy/api/job_files.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/galaxy/webapps/galaxy/api/job_files.py b/lib/galaxy/webapps/galaxy/api/job_files.py index 74c25e07ea35..3bae6d62d137 100644 --- a/lib/galaxy/webapps/galaxy/api/job_files.py +++ b/lib/galaxy/webapps/galaxy/api/job_files.py @@ -99,6 +99,8 @@ def create(self, trans, job_id, payload, **kwargs): """ job = self.__authorize_job_access(trans, job_id, **payload) path = payload.get("path") + if not path: + raise exceptions.RequestParameterInvalidException("'path' parameter not provided or empty.") self.__check_job_can_write_to_path(trans, job, path) # Is this writing an unneeded file? Should this just copy in Python? From 8e6e25d7fbc436315b83450a21dccaba6b46c4f6 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 7 Jun 2024 09:36:43 +0200 Subject: [PATCH 04/16] Don't push purged dataset contents to object store This in particular needs a lot of new tests. We will also need to actively purge datasets in the model store import code, since users might have purged datasets while the job ran. Again, more tests needed. --- lib/galaxy/metadata/set_metadata.py | 4 ++-- lib/galaxy/model/__init__.py | 15 ++++++++------- lib/galaxy/model/store/__init__.py | 21 +++++++++++---------- lib/galaxy/model/store/discover.py | 3 +++ lib/galaxy/objectstore/__init__.py | 2 +- 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/lib/galaxy/metadata/set_metadata.py b/lib/galaxy/metadata/set_metadata.py index 8f1f9ea86be9..d287a9a84c94 100644 --- a/lib/galaxy/metadata/set_metadata.py +++ b/lib/galaxy/metadata/set_metadata.py @@ -96,7 +96,7 @@ def push_if_necessary(object_store: ObjectStore, dataset: DatasetInstance, exter # or a remote object store from its cache path. # empty files could happen when outputs are discovered from working dir, # empty file check needed for e.g. test/integration/test_extended_metadata_outputs_to_working_directory.py::test_tools[multi_output_assign_primary] - if os.path.getsize(external_filename): + if not dataset.dataset.purged and os.path.getsize(external_filename): object_store.update_from_file(dataset.dataset, file_name=external_filename, create=True) @@ -477,7 +477,7 @@ def set_meta(new_dataset_instance, file_dict): object_store_update_actions.append(partial(reset_external_filename, dataset)) object_store_update_actions.append(partial(dataset.set_total_size)) object_store_update_actions.append(partial(export_store.add_dataset, dataset)) - if dataset_instance_id not in unnamed_id_to_path: + if dataset_instance_id not in unnamed_id_to_path and not dataset.dataset.purged: object_store_update_actions.append(partial(collect_extra_files, object_store, dataset, ".")) dataset_state = "deferred" if (is_deferred and final_job_state == "ok") else final_job_state if not dataset.state == dataset.states.ERROR: diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 97ff0f6b0fbf..582da171d315 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -9485,13 +9485,14 @@ def dataset(self) -> Optional[Dataset]: def update_from_file(self, file_name): if not self.dataset: raise Exception("Attempted to write MetadataFile, but no DatasetAssociation set") - self.dataset.object_store.update_from_file( - self, - file_name=file_name, - extra_dir="_metadata_files", - extra_dir_at_root=True, - alt_name=os.path.basename(self.get_file_name()), - ) + if not self.dataset.purged: + self.dataset.object_store.update_from_file( + self, + file_name=file_name, + extra_dir="_metadata_files", + extra_dir_at_root=True, + alt_name=os.path.basename(self.get_file_name()), + ) def get_file_name(self, sync_cache=True): # Ensure the directory structure and the metadata file object exist diff --git a/lib/galaxy/model/store/__init__.py b/lib/galaxy/model/store/__init__.py index 1b78f52ebcc8..42c6e927e2c1 100644 --- a/lib/galaxy/model/store/__init__.py +++ b/lib/galaxy/model/store/__init__.py @@ -654,17 +654,18 @@ def handle_dataset_object_edit(dataset_instance, dataset_attrs): dataset_instance.state = dataset_state if not self.object_store: raise Exception(f"self.object_store is missing from {self}.") - self.object_store.update_from_file( - dataset_instance.dataset, file_name=temp_dataset_file_name, create=True - ) + if not dataset_instance.dataset.purged: + self.object_store.update_from_file( + dataset_instance.dataset, file_name=temp_dataset_file_name, create=True + ) - # Import additional files if present. Histories exported previously might not have this attribute set. - dataset_extra_files_path = dataset_attrs.get("extra_files_path", None) - if dataset_extra_files_path: - assert file_source_root - dataset_extra_files_path = os.path.join(file_source_root, dataset_extra_files_path) - persist_extra_files(self.object_store, dataset_extra_files_path, dataset_instance) - # Don't trust serialized file size + # Import additional files if present. Histories exported previously might not have this attribute set. + dataset_extra_files_path = dataset_attrs.get("extra_files_path", None) + if dataset_extra_files_path: + assert file_source_root + dataset_extra_files_path = os.path.join(file_source_root, dataset_extra_files_path) + persist_extra_files(self.object_store, dataset_extra_files_path, dataset_instance) + # Don't trust serialized file size dataset_instance.dataset.file_size = None dataset_instance.dataset.set_total_size() # update the filesize record in the database diff --git a/lib/galaxy/model/store/discover.py b/lib/galaxy/model/store/discover.py index 9329bda9ee40..2db3749feac9 100644 --- a/lib/galaxy/model/store/discover.py +++ b/lib/galaxy/model/store/discover.py @@ -214,6 +214,9 @@ def create_dataset( return primary_data def finalize_storage(self, primary_data, dataset_attributes, extra_files, filename, link_data, output_name): + if primary_data.dataset.purged: + # metadata won't be set, maybe we should do that, then purge ? + return # Move data from temp location to dataset location if not link_data: dataset = primary_data.dataset diff --git a/lib/galaxy/objectstore/__init__.py b/lib/galaxy/objectstore/__init__.py index f6bb0f4b07ae..49b559525ea9 100644 --- a/lib/galaxy/objectstore/__init__.py +++ b/lib/galaxy/objectstore/__init__.py @@ -1670,7 +1670,7 @@ def persist_extra_files( primary_data: "DatasetInstance", extra_files_path_name: Optional[str] = None, ) -> None: - if os.path.exists(src_extra_files_path): + if not primary_data.dataset.purged and os.path.exists(src_extra_files_path): assert primary_data.dataset if not extra_files_path_name: extra_files_path_name = primary_data.dataset.extra_files_path_name_from(object_store) From 8bc3f2151af1a5d684062b705b36675bfa8e5a02 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sat, 8 Jun 2024 16:27:49 +0200 Subject: [PATCH 05/16] Add tool that produces all types of outputs --- test/functional/tools/all_output_types.xml | 84 ++++++++++++++++++++++ test/functional/tools/sample_tool_conf.xml | 24 +++---- 2 files changed, 96 insertions(+), 12 deletions(-) create mode 100644 test/functional/tools/all_output_types.xml diff --git a/test/functional/tools/all_output_types.xml b/test/functional/tools/all_output_types.xml new file mode 100644 index 000000000000..1914c3b6009a --- /dev/null +++ b/test/functional/tools/all_output_types.xml @@ -0,0 +1,84 @@ + + output.txt && + cp output.txt '$static_output' && + cp '$c1' galaxy.json + ]]> + + + {"output_tool_supplied_metadata": { + "name": "my dynamic name", + "ext": "txt", + "info": "my dynamic info" + }} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/functional/tools/sample_tool_conf.xml b/test/functional/tools/sample_tool_conf.xml index 8775d130e520..2c074381db5e 100644 --- a/test/functional/tools/sample_tool_conf.xml +++ b/test/functional/tools/sample_tool_conf.xml @@ -8,10 +8,10 @@ - - - - + + + + @@ -89,7 +89,7 @@ - +