Version 0.7.0 (#93)

* docs: better naming for oml object help pages * docs: fix openml links * BREAKING CHANGE: Rename sugar functions To be more consistent with the rest of mlr3, we renamed: * `oml_data` --> `odt` * `oml_task` --> `otsk` * `oml_flow` --> `oflw` * `oml_run` --> `orn` * `oml_collection` --> `ocl` * ci: only run ci once every week * improve filtering tasks according to task type! * docs: knit readme * fix: examples fail gracefully when OpenML is busy * docs: typos in NEWS * chore: change error message * typo * * fix: strings and nominals are distinguished for parquet files * unload tasks and resamplings * remove test (openml bug was fixed) * docs: improve docu * update required duckdb version duckdb/duckdb#4806 * prepare for CRAN release * fix: wrap example in try statement (CRAN issue) * typo * safely escape example * update cran-comments * fix CRAN NOTE: too long runtime of example * document
mlr-org · Dec 14, 2022 · f7264f1 · f7264f1
1 parent a0bb0e5
commit f7264f1
Show file tree

Hide file tree

Showing 22 changed files with 115 additions and 160 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: mlr3oml
 Title: Connector Between 'mlr3' and 'OpenML'
-Version: 0.6.0-9000
+Version: 0.7.0
 Authors@R:
     c(person("Michel", "Lang", , "michellang@gmail.com", role = "aut",
              comment = c(ORCID = "0000-0001-9754-0393")),
@@ -33,7 +33,7 @@ Imports:
     withr
 Suggests:
     DBI,
-    duckdb,
+    duckdb (>= 0.6.0),
     mlr3db (>= 0.5.0),
     qs,
     RWeka,
@@ -42,4 +42,4 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 NeedsCompilation: yes
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.2
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # mlr3oml 0.7.0
 
+* feature: Add argument `task_type` to function `list_oml_tasks()`.
+* fix: strings and nominals are distinguished for parquet files
 * docs: Fixed some OpenML links
 * docs: Renamed the docs for OpenML objects
 * Renamed the sugar functions from:

diff --git a/R/OMLCollection.R b/R/OMLCollection.R
@@ -44,32 +44,13 @@
 #'   run_collection = OMLCollection$new(id = 232)
 #'   # using sugar
 #'   run_collection = ocl(id = 232)
-#'   run_collection$main_entity_type
-#'   run_collection$tasks
-#'   run_collection$data
-#'   run_collection$flows
-#'   run_collection$runs
+#'   print(run_collection)
 #'
-#'   # mlr3 conversion:
-#'   tasks = as_tasks(run_collection)
-#'   resamplings = as_resamplings(run_collection)
-#'   learners = as_learners(run_collection, "classif")
-#'
-#'   bmr = as_benchmark_result(run_collection)
-#'   bmr$score(msr("classif.ce"))
-#'
-#'   # OpenML task collection
+#'   # OpenML task collection:
 #'   task_collection = OMLCollection$new(id = 258)
 #'   # using sugar
 #'   task_collection = ocl(id = 258)
-#'
-#'   task_collection$main_entity_type
-#'   task_collection$tasks
-#'   task_collection$data
-#'
-#'   # mlr3 conversion
-#'   tasks = as_tasks(task_collection)
-#'   resamplings = as_resamplings(task_collection)
+#'   print(task_collection)
 #'   }, silent = TRUE)
 OMLCollection = R6Class("OMLCollection",
   inherit = OMLObject,

diff --git a/R/OMLData.R b/R/OMLData.R
@@ -233,7 +233,8 @@ OMLData = R6Class("OMLData",
         if (inherits(path, "try-error")) {
           lg$info("Failed to download parquet, trying arff.", id = self$id)
         } else {
-          backend = try(as_duckdb_backend_character(path, primary_key = primary_key), silent = TRUE)
+          factors = self$features[get("data_type") == "nominal", "name"][[1L]]
+          backend = try(as_duckdb_backend_character(path, primary_key = primary_key, factors = factors), silent = TRUE)
           if (inherits(backend, "try-error")) {
             msg = paste(
               "Parquet available but failed to create backend, reverting to arff.",

diff --git a/R/benchmark_grid_oml.R b/R/benchmark_grid_oml.R
@@ -9,17 +9,17 @@
 #' @param resamplings (`list()` or `Resampling`) A list of [mlr3::Resampling]s that are instantiated on the given tasks.
 #'
 #' @examples
-#' \donttest{
-#' library("mlr3")
-#' collection = OMLCollection$new(258)
-#' otasks = collection$tasks[1:2, ][["task"]]
-#' tasks = as_tasks(otasks)
-#' resamplings = as_resamplings(otasks)
-#' learners = lrns(c("classif.rpart", "classif.featureless"))
-#' design = benchmark_grid_oml(tasks, learners, resamplings)
-#' print(design)
-#' bmr = benchmark(design)
-#' }
+#' try({
+#'   library("mlr3")
+#'   collection = OMLCollection$new(258)
+#'   otasks = collection$tasks[1:2, ][["task"]]
+#'   tasks = as_tasks(otasks)
+#'   resamplings = as_resamplings(otasks)
+#'   learners = lrns(c("classif.rpart", "classif.featureless"))
+#'   design = benchmark_grid_oml(tasks, learners, resamplings)
+#'   print(design)
+#'   bmr = benchmark(design)
+#' }, silent = TRUE)
 #' @return ([`data.table()`])
 #' @export
 benchmark_grid_oml = function(tasks, learners, resamplings) {

diff --git a/R/list_oml_data.R b/R/list_oml_data.R
@@ -7,6 +7,9 @@
 #' This function allows querying data sets, tasks, flows, setups, runs, and evaluation measures
 #' from \url{https://www.openml.org/search?type=data&sort=runs&status=active} using some simple filter criteria.
 #'
+#' To find datasets for a specific task type, use [`list_oml_tasks()`], which supports filtering by
+#' task type.
+#'
 #' @details
 #' Filter values are usually provided as single atomic values (typically integer or character).
 #' Provide a numeric vector of length 2 (`c(l, u)`) to find matches in the range \eqn{[l, u]}.
@@ -44,29 +47,29 @@
 #'
 #' @export
 #' @examples
-#' \donttest{
-#' ### query data sets
-#' # search for titanic data set
-#' data_sets = list_oml_data(data_name = "titanic")
-#' print(data_sets)
+#' try({
+#'   ### query data sets
+#'   # search for titanic data set
+#'   data_sets = list_oml_data(data_name = "titanic")
+#'   print(data_sets)
 #'
-#' # search for a reduced version
-#' data_sets = list_oml_data(
-#'   data_name = "titanic",
-#'   number_instances = c(2200, 2300),
-#'   number_features = 4
-#' )
-#' print(data_sets)
+#'   # search for a reduced version
+#'   data_sets = list_oml_data(
+#'     data_name = "titanic",
+#'     number_instances = c(2200, 2300),
+#'     number_features = 4
+#'   )
+#'   print(data_sets)
 #'
-#' ### search tasks for this data set
-#' tasks = list_oml_tasks(data_id = data_sets$data_id)
-#' print(tasks)
+#'   ### search tasks for this data set
+#'   tasks = list_oml_tasks(data_id = data_sets$data_id)
+#'   print(tasks)
 #'
 #'
-#' # query runs, group by number of runs per task_id
-#' runs = list_oml_runs(task_id = tasks$task_id)
-#' runs[, .N, by = task_id]
-#' }
+#'   # query runs, group by number of runs per task_id
+#'   runs = list_oml_runs(task_id = tasks$task_id)
+#'   runs[, .N, by = task_id]
+#' }, silent = TRUE)
 list_oml_data = function(data_id = NULL, data_name = NULL, number_instances = NULL, number_features = NULL,
   number_classes = NULL, number_missing_values = NULL, tag = NULL, limit = limit_default(),
   test_server = test_server_default(), ...) {

diff --git a/R/list_oml_tasks.R b/R/list_oml_tasks.R
@@ -2,7 +2,7 @@
 #' @param task_id (`integer()`)\cr
 #'   Vector of task ids to restrict to.
 #' @param type (`character(1)`)\cr
-#'   The task type, supported values are: clasisf, regr, surv and clust.
+#'   The task type, supported values are: `"classif"`, `"regr"`, `"surv"` and `"clust"`.
 #' @export
 list_oml_tasks = function(task_id = NULL, data_id = NULL, number_instances = NULL, number_features = NULL,
   number_classes = NULL, number_missing_values = NULL, tag = NULL, limit = limit_default(),

diff --git a/R/utils.R b/R/utils.R
@@ -60,7 +60,7 @@ transpose_name_value = function(li, as_integer = FALSE) {
 }
 
 # remove this when it is merged in mlr3db (... in mlr3db is not passed to duckdb constructor...)
-as_duckdb_backend_character = function(data, primary_key = NULL) {
+as_duckdb_backend_character = function(data, primary_key = NULL, factors) {
   require_namespaces(c("DBI", "duckdb", "mlr3db"))
 
   assert_file_exists(data, access = "r", extension = "parquet")
@@ -123,7 +123,7 @@ as_duckdb_backend_character = function(data, primary_key = NULL) {
   }
 
   backend = mlr3db::DataBackendDuckDB$new(con, table = tbl, primary_key = primary_key,
-    strings_as_factors = TRUE
+    strings_as_factors = factors
   )
 
   on.exit()

diff --git a/R/zzz.R b/R/zzz.R
@@ -39,7 +39,7 @@
 #' **Relevant for developers**
 #'
 #' * `mlr3oml.test_server`:
-#'   The default value for whether to use the OpenML [test server](https://test.openml.org/).
+#'   The default value for whether to use the OpenML test server.
 #'   Default is `FALSE`.
 #' * `mlr3oml.test_api_key`:
 #'   API key to use for the test server. If not set, defaults to the value of the environment
@@ -82,7 +82,8 @@ utils::globalVariables(c("super"))
   ResampleResult$private_fields$oml = NULL
   BenchmarkResult$private_fields$oml = NULL
   library.dynam.unload("mlr3oml", libpath)
+  mlr_tasks$remove("oml")
+  mlr_resamplings$remove("oml")
 } # nocov end
 
-
 leanify_package()
diff --git a/cran-comments.md b/cran-comments.md
@@ -2,21 +2,7 @@
 
 None
 
-## R CMD check results
+## Comment
 
-There is only one note that informs about the maintainer change.
-
-Maintainer: 'Sebastian Fischer <sebf.fischer@gmail.com>'
-
-New maintainer:
-  Sebastian Fischer <sebf.fischer@gmail.com>
-Old maintainer(s):
-  Michel Lang <michellang@gmail.com>
-
-
-## Comments
-
-This package uses a REST API and therefore:
-
-* wraps examples in "\dontrun{...}"
-* disables tests relying on an existing server and internet connection on CRAN
+This release fixes the CRAN NOTE that arose because examples did not fail gracefully when the OpenML server
+was unavailable.
diff --git a/man-roxygen/param_test_server.R b/man-roxygen/param_test_server.R
@@ -1,4 +1,3 @@
 #' @param test_server (`character(1)`)\cr
-#' Whether to use the OpenML test server (https://test.openml.org/) or public server
-#' (https://www.openml.org/).
+#' Whether to use the OpenML test server or public server.
 #' Defaults to value of option `"mlr3oml.test_server"`, or `FALSE` if not set.
diff --git a/man/benchmark_grid_oml.Rd b/man/benchmark_grid_oml.Rd
diff --git a/man/list_oml.Rd b/man/list_oml.Rd
diff --git a/man/mlr3oml-package.Rd b/man/mlr3oml-package.Rd