Merge pull request #36 from MattCowgill/master

bring in `strayr` functions
runapp-aus · May 31, 2021 · 9d80d34 · 9d80d34
2 parents 2963b10 + 228e6b2
commit 9d80d34
Show file tree

Hide file tree

Showing 22 changed files with 711 additions and 56 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,13 +1,15 @@
 Package: abscorr
 Type: Package
 Title: Ready-to-use Australian common structures and classifications and tools for working with them
-Version: 0.1.0
-Author: person("Will", "Mackey", email = "wfmackey@gmail.com",
+Version: 0.1.1
+Author: c(person("Will", "Mackey", email = "wfmackey@gmail.com",
                 role = c("aut", "cre")),
         person(given = "Matt", family = "Johnson",
                 role = c("aut", "cre")),
         person(given = "David", family = "Diviny", email = "david.diviny@nousgroup.com.au"
-                role = c("aut", "cre"))
+                role = c("aut", "cre")),
+        person("Matt", "Cowgill", role = c("aut"), email = "mattcowgill@gmail.com", comment = c(ORCID = "0000-0003-0422-3300")),
+        person("Bryce", "Roney", role = c("ctb")))
 Maintainer: Will Mackey <wfmackey@gmail.com>
 License: GPL-3
 Description: This package provides data and functions for working with common structures and classifications used in Australia.
@@ -21,9 +23,11 @@ Imports:
     readxl,
     magrittr,
     stringr,
-    stringdist,
     tidyr,
-    rlang
+    rlang,
+    stringdist (>= 0.9.5.1),
+    lubridate,
+    parsedate
 Suggests: 
     testthat,
     here

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,8 +3,11 @@
 export("%>%")
 export(clean_anzsco)
 export(clean_anzsic)
+export(clean_state)
 export(get_seifa)
 export(get_seifa_index_sheet)
+export(is_holiday)
+export(strayr)
 import(absmapsdata)
 import(readxl)
 importFrom(dplyr,across)
@@ -20,6 +23,7 @@ importFrom(dplyr,starts_with)
 importFrom(lifecycle,deprecate_soft)
 importFrom(magrittr,"%>%")
 importFrom(purrr,map_dfr)
+importFrom(purrr,map_lgl)
 importFrom(rlang,.data)
 importFrom(stringdist,amatch)
 importFrom(stringr,str_remove_all)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
+# abscorr 0.1.1
+* Functions from `strayr` package to wrangle state names + public holidays added
 
 # abscorr 0.1.0
-
 * use pkgdown for documentation
 * Added a `NEWS.md` file to track changes to the package
 * Import SEIFA scores for various spatial geometries
diff --git a/R/auholidays.R b/R/auholidays.R
@@ -0,0 +1,10 @@
+#' @title Australian Public Holidays
+#' @description Tidy data set of Australian Public Holidays
+#' @format A data frame with 776 rows and 3 variables:
+#' \describe{
+#'   \item{\code{Date}}{The date of the holiday}
+#'   \item{\code{Name}}{The name of the holiday}
+#'   \item{\code{Jurisdiction}}{Pipe seperated list of jurisdictions to which the holiday applies}
+#'}
+#' @source \url{https://data.gov.au/data/dataset/australian-holidays-machine-readable-dataset}
+"auholidays"
diff --git a/R/clean_state.R b/R/clean_state.R
@@ -0,0 +1,103 @@
+#' Convert Australian state names and abbreviations into a consistent format
+#'
+#' @param x a (character) vector containing Australian state names or abbreviations or
+#' a (numeric) vector containing state codes (1 = NSW, 2 = Vic, 3 = Qld, 4 = SA,
+#' 5 = WA, 6 = Tas, 7 = NT, 8 = ACT).
+#'
+#' @param to what form should the state names be converted to? Options are
+#' "state_name", "state_abbr" (the default), "iso", "postal", and "code".
+#'
+#' @param fuzzy_match logical; either TRUE (the default) which indicates that
+#' approximate/fuzzy string matching should be used, or FALSE which indicates that
+#' only exact matches should be used.
+#'
+#' @param max_dist numeric, sets the maximum acceptable distance between your
+#' string and the matched string. Default is 0.4. Only relevant when fuzzy_match is TRUE.
+#'
+#' @param method the method used for approximate/fuzzy string matching. Default
+#' is "jw", the Jaro-Winker distance; see `??stringdist-metrics` for more options.
+#'
+#' @param ... all arguments to `strayr` are passed to `clean_state`
+#'
+#' @return a character vector of state names, abbreviations, or codes.
+#'
+#' @details `strayr()` is a wrapper around `clean_state()` and is provided for
+#' backwards compatibility. `strayr()` is soft-deprecated, but will not be removed
+#' for the foreseeable future. New code should use `clean_state()`.
+#'
+#' @rdname clean_state
+#' @examples
+#'
+#' x <- c("western Straya", "w. A ", "new soth wailes", "SA", "tazz")
+#'
+#' # Convert the above to state abbreviations
+#' clean_state(x)
+#'
+#' # Convert the elements of `x` to state names
+#'
+#' clean_state(x, to = "state_name")
+#'
+#' # Disable fuzzy matching; you'll get NAs unless exact matches can be found
+#'
+#' clean_state(x, fuzzy_match = FALSE)
+#'
+#' # You can use clean_state in a dplyr mutate call
+#'
+#' x_df <- data.frame(state = x, stringsAsFactors = FALSE)
+#'
+#' \dontrun{x_df %>% mutate(state_abbr = clean_state(state))}
+#'
+#' @importFrom stringdist amatch
+#' @export
+
+clean_state <- function(x, to = "state_abbr", fuzzy_match = TRUE, max_dist = 0.4, method = "jw"){
+
+
+  if(!is.logical(fuzzy_match)){
+    stop("`fuzzy_match` argument must be either `TRUE` or `FALSE`")
+  }
+
+  if(!is.numeric(x)) {
+    x <- state_string_tidy(x)
+  }
+
+  if(fuzzy_match){
+    matched_abbr <- names(state_dict[stringdist::amatch(x, tolower(state_dict),
+                                               method = method,
+                                               matchNA = FALSE,
+                                               maxDist = max_dist)])
+  } else {
+    matched_abbr <- names(state_dict[match(x, tolower(state_dict))])
+  }
+
+  ret <- state_table[[to]][match(matched_abbr, state_table$state_abbr)]
+
+  ret <- as.character(ret)
+
+  ret
+
+}
+
+#' @rdname clean_state
+#' @export
+strayr <- function(...) {
+  .Deprecated(new = "clean_state",
+              msg = "The strayr() function has been renamed clean_state().")
+  clean_state(...)
+}
+
+
+
+state_string_tidy <- function(string){
+
+  string <- tolower(string)
+
+  string <- trimws(string, "both")
+
+  string <- ifelse(string %in% c("na", "n.a", "n.a.", "n a",
+                                 "not applicable"),
+                   NA_character_,
+                   string)
+
+  string
+}
diff --git a/R/is_holiday.R b/R/is_holiday.R
@@ -0,0 +1,47 @@
+#' @title is_holiday
+#' @description Identifies if a given date is a public holiday in Australia
+#' @param date Date, POSIXct object or a string that can be parsed with `parsedate::parse_date`
+#' @param jurisdictions Vector of state jurisdictions you wish to filter on, defaults to all of Australia
+#' @return logical(`TRUE` or `FALSE`) vector
+#' @examples
+#' is_holiday('2020-01-01')
+#' is_holiday('2019-05-27', jurisdictions=c('ACT', 'TAS'))
+#' h_df <- data.frame(dates = c('2020-01-01', '2020-01-10'))
+#' h_df %>%
+#'   dplyr::mutate(IsHoliday = is_holiday(dates))
+#' @rdname is_holiday
+#' @export
+#' @importFrom purrr map_lgl
+is_holiday <- function(date, jurisdictions = c()) {
+  ret <- purrr::map_lgl(date, do_is_holiday, jurisdictions = jurisdictions)
+
+  return(ret)
+}
+
+do_is_holiday <- function(date, jurisdictions = c()) {
+  if (is.na(date)) {
+    stop("`date` argument cannot be NA")
+  }
+
+  if (!lubridate::is.Date(date) & !lubridate::is.POSIXct(date)) {
+    # Attempt to coerce to date
+    new_date <- parsedate::parse_date(date)
+
+    if (is.na(new_date)) {
+      stop("`date` must be a Date, POSIXct object or a string that can be parsed as a date")
+    }
+
+    # Remove any time or time zone element
+    new_date <- lubridate::as_date(new_date)
+  } else {
+    new_date <- lubridate::as_date(date)
+  }
+
+  if (length(jurisdictions) == 0) {
+    ret <- abscorr::auholidays[abscorr::auholidays$Date == new_date, ]
+  } else {
+    ret <- abscorr::auholidays[abscorr::auholidays$Date == new_date & (abscorr::auholidays$Jurisdiction %in% jurisdictions | abscorr::auholidays$Jurisdiction == "NAT"), ]
+  }
+
+  return(nrow(ret) > 0)
+}
diff --git a/R/sysdata.rda b/R/sysdata.rda