From 338836a7c78b9aa7ab6dba3df2490f689ad0e76c Mon Sep 17 00:00:00 2001 From: sdgamboa Date: Mon, 8 Apr 2024 17:39:28 -0400 Subject: [PATCH] fix indents and length of lines --- R/bacdive.R | 549 +++++++++++++++++++------------------- R/bugphyzz.R | 450 +++++++++++++++++-------------- R/cache.R | 24 +- R/fattyAcidComposition.R | 48 ++-- R/physiologies.R | 506 ++++++++++++++++++----------------- bugphyzz.Rproj | 2 +- man/getTaxonSignatures.Rd | 4 +- man/makeSignatures.Rd | 4 +- man/physiologies.Rd | 4 +- vignettes/bugphyzz.Rmd | 69 +++-- 10 files changed, 872 insertions(+), 788 deletions(-) diff --git a/R/bacdive.R b/R/bacdive.R index 7143a207..ca94273f 100644 --- a/R/bacdive.R +++ b/R/bacdive.R @@ -1,316 +1,325 @@ ## Main function for importing BacDive .getBacDive <- function(verbose = FALSE ) { - bacdive_data <- .importBacDiveExcel(verbose = verbose) - colnames(bacdive_data) <- .changeBDColNames(colnames(bacdive_data)) - .getTidyBD(bacdive_data) + bacdive_data <- .importBacDiveExcel(verbose = verbose) + colnames(bacdive_data) <- .changeBDColNames(colnames(bacdive_data)) + .getTidyBD(bacdive_data) } ## Helper function for .getBacDive .importBacDiveExcel <- function(verbose = FALSE) { - if (verbose) - message('Importing BacDive...') - url <- 'https://docs.google.com/spreadsheets/d/1smQTi1IKt4wSGTrGTW25I6u47M5txZkq/export?format=csv' - bacdive <- utils::read.csv(url) - colnames(bacdive) <- tolower(colnames(bacdive)) - return(bacdive) + if (verbose) + message('Importing BacDive...') + url <- 'https://docs.google.com/spreadsheets/d/1smQTi1IKt4wSGTrGTW25I6u47M5txZkq/export?format=csv' + bacdive <- utils::read.csv(url) + colnames(bacdive) <- tolower(colnames(bacdive)) + return(bacdive) } ## Helper function for .getBacDive .changeBDColNames <- function(x) { - dplyr::case_when( - x == 'bacdive_id' ~ 'BacDive_ID', - x == 'taxon_name' ~ 'Taxon_name', - x == 'ncbi_id' ~ 'NCBI_ID', - x == 'rank' ~ 'Rank', - x == 'parent_taxon_name' ~ 'Parent_name', - x == 'parent_ncbi_id' ~ 'Parent_NCBI_ID', - x == 'parent_rank' ~ 'Parent_rank', - x == 'sequence_16S_ncbi_id' ~ 'Seq16S_NCBI_ID', - x == 'sequence_genome_ncbi_id' ~ 'Genome_ID', - x == 'type_strain' ~ 'Type_strain', - TRUE ~ x - ) + dplyr::case_when( + x == 'bacdive_id' ~ 'BacDive_ID', + x == 'taxon_name' ~ 'Taxon_name', + x == 'ncbi_id' ~ 'NCBI_ID', + x == 'rank' ~ 'Rank', + x == 'parent_taxon_name' ~ 'Parent_name', + x == 'parent_ncbi_id' ~ 'Parent_NCBI_ID', + x == 'parent_rank' ~ 'Parent_rank', + x == 'sequence_16S_ncbi_id' ~ 'Seq16S_NCBI_ID', + x == 'sequence_genome_ncbi_id' ~ 'Genome_ID', + x == 'type_strain' ~ 'Type_strain', + TRUE ~ x + ) } ## Helper function for .getBacDive .getTidyBD <- function(bacdive_data) { - bacdive_data |> - tidyr::pivot_longer( - cols = gram_stain:tidyr::last_col(), # Attributes start in the gram_stain column - names_to = 'Attribute', values_to = 'Attribute_value' - ) |> - dplyr::filter(Attribute_value != '') |> - dplyr::mutate(Attribute = gsub('_', ' ', Attribute)) |> - dplyr::mutate( - Attribute = dplyr::case_when( - Attribute == 'oxygen tolerance' ~ 'aerophilicity', - Attribute == 'cell shape' ~ 'shape', - Attribute == 'pathogenicity animal' ~ 'animal pathongen', - Attribute == 'sample type' ~ 'isolation site', - TRUE ~ Attribute - ) - ) |> - dplyr::distinct() + bacdive_data |> + tidyr::pivot_longer( + # Attributes start in the gram_stain column + cols = gram_stain:tidyr::last_col(), + names_to = 'Attribute', values_to = 'Attribute_value' + ) |> + dplyr::filter(Attribute_value != '') |> + dplyr::mutate(Attribute = gsub('_', ' ', Attribute)) |> + dplyr::mutate( + Attribute = dplyr::case_when( + Attribute == 'oxygen tolerance' ~ 'aerophilicity', + Attribute == 'cell shape' ~ 'shape', + Attribute == 'pathogenicity animal' ~ 'animal pathongen', + Attribute == 'sample type' ~ 'isolation site', + TRUE ~ Attribute + ) + ) |> + dplyr::distinct() } ## Function for getting a list of data.frames (one per attribute) ## in tidy format from BacDive .reshapeBacDive <- function(df) { - df[['Attribute_source']] <- 'BacDive' - split_df <- split(df, factor(df[['Attribute']])) + df[['Attribute_source']] <- 'BacDive' + split_df <- split(df, factor(df[['Attribute']])) - ## Attributes that must be changed from character to logical (simplest fix) - attr_names <- c( - 'aerophilicity', - 'shape', - 'country', - 'cultivation medium used', - 'geographic location', - 'isolation site' - ## colony color (delete) - ) + ## Attributes that must be changed from character to logical (simplest fix) + attr_names <- c( + 'aerophilicity', + 'shape', + 'country', + 'cultivation medium used', + 'geographic location', + 'isolation site' + ## colony color (delete) + ) - for (i in seq_along(attr_names)) { - split_df[[attr_names[i]]] <- .catToLog(split_df[[attr_names[i]]]) - if (attr_names[i] %in% c('aerophilicity', 'shape')) { - split_df[[attr_names[i]]]$Attribute_type <- 'multistate-intersection' - } else { - split_df[[attr_names[i]]]$Attribute_type <- 'multistate-union' + for (i in seq_along(attr_names)) { + split_df[[attr_names[i]]] <- .catToLog(split_df[[attr_names[i]]]) + if (attr_names[i] %in% c('aerophilicity', 'shape')) { + split_df[[attr_names[i]]]$Attribute_type <- + 'multistate-intersection' + } else { + split_df[[attr_names[i]]]$Attribute_type <- 'multistate-union' + } } - } - - ## aerophilicity #### - ## This is only to match the data in the bugphyzz spreadsheet - aer <- split_df[['aerophilicity']] - aer$Attribute <- dplyr::case_when( - aer$Attribute == 'aerobe' ~ 'aerobic', - aer$Attribute == 'anaerobe' ~ 'anaerobic', - aer$Attribute == 'facultative anaerobe' ~ 'facultatively anaerobic', - aer$Attribute == 'microaerophile' ~ 'microaerophilic', - aer$Attribute == 'obligate anaerobe' ~ 'obligately anaerobic', - aer$Attribute == 'obligate aerobe' ~ 'obligately aerobic', - TRUE ~ aer$Attribute - ) - split_df[['aerophilicity']] <- aer - ## animal pathogen #### - pos <- names(split_df) == 'animal pathongen' - names(split_df)[pos] <- 'animal pathogen' - x_ <- split_df[['animal pathogen']][['Attribute_value']] - x_ <- ifelse(x_ == "yes, in single cases", "yes", x_) - x_ <- dplyr::case_when(x_ == 'yes' ~ TRUE, x_ == 'no' ~ FALSE) - split_df[['animal pathogen']][['Attribute_value']] <- x_ - split_df[['animal pathogen']][['Attribute_group']] <- 'animal pathogen' - split_df[['animal pathogen']][['Attribute']] <- 'animal pathogen' - split_df[['animal pathogen']][['Attribute_type']] <- 'binary' + ## aerophilicity #### + ## This is only to match the data in the bugphyzz spreadsheet + aer <- split_df[['aerophilicity']] + aer$Attribute <- dplyr::case_when( + aer$Attribute == 'aerobe' ~ 'aerobic', + aer$Attribute == 'anaerobe' ~ 'anaerobic', + aer$Attribute == 'facultative anaerobe' ~ 'facultatively anaerobic', + aer$Attribute == 'microaerophile' ~ 'microaerophilic', + aer$Attribute == 'obligate anaerobe' ~ 'obligately anaerobic', + aer$Attribute == 'obligate aerobe' ~ 'obligately aerobic', + TRUE ~ aer$Attribute + ) + split_df[['aerophilicity']] <- aer - ## biosafety level #### - y <- split_df[['biosafety level comment']][, c('BacDive_ID', 'Attribute_value')] - colnames(y)[2] <- 'Note' - x <- dplyr::left_join(split_df[['biosafety level']], y, by = 'BacDive_ID') - x[['Attribute_value']] <- paste0('biosafety level ', x[['Attribute_value']]) - x[['Attribute']] <- x[['Attribute_value']] - x[['Attribute_value']] <- TRUE - x[['Attribute_group']] <- 'biosafety level' - x[['Attribute_type']] <- 'multistate-intersection' - split_df[['biosafety level']] <- x - split_df[['biosafety level comment']] <- NULL + ## animal pathogen #### + pos <- names(split_df) == 'animal pathongen' + names(split_df)[pos] <- 'animal pathogen' + x_ <- split_df[['animal pathogen']][['Attribute_value']] + x_ <- ifelse(x_ == "yes, in single cases", "yes", x_) + x_ <- dplyr::case_when(x_ == 'yes' ~ TRUE, x_ == 'no' ~ FALSE) + split_df[['animal pathogen']][['Attribute_value']] <- x_ + split_df[['animal pathogen']][['Attribute_group']] <- 'animal pathogen' + split_df[['animal pathogen']][['Attribute']] <- 'animal pathogen' + split_df[['animal pathogen']][['Attribute_type']] <- 'binary' - ## colony color #### - ## This one must be removed - split_df[['colony color']] <- NULL + ## biosafety level #### + y <- split_df[['biosafety level comment']][ + , c('BacDive_ID', 'Attribute_value') + ] + colnames(y)[2] <- 'Note' + x <- dplyr::left_join(split_df[['biosafety level']], y, by = 'BacDive_ID') + x[['Attribute_value']] <- paste0('biosafety level ', x[['Attribute_value']]) + x[['Attribute']] <- x[['Attribute_value']] + x[['Attribute_value']] <- TRUE + x[['Attribute_group']] <- 'biosafety level' + x[['Attribute_type']] <- 'multistate-intersection' + split_df[['biosafety level']] <- x + split_df[['biosafety level comment']] <- NULL - ## cultivation medium used - growth medium #### - pos <- names(split_df) == 'cultivation medium used' - names(split_df)[pos] <- 'growth medium' - split_df[['growth medium']][['Attribute_group']] <- 'growth medium' + ## colony color #### + ## This one must be removed + split_df[['colony color']] <- NULL - ## growth temperature #### - ## culture temperature - ## culture temperature growth - ## culture temperature range (ignore) - ## culture temperature type (ignore) - split_df[['culture temperature range']] <- NULL - split_df[['culture temperature type']] <- NULL - a <- split_df[['culture temperature']] - b <- split_df[['culture temperature growth']] - b_ <- b[,c('BacDive_ID', 'Attribute_value')] - colnames(b_)[2] <- 'growth' - ab <- dplyr::left_join(a, b_, by = 'BacDive_ID') - ab <- ab[ab[['growth']] == 'positive',] - ab[['growth']] <- NULL - ab[['Attribute_group']] <- 'growth temperature' - ab[['Attribute_type']] <- 'range' - ab[['Attribute']] <- 'growth temperature' - split_df[['growth temperature']] <- ab - split_df[['culture temperature']] <- NULL - split_df[['culture temperature growth']] <- NULL + ## cultivation medium used - growth medium #### + pos <- names(split_df) == 'cultivation medium used' + names(split_df)[pos] <- 'growth medium' + split_df[['growth medium']][['Attribute_group']] <- 'growth medium' - ## gram stain #### - gs <- split_df[['gram stain']] - gs[['Attribute']] <- paste(gs[['Attribute']], gs[['Attribute_value']]) - gs[['Attribute_value']] <- TRUE - gs[['Attribute_group']] <- 'gram stain' - gs[['Attribute_type']] <- 'multistate-intersection' - split_df[['gram stain']] <- gs + ## growth temperature #### + ## culture temperature + ## culture temperature growth + ## culture temperature range (ignore) + ## culture temperature type (ignore) + split_df[['culture temperature range']] <- NULL + split_df[['culture temperature type']] <- NULL + a <- split_df[['culture temperature']] + b <- split_df[['culture temperature growth']] + b_ <- b[,c('BacDive_ID', 'Attribute_value')] + colnames(b_)[2] <- 'growth' + ab <- dplyr::left_join(a, b_, by = 'BacDive_ID') + ab <- ab[ab[['growth']] == 'positive',] + ab[['growth']] <- NULL + ab[['Attribute_group']] <- 'growth temperature' + ab[['Attribute_type']] <- 'range' + ab[['Attribute']] <- 'growth temperature' + split_df[['growth temperature']] <- ab + split_df[['culture temperature']] <- NULL + split_df[['culture temperature growth']] <- NULL - ## halophily #### - valid_terms <- c( - 'NaCl', 'KCl', 'MgCl2', 'MgCl2x6H2O', 'Na\\+', 'MgSO4x7H2O', 'Na2SO4', - 'Sea salts', 'Chromium \\(Cr6\\+\\)' - ) - regex <- paste0('(', paste0(valid_terms, collapse = '|'), ')') - split_df[['halophily']] <- split_df[['halophily']] |> - dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> - tidyr::unnest(cols = 'Attribute_value') |> - dplyr::filter(!grepl('no growth', Attribute_value)) |> - dplyr::mutate( - Attribute_value = stringr::str_squish(Attribute_value), - Attribute_value = sub('NaCL', 'NaCl', Attribute_value), - Attribute_value = sub('Marine', 'Sea', Attribute_value), - Attribute_value = sub('Salts', 'salts', Attribute_value) - ) |> - dplyr::filter(grepl(regex, Attribute_value)) |> - dplyr::mutate( - Attribute = stringr::str_extract(Attribute_value, regex), - Unit = Attribute_value |> - stringr::str_extract(' [<>]??[0-9]+\\.??[0-9]*.*') |> - stringr::str_squish() |> - stringr::str_remove('^.* '), - Attribute_value = Attribute_value |> - stringr::str_extract(' [<>]??[0-9]+\\.??[0-9]*.*') |> - stringr::str_squish() |> - stringr::str_remove(' .*$'), - Attribute_group = 'halophily', - Attribute_type = 'range' - ) |> - dplyr::filter(!grepl('[0-9]', Unit)) |> - dplyr::distinct() + ## gram stain #### + gs <- split_df[['gram stain']] + gs[['Attribute']] <- paste(gs[['Attribute']], gs[['Attribute_value']]) + gs[['Attribute_value']] <- TRUE + gs[['Attribute_group']] <- 'gram stain' + gs[['Attribute_type']] <- 'multistate-intersection' + split_df[['gram stain']] <- gs - ## hemolysis #### - split_df[['hemolysis']] <- split_df[['hemolysis']] |> - dplyr::mutate( - Attribute_value = strsplit(Attribute_value, ';|/') - ) |> - tidyr::unnest(Attribute_value) |> - dplyr::mutate(Attribute_value = stringr::str_squish(Attribute_value)) |> - dplyr::filter(Attribute_value != '') |> - dplyr::mutate( - Attribute = Attribute_value, - Attribute_value = TRUE, - Attribute_group = 'hemolysis', - Attribute_type = 'multistate-intersection' + ## halophily #### + valid_terms <- c( + 'NaCl', 'KCl', 'MgCl2', 'MgCl2x6H2O', 'Na\\+', 'MgSO4x7H2O', 'Na2SO4', + 'Sea salts', 'Chromium \\(Cr6\\+\\)' ) + regex <- paste0('(', paste0(valid_terms, collapse = '|'), ')') + split_df[['halophily']] <- split_df[['halophily']] |> + dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> + tidyr::unnest(cols = 'Attribute_value') |> + dplyr::filter(!grepl('no growth', Attribute_value)) |> + dplyr::mutate( + Attribute_value = stringr::str_squish(Attribute_value), + Attribute_value = sub('NaCL', 'NaCl', Attribute_value), + Attribute_value = sub('Marine', 'Sea', Attribute_value), + Attribute_value = sub('Salts', 'salts', Attribute_value) + ) |> + dplyr::filter(grepl(regex, Attribute_value)) |> + dplyr::mutate( + Attribute = stringr::str_extract(Attribute_value, regex), + Unit = Attribute_value |> + stringr::str_extract(' [<>]??[0-9]+\\.??[0-9]*.*') |> + stringr::str_squish() |> + stringr::str_remove('^.* '), + Attribute_value = Attribute_value |> + stringr::str_extract(' [<>]??[0-9]+\\.??[0-9]*.*') |> + stringr::str_squish() |> + stringr::str_remove(' .*$'), + Attribute_group = 'halophily', + Attribute_type = 'range' + ) |> + dplyr::filter(!grepl('[0-9]', Unit)) |> + dplyr::distinct() - ## incubation period - ## This one must be removed - split_df[['incubation period']] <- NULL + ## hemolysis #### + split_df[['hemolysis']] <- split_df[['hemolysis']] |> + dplyr::mutate( + Attribute_value = strsplit(Attribute_value, ';|/') + ) |> + tidyr::unnest(Attribute_value) |> + dplyr::mutate(Attribute_value = stringr::str_squish(Attribute_value)) |> + dplyr::filter(Attribute_value != '') |> + dplyr::mutate( + Attribute = Attribute_value, + Attribute_value = TRUE, + Attribute_group = 'hemolysis', + Attribute_type = 'multistate-intersection' + ) - ## motility #### - split_df[['motility']] <- split_df[['motility']] |> - dplyr::mutate( - Attribute_value = dplyr::case_when( - Attribute_value == 'yes' ~ TRUE, - Attribute_value == 'no' ~ FALSE - ) - ) - split_df[['motility']][['Attribute_group']] <- 'motility' - split_df[['motility']][['Attribute_type']] <- 'binary' + ## incubation period + ## This one must be removed + split_df[['incubation period']] <- NULL - ## pathogenicity human #### - pat <- split_df[['pathogenicity human']] - pat[['Note']] <- stringr::str_extract(pat[['Attribute_value']], 'in single cases') - pat[['Note']] <- ifelse(is.na(pat[['Note']]), "", pat[['Note']]) - pat[['Attribute_value']] <- ifelse(grepl('^yes', pat[['Attribute_value']]), TRUE, NA) - pat <- pat[!is.na(pat[['Attribute_value']]),] - pat[['Attribute_group']] <- 'pathogenicity human' - pat[['Attribute_type']] <- 'binary' - split_df[['pathogenicity human']] <- pat + ## motility #### + split_df[['motility']] <- split_df[['motility']] |> + dplyr::mutate( + Attribute_value = dplyr::case_when( + Attribute_value == 'yes' ~ TRUE, + Attribute_value == 'no' ~ FALSE + ) + ) + split_df[['motility']][['Attribute_group']] <- 'motility' + split_df[['motility']][['Attribute_type']] <- 'binary' - ## metabolite production #### - mp <- split_df[['metabolite production']] - mp <- mp |> - dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> - tidyr::unnest(Attribute_value) - x <- stringr::str_extract(mp[['Attribute_value']], '(yes|no)$') - mp <- mp[which(!is.na(x)),] - y <- stringr::str_extract(mp[['Attribute_value']], '(yes|no)$') - mp[['Attribute']] <- mp[['Attribute_value']] - mp[['Attribute_value']] <- ifelse(y == 'yes', TRUE , FALSE) - mp[['Attribute']] <- sub(' (yes|no)$', '', mp[['Attribute']]) - mp[['Attribute_group']] <- 'metabolite utilization' - mp[['Attribute_type']] <- 'multistate-intersection' - split_df[['metabolite production']] <- mp + ## pathogenicity human #### + pat <- split_df[['pathogenicity human']] + pat[['Note']] <- stringr::str_extract( + pat[['Attribute_value']], 'in single cases' + ) + pat[['Note']] <- ifelse(is.na(pat[['Note']]), "", pat[['Note']]) + pat[['Attribute_value']] <- ifelse( + grepl('^yes', pat[['Attribute_value']]), TRUE, NA + ) + pat <- pat[!is.na(pat[['Attribute_value']]),] + pat[['Attribute_group']] <- 'pathogenicity human' + pat[['Attribute_type']] <- 'binary' + split_df[['pathogenicity human']] <- pat - ## metabolite utilization #### - pos <- names(split_df) == 'metabolite utiilization' - names(split_df)[pos] <- 'metabolite utilization' - mu <- split_df[['metabolite utilization']] - mu <- mu |> - dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> - tidyr::unnest(Attribute_value) |> - dplyr::mutate(Attribute_value = stringr::str_squish(Attribute_value)) - x <- sub('^.* (\\+|-|\\+/-) *.*$', '\\1', mu[['Attribute_value']]) - y <- ifelse(!x %in% c('+', '-', '+/-'), NA, x) - mu <- mu[which(!is.na(y)),] - mu[['Attribute']] <- stringr::str_remove(mu[['Attribute_value']], ' (\\+|-|\\+/-) *.*$') - mu[['Note']] <- sub('^.*(\\+|-|\\+/-) ', '', mu[['Attribute_value']]) - mu[['Note']] <- paste('kind of utilization tested:', mu[['Note']]) - y <- y[!is.na(y)] - mu[['Attribute_value']] <- dplyr::case_when( - y == '+' ~ 'TRUE', - y == '-' ~ 'FALSE', - y == '+/-' ~ 'TRUE/FALSE' - ) - mu <- mu |> - dplyr::mutate(Attribute_value = strsplit(Attribute_value, '/')) |> - tidyr::unnest(Attribute_value) |> - dplyr::mutate(Attribute_value = as.logical(Attribute_value)) - mu[['Attribute_group']] <- 'metabolite utilization' - mu[['Attribute_type']] <- 'multistate-intersection' - split_df[['metabolite utilization']] <- mu + ## metabolite production #### + mp <- split_df[['metabolite production']] + mp <- mp |> + dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> + tidyr::unnest(Attribute_value) + x <- stringr::str_extract(mp[['Attribute_value']], '(yes|no)$') + mp <- mp[which(!is.na(x)),] + y <- stringr::str_extract(mp[['Attribute_value']], '(yes|no)$') + mp[['Attribute']] <- mp[['Attribute_value']] + mp[['Attribute_value']] <- ifelse(y == 'yes', TRUE , FALSE) + mp[['Attribute']] <- sub(' (yes|no)$', '', mp[['Attribute']]) + mp[['Attribute_group']] <- 'metabolite utilization' + mp[['Attribute_type']] <- 'multistate-intersection' + split_df[['metabolite production']] <- mp - ## spore formation #### - sf <- split_df[['spore formation']] - sf <- sf |> - dplyr::mutate( - Attribute_value = dplyr::case_when( - Attribute_value == 'yes' ~ TRUE, - Attribute_value == 'no' ~ FALSE - ), - Attribute_group = 'spore formation', - Attribute_type = 'binary' - ) |> - dplyr::filter(!is.na(Attribute_value)) - split_df[['spore formation']] <- sf + ## metabolite utilization #### + pos <- names(split_df) == 'metabolite utiilization' + names(split_df)[pos] <- 'metabolite utilization' + mu <- split_df[['metabolite utilization']] + mu <- mu |> + dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> + tidyr::unnest(Attribute_value) |> + dplyr::mutate(Attribute_value = stringr::str_squish(Attribute_value)) + x <- sub('^.* (\\+|-|\\+/-) *.*$', '\\1', mu[['Attribute_value']]) + y <- ifelse(!x %in% c('+', '-', '+/-'), NA, x) + mu <- mu[which(!is.na(y)),] + mu[['Attribute']] <- stringr::str_remove( + mu[['Attribute_value']], ' (\\+|-|\\+/-) *.*$' + ) + mu[['Note']] <- sub('^.*(\\+|-|\\+/-) ', '', mu[['Attribute_value']]) + mu[['Note']] <- paste('kind of utilization tested:', mu[['Note']]) + y <- y[!is.na(y)] + mu[['Attribute_value']] <- dplyr::case_when( + y == '+' ~ 'TRUE', + y == '-' ~ 'FALSE', + y == '+/-' ~ 'TRUE/FALSE' + ) + mu <- mu |> + dplyr::mutate(Attribute_value = strsplit(Attribute_value, '/')) |> + tidyr::unnest(Attribute_value) |> + dplyr::mutate(Attribute_value = as.logical(Attribute_value)) + mu[['Attribute_group']] <- 'metabolite utilization' + mu[['Attribute_type']] <- 'multistate-intersection' + split_df[['metabolite utilization']] <- mu - split_df <- lapply(split_df, function(x) { - x <- as.data.frame(x) - x[['NCBI_ID']] <- as.character(x[['NCBI_ID']]) - x[['Parent_NCBI_ID']] <- as.character(x[['Parent_NCBI_ID']]) - x[['Frequency']] <- 'always' - x <- x[!is.na(x[['Attribute_value']]),] - if (unique(x[['Attribute_type']]) == 'numeric') { - x <- .numericToRange(x) - } else if (unique(x[['Attribute_type']] == 'range')) { - x <- .modifyRange(x) - } - x <- x[,purrr::map_lgl(x, ~ !all(is.na(.x)))] - x <- dplyr::distinct(x) - as.data.frame(x) - # x <- .addSourceInfo(x) - }) + ## spore formation #### + sf <- split_df[['spore formation']] + sf <- sf |> + dplyr::mutate( + Attribute_value = dplyr::case_when( + Attribute_value == 'yes' ~ TRUE, + Attribute_value == 'no' ~ FALSE + ), + Attribute_group = 'spore formation', + Attribute_type = 'binary' + ) |> + dplyr::filter(!is.na(Attribute_value)) + split_df[['spore formation']] <- sf + + split_df <- lapply(split_df, function(x) { + x <- as.data.frame(x) + x[['NCBI_ID']] <- as.character(x[['NCBI_ID']]) + x[['Parent_NCBI_ID']] <- as.character(x[['Parent_NCBI_ID']]) + x[['Frequency']] <- 'always' + x <- x[!is.na(x[['Attribute_value']]),] + if (unique(x[['Attribute_type']]) == 'numeric') { + x <- .numericToRange(x) + } else if (unique(x[['Attribute_type']] == 'range')) { + x <- .modifyRange(x) + } + x <- x[,purrr::map_lgl(x, ~ !all(is.na(.x)))] + x <- dplyr::distinct(x) + as.data.frame(x) + }) - return(split_df) + return(split_df) } ## Helper function for .reshapeBacDive .catToLog <- function(df) { - df[['Attribute_group']] <- df[['Attribute']] - df[['Attribute']] <- df[['Attribute_value']] - df[['Attribute_value']] <- TRUE - df[['Attribute_type']] <- 'discrete' # TODO Maybe remove this line? I don't think this is being used - return(df) + df[['Attribute_group']] <- df[['Attribute']] + df[['Attribute']] <- df[['Attribute_value']] + df[['Attribute_value']] <- TRUE + df[['Attribute_type']] <- 'discrete' + return(df) } diff --git a/R/bugphyzz.R b/R/bugphyzz.R index 1986e0a6..dd48e225 100644 --- a/R/bugphyzz.R +++ b/R/bugphyzz.R @@ -1,9 +1,9 @@ utils::globalVariables(c( - "Rank", - "Attribute", "Attribute_group", "Attribute_new", "Attribute_range", - "Attribute_value", "Attribute_value_max", "Attribute_value_min", "Br-C10:1", - "Evidence", "Frequency", "NCBI_ID", "Oxo-C19:1", "Taxon_name", "Unit", - "attribute", "functionname", "gram_stain", "physiology", "unit", "value" + "Rank", + "Attribute", "Attribute_group", "Attribute_new", "Attribute_range", + "Attribute_value", "Attribute_value_max", "Attribute_value_min", "Br-C10:1", + "Evidence", "Frequency", "NCBI_ID", "Oxo-C19:1", "Taxon_name", "Unit", + "attribute", "functionname", "gram_stain", "physiology", "unit", "value" )) #' Import bugphyzz @@ -54,65 +54,76 @@ utils::globalVariables(c( #' names(bp) #' importBugphyzz <- function( - version = "d3fd894", force_download = FALSE, v = 0.5, exclude_rarely = TRUE + version = "d3fd894", force_download = FALSE, v = 0.5, + exclude_rarely = TRUE ) { - ## output is a list of three data.frames - ## one of each: binary, multistate, numeric - output <- .downloadResource(version, force_download) + ## output is a list of three data.frames + ## one of each: binary, multistate, numeric + output <- .downloadResource(version, force_download) - ## TODO add release version - output <- lapply(output, function(x) split(x, x$Attribute)) - output <- purrr::list_flatten(output) + ## TODO add release version + output <- lapply(output, function(x) split(x, x$Attribute)) + output <- purrr::list_flatten(output) - ## TODO correct plant pathogenicity name earlier in the workflow or - ## better yet, directly in the curation - pos <- which(names(output) == "plant pathogenity") - names(output)[pos] <- "plant pathogenicity" - output <- purrr::map(output, ~ { - .x |> - dplyr::mutate( - Attribute = ifelse( - Attribute == "plant pathogenity", - "plant pathogenicity", - Attribute - ) - ) - }) + ## TODO correct plant pathogenicity name earlier in the workflow or + ## better yet, directly in the curation + pos <- which(names(output) == "plant pathogenity") + names(output)[pos] <- "plant pathogenicity" + output <- purrr::map(output, ~ { + .x |> + dplyr::mutate( + Attribute = ifelse( + Attribute == "plant pathogenity", + "plant pathogenicity", + Attribute + ) + ) + }) - names(output) <- purrr::map_chr(output, ~ unique(.x$Attribute)) - val <- .validationData() |> - dplyr::filter(rank == "all") |> - dplyr::select(physiology, attribute, value) |> - dplyr::mutate(physiology = tolower(physiology)) |> - dplyr::mutate(attribute = tolower(attribute)) + names(output) <- purrr::map_chr(output, ~ unique(.x$Attribute)) + val <- .validationData() |> + dplyr::filter(rank == "all") |> + dplyr::select(physiology, attribute, value) |> + dplyr::mutate(physiology = tolower(physiology)) |> + dplyr::mutate(attribute = tolower(attribute)) - output <- purrr::map(output, ~ { - attr_type <- unique(.x$Attribute_type) - if (attr_type == "binary") { - val <- dplyr::select(val, Attribute = attribute, value) - o <- dplyr::left_join(.x, val, by = "Attribute" ) - } else if (attr_type == "multistate-intersection" || attr_type == "multistate-union") { - val <- dplyr::select(val, Attribute = physiology, Attribute_value = attribute, value) - o <- dplyr::left_join(dplyr::mutate(.x, Attribute_value = tolower(Attribute_value)) , val, by = c("Attribute", "Attribute_value")) - } else if (attr_type == "numeric") { - val <- dplyr::select(val, Attribute = attribute, value) - o <- dplyr::left_join(.x, val, by = "Attribute") |> - dplyr::rename(NSTI = nsti) - } - o |> - dplyr::filter( - !(value < v & Evidence == "asr") - ) |> - dplyr::mutate(value = ifelse(Evidence != "asr", NA, value)) |> - dplyr::rename(Validation = value) - }) + output <- purrr::map(output, ~ { + attr_type <- unique(.x$Attribute_type) + if (attr_type == "binary") { + val <- dplyr::select(val, Attribute = attribute, value) + o <- dplyr::left_join(.x, val, by = "Attribute" ) + } else if ( + attr_type == "multistate-intersection" || + attr_type == "multistate-union" + ) { + val <- dplyr::select( + val, Attribute = physiology, Attribute_value = attribute, value + ) + o <- dplyr::left_join( + dplyr::mutate(.x, Attribute_value = tolower(Attribute_value)), + val, by = c("Attribute", "Attribute_value") + ) + } else if (attr_type == "numeric") { + val <- dplyr::select(val, Attribute = attribute, value) + o <- dplyr::left_join(.x, val, by = "Attribute") |> + dplyr::rename(NSTI = nsti) + } + o |> + dplyr::filter( + !(value < v & Evidence == "asr") + ) |> + dplyr::mutate(value = ifelse(Evidence != "asr", NA, value)) |> + dplyr::rename(Validation = value) + }) - if (exclude_rarely) { - output <- purrr::map(output, ~ dplyr::filter(.x, Frequency != "rarely")) - } - return(output) + if (exclude_rarely) { + output <- purrr::map( + output, ~ dplyr::filter(.x, Frequency != "rarely") + ) + } + return(output) } #' Make signatures @@ -126,8 +137,8 @@ importBugphyzz <- function( #' @param tax_level A character vector. Taxonomic rank. Valid options: #' superkingdom, kingdom, phylum, class, order, family, genus, species, strain. #' They can be combined. "mixed" is equivalent to select all valid ranks. -#' @param evidence A character vector. Valid options: exp, igc, nas, tas, tax, asr. -#' They can be combined. Default is all. +#' @param evidence A character vector. Valid options: exp, igc, nas, tas, tax, +#' asr. They can be combined. Default is all. #' @param frequency A character vector. Valid options: always, usually, #' sometimes, rarely, unknown. They can be combined. By default, "rarely" is #' excluded. @@ -147,51 +158,56 @@ importBugphyzz <- function( #' sigs <- purrr::list_flatten(sigs, name_spec = "{inner}") #' makeSignatures <- function( - dat, tax_id_type = "NCBI_ID", - tax_level = "mixed", - evidence = c("exp", "igc", "tas", "nas", "tax", "asr"), - frequency = c("always", "usually", "sometimes", "unknown"), - min_size = 10, min = NULL, max = NULL + dat, tax_id_type = "NCBI_ID", + tax_level = "mixed", + evidence = c("exp", "igc", "tas", "nas", "tax", "asr"), + frequency = c("always", "usually", "sometimes", "unknown"), + min_size = 10, min = NULL, max = NULL ) { - attr_type <- unique(dat$Attribute_type) - if ("mixed" %in% tax_level) { - tax_level <- c( - "kingdom", "phylum", "class", "order", "family", "genus", "species", - "strain" - ) - } - dat <- dat |> - dplyr::filter(Rank %in% tax_level) |> - dplyr::filter(Evidence %in% evidence) |> - dplyr::filter(Frequency %in% frequency) - if (!nrow(dat)) { - warning( - "Not enough data for creating signatures. Try different filtering options", - call. = FALSE - ) - return(NULL) - } - if (attr_type %in% c("multistate-intersection", "binary", "multistate-union")) { - s <- .makeSignaturesDiscrete(dat = dat, tax_id_type = tax_id_type) - } else if (attr_type %in% c("range", "numeric")) { - s <- .makeSignaturesNumeric( - dat = dat, tax_id_type = tax_id_type, min = min, max = max - ) - } - output <- purrr::keep(s, ~ length(.x) >= min_size) - if (!length(output)) { - warning( - "Not enough data for creating signatures. Try different filtering options", - call. = FALSE - ) - } - return(output) + attr_type <- unique(dat$Attribute_type) + if ("mixed" %in% tax_level) { + tax_level <- c( + "kingdom", "phylum", "class", "order", "family", "genus", "species", + "strain" + ) + } + dat <- dat |> + dplyr::filter(Rank %in% tax_level) |> + dplyr::filter(Evidence %in% evidence) |> + dplyr::filter(Frequency %in% frequency) + if (!nrow(dat)) { + warning( + "Not enough data for creating signatures.", + " Try different filtering options", + call. = FALSE + ) + return(NULL) + } + if ( + attr_type %in% + c("multistate-intersection", "binary", "multistate-union") + ) { + s <- .makeSignaturesDiscrete(dat = dat, tax_id_type = tax_id_type) + } else if (attr_type %in% c("range", "numeric")) { + s <- .makeSignaturesNumeric( + dat = dat, tax_id_type = tax_id_type, min = min, max = max + ) + } + output <- purrr::keep(s, ~ length(.x) >= min_size) + if (!length(output)) { + warning( + "Not enough data for creating signatures.", + " Try different filtering options", + call. = FALSE + ) + } + return(output) } #' Get Taxon Signatures #' -#' \code{getTaxonSignatures} returns the names of all of the signatures associated -#' with a particular taxon. More details can be found in the main +#' \code{getTaxonSignatures} returns the names of all of the signatures +#' associated with a particular taxon. More details can be found in the main #' bugphyzz vignette; please run `browseVignettes("bugphyzz")`. #' #' @param tax A valid NCBI ID or taxon name. If taxon name is used, the @@ -210,151 +226,171 @@ makeSignatures <- function( #' sig_names_2 <- getTaxonSignatures(taxonName, bp, tax_id_type = "Taxon_name") #' getTaxonSignatures <- function(tax, bp, ...) { - sigs <- purrr::map(bp, makeSignatures, ...) - sigs <- purrr::list_flatten(sigs, name_spec = "{inner}") - pos <- which(purrr::map_lgl(sigs, ~ tax %in% .x)) - output <- names(sigs)[pos] - return(output) + sigs <- purrr::map(bp, makeSignatures, ...) + sigs <- purrr::list_flatten(sigs, name_spec = "{inner}") + pos <- which(purrr::map_lgl(sigs, ~ tax %in% .x)) + output <- names(sigs)[pos] + return(output) } # Non exported functions ---------------------------------------------------- .makeSignaturesDiscrete <- function(dat, tax_id_type = "NCBI_ID") { - dat |> - dplyr::mutate( - Attribute = paste0("bugphyzz:", Attribute, "|", Attribute_value) - ) |> - {\(y) split(y, y$Attribute)}() |> - lapply(function(x) unique(x[[tax_id_type]])) + dat |> + dplyr::mutate( + Attribute = paste0("bugphyzz:", Attribute, "|", Attribute_value) + ) |> + {\(y) split(y, y$Attribute)}() |> + lapply(function(x) unique(x[[tax_id_type]])) } .makeSignaturesNumeric <- function( - dat, tax_id_type = "NCBI_ID", min = NULL, max = NULL + dat, tax_id_type = "NCBI_ID", min = NULL, max = NULL ) { - if (!is.null(min) || !is.null(max)) { - if (is.null(min)) { - message("Minimum unespecified. Using ", min(dat$Attribute_value), ".") - min <- min(dat$Attribute_value) - } - if (is.null(max)) { - message("Maximum unespecified. Using ", max(dat$Attribute_value), ".") - max <- max(dat$Attribute_value) - } - dat <- dat |> - dplyr::filter( - Attribute_value >= min & Attribute_value <= max - ) |> - dplyr::mutate( - Attribute = paste0("bugphyzz:", Attribute, "| >=", min, " & <=", max) - ) - } else { - thr <- .thresholds() |> - dplyr::filter(Attribute_group == unique(dat$Attribute)) - attr_name <- thr$Attribute - min_values <- thr$lower - max_values <- thr$upper - dat$tmp_col <- NA - for (i in seq_along(attr_name)) { - if (is.na(min_values[i])) - min_values[i] <- min(dat$Attribute_value) - 0.01 - if (is.na(max_values[i])) - max_values[i] <- max(dat$Attribute_value) - pos <- which(dat$Attribute_value > min_values[i] & dat$Attribute_value <= max_values[i]) - dat$tmp_col[pos] <- attr_name[i] - dat$Attribute[pos] <- paste0("bugphyzz:", dat$Attribute[pos], "|", attr_name[i], "| > ", round(min_values[i], 2), " & <= ", max_values[i]) + if (!is.null(min) || !is.null(max)) { + if (is.null(min)) { + message( + "Minimum unespecified. Using ", min(dat$Attribute_value), "." + ) + min <- min(dat$Attribute_value) + } + if (is.null(max)) { + message( + "Maximum unespecified. Using ", max(dat$Attribute_value), "." + ) + max <- max(dat$Attribute_value) + } + dat <- dat |> + dplyr::filter( + Attribute_value >= min & Attribute_value <= max + ) |> + dplyr::mutate( + Attribute = paste0( + "bugphyzz:", Attribute, "| >=", min, " & <=", max + ) + ) + } else { + thr <- .thresholds() |> + dplyr::filter(Attribute_group == unique(dat$Attribute)) + attr_name <- thr$Attribute + min_values <- thr$lower + max_values <- thr$upper + dat$tmp_col <- NA + for (i in seq_along(attr_name)) { + if (is.na(min_values[i])) + min_values[i] <- min(dat$Attribute_value) - 0.01 + if (is.na(max_values[i])) + max_values[i] <- max(dat$Attribute_value) + pos <- which( + dat$Attribute_value > min_values[i] & + dat$Attribute_value <= max_values[i] + ) + dat$tmp_col[pos] <- attr_name[i] + dat$Attribute[pos] <- paste0( + "bugphyzz:", dat$Attribute[pos], "|", attr_name[i], "| > ", + round(min_values[i], 2), " & <= ", max_values[i] + ) + } } - } - dat |> - {\(y) split(y, y$Attribute)}() |> - lapply(function(x) unique(x[[tax_id_type]])) + dat |> + {\(y) split(y, y$Attribute)}() |> + lapply(function(x) unique(x[[tax_id_type]])) } .thresholds <- function() { - fpath <- file.path('extdata', 'thresholds.tsv') - fname <- system.file(fpath, package = 'bugphyzz', mustWork = TRUE) - utils::read.table(fname, header = TRUE, sep = '\t') |> - dplyr::mutate( - range = dplyr::case_when( - is.na(lower) ~ paste0('<=', upper), - is.na(upper) ~ paste0('>=', lower), - TRUE ~ paste0(lower, '-', upper) - ), - unit = ifelse(is.na(unit), '', unit) - ) |> - dplyr::mutate(Attribute_range = paste0(range, unit)) |> - dplyr::relocate( - Attribute_group, Attribute, Attribute_range - ) + fpath <- file.path('extdata', 'thresholds.tsv') + fname <- system.file(fpath, package = 'bugphyzz', mustWork = TRUE) + utils::read.table(fname, header = TRUE, sep = '\t') |> + dplyr::mutate( + range = dplyr::case_when( + is.na(lower) ~ paste0('<=', upper), + is.na(upper) ~ paste0('>=', lower), + TRUE ~ paste0(lower, '-', upper) + ), + unit = ifelse(is.na(unit), '', unit) + ) |> + dplyr::mutate(Attribute_range = paste0(range, unit)) |> + dplyr::relocate( + Attribute_group, Attribute, Attribute_range + ) } .validationData <- function() { - fname <- system.file("extdata", "validation_summary.tsv", package = "bugphyzz") - utils::read.table( - file = fname, header = TRUE, sep = "\t", row.names = NULL - ) |> - dplyr::mutate( - value = dplyr::case_when( - !is.na(mcc_mean) & is.na(r2_mean) ~ mcc_mean, - is.na(mcc_mean) & !is.na(r2_mean) ~ r2_mean - ) + fname <- system.file( + "extdata", "validation_summary.tsv", package = "bugphyzz" ) + utils::read.table( + file = fname, header = TRUE, sep = "\t", row.names = NULL + ) |> + dplyr::mutate( + value = dplyr::case_when( + !is.na(mcc_mean) & is.na(r2_mean) ~ mcc_mean, + is.na(mcc_mean) & !is.na(r2_mean) ~ r2_mean + ) + ) } ## Import a version of bupghyzz .downloadResource <- function(version, force_download) { - if (stringr::str_detect(version, "^10.5281/zenodo.[0-9]+$")) { - suffix <- sub("^10.5281/zenodo\\.", "", version) - output <- .downloadZ(suffix, force_download) - } else if (version == "devel" || stringr::str_detect(version, stringr::regex("^[:alnum:]{7}$")) ){ - output <- .downloadGH(version, force_download) - } else { - stop("Version must be a Zenodo DOI, GitHub commit hash, or 'devel'.") - } - return(output) + if (stringr::str_detect(version, "^10.5281/zenodo.[0-9]+$")) { + suffix <- sub("^10.5281/zenodo\\.", "", version) + output <- .downloadZ(suffix, force_download) + } else if ( + version == "devel" || + stringr::str_detect(version, stringr::regex("^[:alnum:]{7}$")) + ){ + output <- .downloadGH(version, force_download) + } else { + stop("Version must be a Zenodo DOI, GitHub commit hash, or 'devel'.") + } + return(output) } ## Function for downloading data on Zenodo .downloadZ <- function(record, force_download) { - base_url <- paste0("https://zenodo.org/api/records/", record) - req <- httr2::request(base_url) - res <- httr2::req_perform(req) - l <- httr2::resp_body_json(res) + base_url <- paste0("https://zenodo.org/api/records/", record) + req <- httr2::request(base_url) + res <- httr2::req_perform(req) + l <- httr2::resp_body_json(res) - file_names_api <- purrr::map_chr(l$files, ~ .x$links$self) - file_names_url <- sub("(^.*)(api/)(.*)(/content$)", "\\1\\3", file_names_api) + file_names_api <- purrr::map_chr(l$files, ~ .x$links$self) + file_names_url <- sub( + "(^.*)(api/)(.*)(/content$)", "\\1\\3", file_names_api + ) - rpath <- .getResource( - rname = paste0("bugphyzz.zip"), - url = file_names_url, verbose = TRUE, force = force_download - ) - temp_dir <- tempdir() - utils::unzip(zipfile = rpath, exdir = temp_dir, junkpaths = TRUE) - files <- list.files(temp_dir, pattern = "csv", full.names = TRUE) + rpath <- .getResource( + rname = paste0("bugphyzz.zip"), + url = file_names_url, verbose = TRUE, force = force_download + ) + temp_dir <- tempdir() + utils::unzip(zipfile = rpath, exdir = temp_dir, junkpaths = TRUE) + files <- list.files(temp_dir, pattern = "csv", full.names = TRUE) - output <- vector("list", length(files)) - for (i in seq_along(output)) { - output[[i]] <- utils::read.csv(files, header = TRUE, skip = 1) |> - dplyr::mutate(Attribute = tolower(Attribute)) - } - return(output) + output <- vector("list", length(files)) + for (i in seq_along(output)) { + output[[i]] <- utils::read.csv(files, header = TRUE, skip = 1) |> + dplyr::mutate(Attribute = tolower(Attribute)) + } + return(output) } ## Function for downloading data on GitHub .downloadGH <- function(version, force_download) { file_suffix <- c("binary", "multistate", "numeric") - urls <- paste0("https://github.com/waldronlab/bugphyzzExports/raw/", - version, "/bugphyzz_", file_suffix, ".csv" + urls <- paste0( + "https://github.com/waldronlab/bugphyzzExports/raw/", version, + "/bugphyzz_", file_suffix, ".csv" ) names(urls) <- c("binary", "multistate", "numeric") output <- vector("list", length(urls)) for (i in seq_along(output)) { - message("Importing ", names(urls)[i], " data...") - names(output)[i] <- names(urls)[i] - rpath <- .getResource( - rname = paste0("bugphyzz_", names(urls)[i], ".csv"), - url = urls[i], verbose = TRUE, force = force_download - ) - output[[i]] <- utils::read.csv(rpath, header = TRUE, skip = 1) |> - dplyr::mutate(Attribute = tolower(Attribute)) + message("Importing ", names(urls)[i], " data...") + names(output)[i] <- names(urls)[i] + rpath <- .getResource( + rname = paste0("bugphyzz_", names(urls)[i], ".csv"), + url = urls[i], verbose = TRUE, force = force_download + ) + output[[i]] <- utils::read.csv(rpath, header = TRUE, skip = 1) |> + dplyr::mutate(Attribute = tolower(Attribute)) } return(output) } diff --git a/R/cache.R b/R/cache.R index d1aebb41..70f2f329 100644 --- a/R/cache.R +++ b/R/cache.R @@ -8,18 +8,18 @@ res <- BiocFileCache::bfcquery(cache, rname, "rname", exact = TRUE) rid <- res$rid if (length(rid) > 0 && force) { - time <- res$create_time - if (verbose) { - message('Replacing previous version downloaded on ', time, '.') - } - BiocFileCache::bfcremove(cache, rid) - rid <- names(BiocFileCache::bfcadd(cache, rname, url)) + time <- res$create_time + if (verbose) { + message('Replacing previous version downloaded on ', time, '.') + } + BiocFileCache::bfcremove(cache, rid) + rid <- names(BiocFileCache::bfcadd(cache, rname, url)) } if (!length(rid)) { - if (verbose) { - message( "Downloading, ", rname, ".") - } - rid <- names(BiocFileCache::bfcadd(cache, rname, url)) + if (verbose) { + message( "Downloading, ", rname, ".") + } + rid <- names(BiocFileCache::bfcadd(cache, rname, url)) } time <- BiocFileCache::bfcinfo(cache, rid)$create_time message('Using data downloaded on ', time, '.') @@ -27,6 +27,6 @@ } .removeCache <- function() { - cache <- .getCache() - BiocFileCache::removebfc(cache) + cache <- .getCache() + BiocFileCache::removebfc(cache) } diff --git a/R/fattyAcidComposition.R b/R/fattyAcidComposition.R index 92b5c67e..7e9097e4 100644 --- a/R/fattyAcidComposition.R +++ b/R/fattyAcidComposition.R @@ -4,26 +4,28 @@ ## TODO Names of the Fatty Acids should be more "user-friendly" ## TODO Maybe a threshold should be decided to consider a FA as present or not. .fattyAcidComposition <- function(){ - link <- .customLinks() |> - dplyr::filter(functionname == "fattyAcidComposition") |> - dplyr::pull(link) - fac_wide <- utils::read.csv(link, check.names = FALSE) - fac_long <- fac_wide |> - tidyr::pivot_longer( - cols = `Br-C10:1`:`Oxo-C19:1`, - names_to = "Attribute_new", values_to = "Attribute_value" - ) |> - dplyr::mutate(NCBI_ID = as.character(NCBI_ID)) - dplyr::left_join(fac_long, ranks_parents, by = "NCBI_ID") |> - as.data.frame() |> - .addSourceInfo() |> - purrr::modify_at( - .at = c('Attribute', 'Frequency', 'Evidence', 'Confidence_in_curation'), - .f = ~ stringr::str_to_lower(.x) - ) |> - dplyr::select(-Attribute) |> - dplyr::rename(Attribute = Attribute_new) |> - .reorderColumns(attr_type = 'numeric') + link <- .customLinks() |> + dplyr::filter(functionname == "fattyAcidComposition") |> + dplyr::pull(link) + fac_wide <- utils::read.csv(link, check.names = FALSE) + fac_long <- fac_wide |> + tidyr::pivot_longer( + cols = `Br-C10:1`:`Oxo-C19:1`, + names_to = "Attribute_new", values_to = "Attribute_value" + ) |> + dplyr::mutate(NCBI_ID = as.character(NCBI_ID)) + dplyr::left_join(fac_long, ranks_parents, by = "NCBI_ID") |> + as.data.frame() |> + .addSourceInfo() |> + purrr::modify_at( + .at = c( + 'Attribute', 'Frequency', 'Evidence', 'Confidence_in_curation' + ), + .f = ~ stringr::str_to_lower(.x) + ) |> + dplyr::select(-Attribute) |> + dplyr::rename(Attribute = Attribute_new) |> + .reorderColumns(attr_type = 'numeric') } ## Function to import custom links @@ -31,7 +33,9 @@ fname <- system.file("extdata/spreadsheet_customlinks.tsv", package = "bugphyzz") links <- utils::read.table(fname, sep = '\t', header = TRUE) - ifelse(keyword[1] == "all", links, links <- - links[links$physiology %in% keyword,]) + ifelse( + keyword[1] == "all", links, + links <- links[links$physiology %in% keyword,] + ) links } diff --git a/R/physiologies.R b/R/physiologies.R index c401a9ba..651cd795 100644 --- a/R/physiologies.R +++ b/R/physiologies.R @@ -9,8 +9,8 @@ #' Valid keyboards can be checked with \code{showPhys}. If 'all', all #' physiologies are imported. #' @param full_source Logical. If `TRUE`, the Attribute_source column will -#' contain full source information. If `FALSE`, the Attribute_source column will -#' contain shortened versions of the sources. Default is `FALSE`. +#' contain full source information. If `FALSE`, the Attribute_source column +#' will contain shortened versions of the sources. Default is `FALSE`. #' #' @return A list of data.frames in tidy format. #' @@ -21,69 +21,69 @@ #' df <- physiologies('aerophilicity')[[1]] #' physiologies <- function(keyword = 'all', full_source = FALSE) { - keyword <- .checkKeyword(keyword) - cond1 <- any(keyword %in% showPhys('spreadsheets')) - cond2 <- any(keyword %in% showPhys('bacdive')) + keyword <- .checkKeyword(keyword) + cond1 <- any(keyword %in% showPhys('spreadsheets')) + cond2 <- any(keyword %in% showPhys('bacdive')) - if (cond1 && cond2) { - spreadsheets <- .importSpreadsheets(keyword = keyword) - spreadsheets <- spreadsheets[names(spreadsheets) %in% keyword] - bacdive <- .reshapeBacDive(.getBacDive(verbose = FALSE)) - bacdive <- bacdive[names(bacdive) %in% keyword] - physiologies <- vector('list', length(keyword)) - for (i in seq_along(keyword)) { - df1 <- spreadsheets[[keyword[i]]] - df2 <- bacdive[[keyword[i]]] - physiologies[[i]] <- dplyr::bind_rows(df1, df2) - names(physiologies)[i] <- keyword[i] - message('Finished ', keyword[i], '.') + if (cond1 && cond2) { + spreadsheets <- .importSpreadsheets(keyword = keyword) + spreadsheets <- spreadsheets[names(spreadsheets) %in% keyword] + bacdive <- .reshapeBacDive(.getBacDive(verbose = FALSE)) + bacdive <- bacdive[names(bacdive) %in% keyword] + physiologies <- vector('list', length(keyword)) + for (i in seq_along(keyword)) { + df1 <- spreadsheets[[keyword[i]]] + df2 <- bacdive[[keyword[i]]] + physiologies[[i]] <- dplyr::bind_rows(df1, df2) + names(physiologies)[i] <- keyword[i] + message('Finished ', keyword[i], '.') + } + } else if (cond1 && !cond2) { + spreadsheets <- .importSpreadsheets(keyword = keyword) + physiologies <- spreadsheets[names(spreadsheets) %in% keyword] + for (i in seq_along(keyword)) { + message('Finished ', keyword[i], '.') + } + } else if (!cond1 && cond2) { + bacdive <- .reshapeBacDive(.getBacDive(verbose = FALSE)) + physiologies <- bacdive[names(bacdive) %in% keyword] + for (i in seq_along(keyword)) { + message('Finished ', keyword[i], '.') + } } - } else if (cond1 && !cond2) { - spreadsheets <- .importSpreadsheets(keyword = keyword) - physiologies <- spreadsheets[names(spreadsheets) %in% keyword] - for (i in seq_along(keyword)) { - message('Finished ', keyword[i], '.') - } - } else if (!cond1 && cond2) { - bacdive <- .reshapeBacDive(.getBacDive(verbose = FALSE)) - physiologies <- bacdive[names(bacdive) %in% keyword] - for (i in seq_along(keyword)) { - message('Finished ', keyword[i], '.') - } - } - physiologies <- lapply(physiologies, function(df) { - df <- df |> - purrr::modify_if(.p = is.character, ~ stringr::str_squish(.x)) |> - .addSourceInfo() |> - purrr::modify_at( - .at = c('Frequency', 'Evidence', 'Confidence_in_curation'), - ~ stringr::str_squish(stringr::str_to_lower(.x)) - ) |> - dplyr::distinct() + physiologies <- lapply(physiologies, function(df) { + df <- df |> + purrr::modify_if(.p = is.character, ~ stringr::str_squish(.x)) |> + .addSourceInfo() |> + purrr::modify_at( + .at = c('Frequency', 'Evidence', 'Confidence_in_curation'), + ~ stringr::str_squish(stringr::str_to_lower(.x)) + ) |> + dplyr::distinct() - if (full_source) { - df$Attribute_source <- df$full_source - } - df$full_source <- NULL + if (full_source) { + df$Attribute_source <- df$full_source + } + df$full_source <- NULL - df <- .reorderColumns( - df = df, - name = unique(df$Attribute_group), - attr_type = unique(df$Attribute_type) - ) + df <- .reorderColumns( + df = df, + name = unique(df$Attribute_group), + attr_type = unique(df$Attribute_type) + ) - df <- as.data.frame(df[, vapply(df, \(y) !all(is.na(y)), logical(1))]) + df <- as.data.frame(df[, vapply(df, \(y) !all(is.na(y)), logical(1))]) - ## TODO this code could be somewhere else - if (unique(df$Attribute_group) == 'aerophilicity') { - df <- .homogenizeAerophilicityAttributeNames(df) - } + ## TODO this code could be somewhere else + if (unique(df$Attribute_group) == 'aerophilicity') { + df <- .homogenizeAerophilicityAttributeNames(df) + } - dplyr::distinct(df) - }) + dplyr::distinct(df) + }) - return(physiologies) + return(physiologies) } #' Show list of available physiologies (for devs) @@ -104,243 +104,267 @@ physiologies <- function(keyword = 'all', full_source = FALSE) { #' showPhys('spreadsheets') #' showPhys <- function(which_names = 'all') { - fname <- system.file('extdata', 'spreadsheet_links.tsv', package = 'bugphyzz') - links <- utils::read.table(fname, header = TRUE, sep = '\t') - spreadsheet_phys <- links[['physiology']] - if (which_names == 'all') - ## bacdive_phys_names is a character vector saved as internal data - phys_names <- sort(unique(c(spreadsheet_phys, bacdive_phys_names))) - if (which_names == 'spreadsheets') - phys_names <- spreadsheet_phys - if (which_names == 'bacdive') - phys_names <- bacdive_phys_names - return(phys_names) + fname <- system.file( + 'extdata', 'spreadsheet_links.tsv', package = 'bugphyzz' + ) + links <- utils::read.table(fname, header = TRUE, sep = '\t') + spreadsheet_phys <- links[['physiology']] + if (which_names == 'all') + ## bacdive_phys_names is a character vector saved as internal data + phys_names <- sort(unique(c(spreadsheet_phys, bacdive_phys_names))) + if (which_names == 'spreadsheets') + phys_names <- spreadsheet_phys + if (which_names == 'bacdive') + phys_names <- bacdive_phys_names + return(phys_names) } ## Helper function for physiologies .checkKeyword <- function(keyword) { - keyword <- unique(sort(keyword)) - if ('all' %in% keyword) { - if (length(keyword) > 1) { - stop( - "Found 'all' among the keywords. Are you sure that you want to ", - "import all of the physiologies? If so, use 'all' alone. Quitting.", - call. = FALSE - ) - } else if (length(keyword) == 1) { - message('All physiologies will be imported.') - keyword <- showPhys() + keyword <- unique(sort(keyword)) + if ('all' %in% keyword) { + if (length(keyword) > 1) { + stop( + "Found 'all' among the keywords.", + "Are you sure that you want to import all of the physiologies?", + "If so, use 'all' alone. Quitting.", + call. = FALSE + ) + } else if (length(keyword) == 1) { + message('All physiologies will be imported.') + keyword <- showPhys() + } } - } - valid_keywords <- showPhys() - lgl_vct <- keyword %in% valid_keywords - if (any(!lgl_vct) ) { - invalid_keywords <- keyword[!lgl_vct] - stop( - "Invalid keyword(s): ", paste0(invalid_keywords, collapse = ', '), '.', - " Check valid keywords with showPhys() or use 'all' to import all", - " physiologies.", - call. = FALSE - ) - } - return(keyword) + valid_keywords <- showPhys() + lgl_vct <- keyword %in% valid_keywords + if (any(!lgl_vct) ) { + invalid_keywords <- keyword[!lgl_vct] + stop( + "Invalid keyword(s): ", + paste0(invalid_keywords, collapse = ', '), '.', + " Check valid keywords with showPhys() or use 'all' to import all", + " physiologies.", + call. = FALSE + ) + } + return(keyword) } ## Helper function for physiologies .importSpreadsheets <- function(keyword) { - parent_col_names <- c('Parent_name', 'Parent_NCBI_ID', 'Parent_rank') - fname <- system.file('extdata', 'spreadsheet_links.tsv', package = 'bugphyzz') - links <- utils::read.table(fname, header = TRUE, sep = '\t') - links <- links[links[['physiology']] %in% keyword,] - spreadsheets <- vector('list', nrow(links)) - for (i in seq_along(spreadsheets)) { - phys_name <- links[i, 'physiology', drop = FALSE][[1]] - attr_type <- links[i, 'attribute_type', drop = FALSE][[1]] - names(spreadsheets)[i] <- phys_name - url <- links[i, 'link', drop = FALSE][[1]] - df <- dplyr::distinct(utils::read.csv(url)) - df[['Attribute_type']] <- attr_type - df[['Attribute_group']] <- phys_name - df[['NCBI_ID']] <- as.character(df[['NCBI_ID']]) - df <- df[!is.na(df[['Attribute_value']]),] + parent_col_names <- c('Parent_name', 'Parent_NCBI_ID', 'Parent_rank') + fname <- system.file( + 'extdata', 'spreadsheet_links.tsv', package = 'bugphyzz' + ) + links <- utils::read.table(fname, header = TRUE, sep = '\t') + links <- links[links[['physiology']] %in% keyword,] + spreadsheets <- vector('list', nrow(links)) + for (i in seq_along(spreadsheets)) { + phys_name <- links[i, 'physiology', drop = FALSE][[1]] + attr_type <- links[i, 'attribute_type', drop = FALSE][[1]] + names(spreadsheets)[i] <- phys_name + url <- links[i, 'link', drop = FALSE][[1]] + df <- dplyr::distinct(utils::read.csv(url)) + df[['Attribute_type']] <- attr_type + df[['Attribute_group']] <- phys_name + df[['NCBI_ID']] <- as.character(df[['NCBI_ID']]) + df <- df[!is.na(df[['Attribute_value']]),] - if (unique(df[['Attribute_type']]) == 'numeric') { - df <- .numericToRange(df) - } else if (unique(df[['Attribute_type']] == 'range')) { - df <- .modifyRange(df) - } else if (unique(df[['Attribute_type']] %in% .DISCRETE_ATTRIBUTE_TYPES())) { - df <- dplyr::filter(df, Attribute_value == TRUE | Attribute_value == FALSE) - } + if (unique(df[['Attribute_type']]) == 'numeric') { + df <- .numericToRange(df) + } else if (unique(df[['Attribute_type']] == 'range')) { + df <- .modifyRange(df) + } else if ( + unique(df[['Attribute_type']] %in% .DISCRETE_ATTRIBUTE_TYPES()) + ) { + df <- dplyr::filter( + df, Attribute_value == TRUE | Attribute_value == FALSE + ) + } - if (all(parent_col_names %in% colnames(df))) { - df$Parent_NCBI_ID <- stringr::str_squish(as.character(df$Parent_NCBI_ID)) - } else { - ## ranks_parents is an internal object (data.frame) in bugphyzz - rp <- purrr::modify_at( - .x = ranks_parents, - .at = c('NCBI_ID', 'Parent_NCBI_ID'), - .f = as.character - ) - df <- dplyr::left_join(df, rp, by = "NCBI_ID") + if (all(parent_col_names %in% colnames(df))) { + df$Parent_NCBI_ID <- stringr::str_squish( + as.character(df$Parent_NCBI_ID) + ) + } else { + ## ranks_parents is an internal object (data.frame) in bugphyzz + rp <- purrr::modify_at( + .x = ranks_parents, + .at = c('NCBI_ID', 'Parent_NCBI_ID'), + .f = as.character + ) + df <- dplyr::left_join(df, rp, by = "NCBI_ID") + } + spreadsheets[[i]] <- df } - spreadsheets[[i]] <- df - } - return(spreadsheets) + return(spreadsheets) } ## Helper function for .importSpreadsheets .numericToRange <- function(df) { - df <- df |> - dplyr::group_by(NCBI_ID, Taxon_name) |> - dplyr::mutate( - Attribute_value_min = as.double(Attribute_value), - Attribute_value_max = as.double(Attribute_value), - Attribute_type = 'range' - ) |> - dplyr::ungroup() |> - dplyr::distinct() - df[['Attribute_value']] <- NULL - return(df) + df <- df |> + dplyr::group_by(NCBI_ID, Taxon_name) |> + dplyr::mutate( + Attribute_value_min = as.double(Attribute_value), + Attribute_value_max = as.double(Attribute_value), + Attribute_type = 'range' + ) |> + dplyr::ungroup() |> + dplyr::distinct() + df[['Attribute_value']] <- NULL + return(df) } ## Helper function for .importSpreadsheets .modifyRange <- function(df) { - num <- '[0-9]+(\\.[0-9]+)?' - regex1 <- paste0('^\\-?', num, '(\\-', num, ')?$') - regex2 <- paste0('^(<|>)(\\-)?', num, '$') - regex <- paste0('(', regex1, '|', regex2, ')') - df <- df |> - dplyr::filter(grepl(regex, Attribute_value)) |> - dplyr::mutate( - Attribute_value = sub('^(\\-)([0-9]+(\\.[0-9]+)?)', 'minus\\2', Attribute_value) - ) |> - dplyr::mutate( - Attribute_value = gsub(' ', '', Attribute_value), - Attribute_value = dplyr::case_when( - grepl('<', Attribute_value) ~ paste0('-', Attribute_value), - grepl('>', Attribute_value) ~ paste0(Attribute_value, '-'), - !grepl("\\-", Attribute_value) ~ paste0(Attribute_value, '-', Attribute_value), - grepl("^\\-", Attribute_value) ~ paste0("minusInf", Attribute_value), - grepl("\\-$", Attribute_value) ~ paste0(Attribute_value, "Inf"), - TRUE ~ Attribute_value - ), - Attribute_value = sub('(<|>)', '', Attribute_value), - Attribute_value = dplyr::case_when( - grepl("^\\-", Attribute_value) ~ paste0("minusInf", Attribute_value), - grepl("\\-$", Attribute_value) ~ paste0(Attribute_value, "Inf"), - TRUE ~ Attribute_value - ) - ) |> - tidyr::separate( - col = 'Attribute_value', - into = c('Attribute_value_min', 'Attribute_value_max'), sep = '-' - ) |> - dplyr::mutate( - Attribute_value_min = sub('minus', '-', Attribute_value_min), - Attribute_value_max = sub('minus', '-', Attribute_value_min) - ) |> - dplyr::mutate( - Attribute_value_min = as.double(Attribute_value_min), - Attribute_value_max = as.double(Attribute_value_max) - ) |> - dplyr::distinct() + num <- '[0-9]+(\\.[0-9]+)?' + regex1 <- paste0('^\\-?', num, '(\\-', num, ')?$') + regex2 <- paste0('^(<|>)(\\-)?', num, '$') + regex <- paste0('(', regex1, '|', regex2, ')') + df <- df |> + dplyr::filter(grepl(regex, Attribute_value)) |> + dplyr::mutate( + Attribute_value = sub( + '^(\\-)([0-9]+(\\.[0-9]+)?)', 'minus\\2', Attribute_value + ) + ) |> + dplyr::mutate( + Attribute_value = gsub(' ', '', Attribute_value), + Attribute_value = dplyr::case_when( + grepl('<', Attribute_value) ~ paste0('-', Attribute_value), + grepl('>', Attribute_value) ~ paste0(Attribute_value, '-'), + !grepl("\\-", Attribute_value) ~ + paste0(Attribute_value, '-', Attribute_value), + grepl("^\\-", Attribute_value) ~ + paste0("minusInf", Attribute_value), + grepl("\\-$", Attribute_value) ~ paste0(Attribute_value, "Inf"), + TRUE ~ Attribute_value + ), + Attribute_value = sub('(<|>)', '', Attribute_value), + Attribute_value = dplyr::case_when( + grepl("^\\-", Attribute_value) ~ + paste0("minusInf", Attribute_value), + grepl("\\-$", Attribute_value) ~ paste0(Attribute_value, "Inf"), + TRUE ~ Attribute_value + ) + ) |> + tidyr::separate( + col = 'Attribute_value', + into = c('Attribute_value_min', 'Attribute_value_max'), sep = '-' + ) |> + dplyr::mutate( + Attribute_value_min = sub('minus', '-', Attribute_value_min), + Attribute_value_max = sub('minus', '-', Attribute_value_min) + ) |> + dplyr::mutate( + Attribute_value_min = as.double(Attribute_value_min), + Attribute_value_max = as.double(Attribute_value_max) + ) |> + dplyr::distinct() } ## helper function for .importSpreadsheets .DISCRETE_ATTRIBUTE_TYPES <- function() { - fname <- system.file('extdata', 'spreadsheet_links.tsv', package = 'bugphyzz') - dat <- utils::read.table(file = fname, header = TRUE, sep = '\t') - unique(dat[dat$trait_type == 'discrete',]$attribute_type) + fname <- system.file( + 'extdata', 'spreadsheet_links.tsv', package = 'bugphyzz' + ) + dat <- utils::read.table(file = fname, header = TRUE, sep = '\t') + unique(dat[dat$trait_type == 'discrete',]$attribute_type) } ## Helper function for physiologies .addSourceInfo <- function(dat) { - fpath <- system.file('extdata', 'attribute_sources.tsv', package = 'bugphyzz') - source_data <- utils::read.table( - file = fpath, header = TRUE, sep = '\t', quote = '', check.names = FALSE, - comment.char = '' - ) - dplyr::left_join(dat, source_data, by = 'Attribute_source') + fpath <- system.file( + 'extdata', 'attribute_sources.tsv', package = 'bugphyzz' + ) + source_data <- utils::read.table( + file = fpath, header = TRUE, sep = '\t', quote = '', + check.names = FALSE, comment.char = '' + ) + dplyr::left_join(dat, source_data, by = 'Attribute_source') } ## Helper function for physiologies .reorderColumns <- function(df, name = NULL, attr_type) { - col_names <- colnames(df) - req_cols <- .requiredColumns(attr_type) - cols_lgl <- req_cols %in% col_names - if (!all(cols_lgl)) { - missing_cols <- paste0(req_cols[!cols_lgl], collapse = ', ') - if (!is.null(name)) { - msg <- paste0( - 'Missing columns in ', name, '.', ' Missing columns are: ', - missing_cols - ) - } else { - msg <- paste0( - 'Missing columns.', ' Missing columns are: ', missing_cols - ) + col_names <- colnames(df) + req_cols <- .requiredColumns(attr_type) + cols_lgl <- req_cols %in% col_names + if (!all(cols_lgl)) { + missing_cols <- paste0(req_cols[!cols_lgl], collapse = ', ') + if (!is.null(name)) { + msg <- paste0( + 'Missing columns in ', name, '.', ' Missing columns are: ', + missing_cols + ) + } else { + msg <- paste0( + 'Missing columns.', ' Missing columns are: ', missing_cols + ) + } + warning(msg, call. = FALSE) } - warning(msg, call. = FALSE) - } - cols <- req_cols[cols_lgl] - df |> - dplyr::relocate(dplyr::all_of(cols)) + cols <- req_cols[cols_lgl] + df |> + dplyr::relocate(dplyr::all_of(cols)) } ## Helper function for physiologies .homogenizeAerophilicityAttributeNames <- function(df) { - df |> dplyr::mutate( - Attribute = dplyr::case_when( - Attribute == 'obligately anaerobic' ~ 'anaerobic', - Attribute == 'microaerophilic' ~ 'aerobic', - Attribute == 'obligately aerobic' ~ 'aerobic', - TRUE ~ Attribute + df |> dplyr::mutate( + Attribute = dplyr::case_when( + Attribute == 'obligately anaerobic' ~ 'anaerobic', + Attribute == 'microaerophilic' ~ 'aerobic', + Attribute == 'obligately aerobic' ~ 'aerobic', + TRUE ~ Attribute + ) ) - ) } ## Required columns .requiredColumns <- function(attr_type) { - fname <- system.file("extdata/curation_template.tsv", package = "bugphyzz") - df <- utils::read.table(fname, sep = "\t", header = TRUE) - lgl_vct_1 <- df$requiredness == "required" - lgl_vct_2 <- grepl(attr_type, df$attribute_types) - df <- df[lgl_vct_1 & lgl_vct_2,] - df[order(df[["required_column_order"]]), , drop = FALSE] - output <- df[['column_name']] - return(output) + fname <- system.file("extdata/curation_template.tsv", package = "bugphyzz") + df <- utils::read.table(fname, sep = "\t", header = TRUE) + lgl_vct_1 <- df$requiredness == "required" + lgl_vct_2 <- grepl(attr_type, df$attribute_types) + df <- df[lgl_vct_1 & lgl_vct_2,] + df[order(df[["required_column_order"]]), , drop = FALSE] + output <- df[['column_name']] + return(output) } ## Generate a template for a bugphyzz dataset .template <- function(dataset) { - template_tsv <- system.file("extdata/curation_template.tsv", package = "bugphyzz") - template <- utils::read.table( - file = template_tsv, sep = "\t", check.names = FALSE, header = TRUE, - allowEscapes = TRUE ) - # template <- readr::read_tsv(template_tsv, show_col_types = FALSE) - template[template[["column_name"]] %in% colnames(dataset), ] + template_tsv <- system.file( + "extdata/curation_template.tsv", package = "bugphyzz" + ) + template <- utils::read.table( + file = template_tsv, sep = "\t", check.names = FALSE, header = TRUE, + allowEscapes = TRUE ) + # template <- readr::read_tsv(template_tsv, show_col_types = FALSE) + template[template[["column_name"]] %in% colnames(dataset), ] } ## Print valid attributes .attributes <- function() { - fname <- system.file("extdata/attributes.tsv", package = "bugphyzz") - df <- utils::read.table( - fname, sep = "\t", header = TRUE, check.names = FALSE - ) - unique(df[,"attribute"]) + fname <- system.file("extdata/attributes.tsv", package = "bugphyzz") + df <- utils::read.table( + fname, sep = "\t", header = TRUE, check.names = FALSE + ) + unique(df[,"attribute"]) } ## Append links to error table .appendLinks <- function(x) { - fname1 <- system.file('extdata/spreadsheet_links.tsv', package = 'bugphyzz') - links <- utils::read.table(fname1, header = TRUE, sep = '\t') - select_cols <- c("physiology", "source_link") - phys_links <- links |> - dplyr::select(tidyselect::all_of(select_cols)) - custom_links <- .customLinks() |> - dplyr::select(tidyselect::all_of((select_cols))) - links <- dplyr::bind_rows(phys_links, custom_links) - x |> - dplyr::left_join(links, by = c("dataset" = "physiology")) + fname1 <- system.file( + 'extdata/spreadsheet_links.tsv', package = 'bugphyzz' + ) + links <- utils::read.table(fname1, header = TRUE, sep = '\t') + select_cols <- c("physiology", "source_link") + phys_links <- links |> + dplyr::select(tidyselect::all_of(select_cols)) + custom_links <- .customLinks() |> + dplyr::select(tidyselect::all_of((select_cols))) + links <- dplyr::bind_rows(phys_links, custom_links) + x |> + dplyr::left_join(links, by = c("dataset" = "physiology")) } - diff --git a/bugphyzz.Rproj b/bugphyzz.Rproj index 69fafd4b..fd8dd284 100644 --- a/bugphyzz.Rproj +++ b/bugphyzz.Rproj @@ -6,7 +6,7 @@ AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes -NumSpacesForTab: 2 +NumSpacesForTab: 4 Encoding: UTF-8 RnwWeave: Sweave diff --git a/man/getTaxonSignatures.Rd b/man/getTaxonSignatures.Rd index 7957c6a9..e6b5dde1 100644 --- a/man/getTaxonSignatures.Rd +++ b/man/getTaxonSignatures.Rd @@ -18,8 +18,8 @@ argument tax_id_type = "Taxon_name" must also be used.} A character vector with the names of the signatures for a taxon. } \description{ -\code{getTaxonSignatures} returns the names of all of the signatures associated -with a particular taxon. More details can be found in the main +\code{getTaxonSignatures} returns the names of all of the signatures +associated with a particular taxon. More details can be found in the main bugphyzz vignette; please run \code{browseVignettes("bugphyzz")}. } \examples{ diff --git a/man/makeSignatures.Rd b/man/makeSignatures.Rd index b6ae2c14..5eb2b59e 100644 --- a/man/makeSignatures.Rd +++ b/man/makeSignatures.Rd @@ -24,8 +24,8 @@ makeSignatures( superkingdom, kingdom, phylum, class, order, family, genus, species, strain. They can be combined. "mixed" is equivalent to select all valid ranks.} -\item{evidence}{A character vector. Valid options: exp, igc, nas, tas, tax, asr. -They can be combined. Default is all.} +\item{evidence}{A character vector. Valid options: exp, igc, nas, tas, tax, +asr. They can be combined. Default is all.} \item{frequency}{A character vector. Valid options: always, usually, sometimes, rarely, unknown. They can be combined. By default, "rarely" is diff --git a/man/physiologies.Rd b/man/physiologies.Rd index bbe8230b..fe7fc3b5 100644 --- a/man/physiologies.Rd +++ b/man/physiologies.Rd @@ -12,8 +12,8 @@ Valid keyboards can be checked with \code{showPhys}. If 'all', all physiologies are imported.} \item{full_source}{Logical. If \code{TRUE}, the Attribute_source column will -contain full source information. If \code{FALSE}, the Attribute_source column will -contain shortened versions of the sources. Default is \code{FALSE}.} +contain full source information. If \code{FALSE}, the Attribute_source column +will contain shortened versions of the sources. Default is \code{FALSE}.} } \value{ A list of data.frames in tidy format. diff --git a/vignettes/bugphyzz.Rmd b/vignettes/bugphyzz.Rmd index 3faafd25..dbdd2073 100644 --- a/vignettes/bugphyzz.Rmd +++ b/vignettes/bugphyzz.Rmd @@ -1,6 +1,8 @@ --- title: "bugphyzz" -subtitle: "A harmonized data resource and software for enrichment analysis of microbial physiologies" +subtitle: + "A harmonized data resource and software for enrichment analysis of + microbial physiologies" author: - name: "Samuel Gamboa" email: "Samuel.Gamboa.Tuz@gmail.com" @@ -18,8 +20,8 @@ vignette: > ```{r, include = FALSE} knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" + collapse = TRUE, + comment = "#>" ) ``` @@ -28,7 +30,8 @@ knitr::opts_chunk$set( [bugphyzz](https://github.com/waldronlab/bugphyzzExports) is an electronic database of standardized microbial annotations that allows: 1. annotation of microbial taxa from multiple sources, and -2. creation of microbial signatures based on shared attributes for bug set enrichment analysis. +2. creation of microbial signatures based on shared attributes for bug set +enrichment analysis. # Data schema @@ -63,7 +66,8 @@ observed or measured. 5. _Attribute type_: A character string indicating the data type: * numeric: Attributes with numeric values (e.g., growth temperature: 25°C). * binary: Attributes with boolean values (e.g., butyrate-producing: TRUE). - * multistate-intersection: A set of related binary attributes (e.g., habitat). + * multistate-intersection: A set of related binary attributes + (e.g., habitat). * multistate-union: Attributes with three or more values represented as character strings (e.g., aerophilicity: aerobic, anaerobic, or facultatively anaerobic). @@ -86,7 +90,8 @@ Metadata associated with attribute values: 9. _Support values_: * Frequency and Score: Confidence that a given taxon exhibits a trait based on the curator’s knowledge or results of ASR or IBD. - * Validation: Score from the [10-fold cross-validation analysis](https://github.com/waldronlab/taxPProValidation). + * Validation: Score from the + [10-fold cross-validation analysis](https://github.com/waldronlab/taxPProValidation). Matthews correlation coefficient (MCC) for discrete attributes and R-squared for numeric attributes. Default threshold value is 0.5 and above. * NSTI: Nearest sequence taxon index from [PICRUSt](https://doi.org/10.1038/nbt.2676) @@ -103,13 +108,17 @@ three, two, or one of these criteria. **Additional information** -+ Description of **sources** and **attributes**: https://waldronlab.io/bugphyzz/articles/attributes.html ++ Description of **sources** and **attributes**: +https://waldronlab.io/bugphyzz/articles/attributes.html -+ Description of ontology **evidence** codes: https://geneontology.org/docs/guide-go-evidence-codes/ ++ Description of ontology **evidence** codes: +https://geneontology.org/docs/guide-go-evidence-codes/ -+ Description of **frequency** keywords and scores were based on: https://grammarist.com/grammar/adverbs-of-frequency/ ++ Description of **frequency** keywords and scores were based on: +https://grammarist.com/grammar/adverbs-of-frequency/ -+ IBD and ASR were performed with taxPPro: https://github.com/waldronlab/taxPPro ++ IBD and ASR were performed with taxPPro: +https://github.com/waldronlab/taxPPro # Analysis and Stats @@ -176,7 +185,7 @@ attribute (discrete): ```{r} aer_sigs_g <- makeSignatures( - dat = bp[["aerophilicity"]], tax_id_type = "Taxon_name", tax_level = "genus" + dat = bp[["aerophilicity"]], tax_id_type = "Taxon_name", tax_level = "genus" ) map(aer_sigs_g, head) ``` @@ -186,8 +195,8 @@ temperature attribute (numeric): ```{r} gt_sigs_sp <- makeSignatures( - dat = bp[["growth temperature"]], tax_id_type = "Taxon_name", - tax_level = 'species' + dat = bp[["growth temperature"]], tax_id_type = "Taxon_name", + tax_level = 'species' ) map(gt_sigs_sp, head) ``` @@ -197,8 +206,8 @@ attribute (numeric): ```{r} gt_sigs_mix <- makeSignatures( - dat = bp[["growth temperature"]], tax_id_type = "Taxon_name", - tax_level = "mixed", min = 0, max = 25 + dat = bp[["growth temperature"]], tax_id_type = "Taxon_name", + tax_level = "mixed", min = 0, max = 25 ) map(gt_sigs_mix, head) ``` @@ -207,8 +216,8 @@ map(gt_sigs_mix, head) ```{r} ap_sigs_mix <- makeSignatures( - dat = bp[["animal pathogen"]], tax_id_type = "NCBI_ID", - tax_level = "mixed", evidence = c("exp", "igc", "nas", "tas") + dat = bp[["animal pathogen"]], tax_id_type = "NCBI_ID", + tax_level = "mixed", evidence = c("exp", "igc", "nas", "tas") ) map(ap_sigs_mix, head) ``` @@ -217,7 +226,7 @@ map(ap_sigs_mix, head) ```{r} sigs <- map(bp, makeSignatures) |> - list_flatten(name_spec = "{inner}") + list_flatten(name_spec = "{inner}") length(sigs) ``` @@ -228,12 +237,14 @@ head(map(sigs, head)) # Run a bug set enrichment analysis Bugphyzz signatures are suitable for conducting bug set enrichment analysis -using existing tools available in R. In this example, we will perform a set enrichment analysis using a dataset +using existing tools available in R. In this example, we will perform a set +enrichment analysis using a dataset with a known biological ground truth. The dataset originates from the Human Microbiome Project (2012) and compares subgingival and supragingival plaque. -This data will be imported using the [MicrobiomeBenchmarkData package](https://bioconductor.org/packages/release/data/experiment/html/MicrobiomeBenchmarkData.html). +This data will be imported using the +[MicrobiomeBenchmarkData package](https://bioconductor.org/packages/release/data/experiment/html/MicrobiomeBenchmarkData.html). For the implementation of the enrichment analysis, we will utilize the Gene Set Enrichment Analysis (GSEA) method available in the [EnrichmentBrowser package](https://bioconductor.org/packages/release/bioc/html/EnrichmentBrowser.html). @@ -265,7 +276,7 @@ Perform differential abundance (DA) analysis to get sets of microbes: ```{r} tse_subset$GROUP <- ifelse( - tse_subset$body_subsite == 'subgingival_plaque', 0, 1 + tse_subset$body_subsite == 'subgingival_plaque', 0, 1 ) se <- EnrichmentBrowser::deAna( expr = tse_subset, de.method = 'limma', padj.method = 'fdr', @@ -275,7 +286,7 @@ se <- EnrichmentBrowser::deAna( dat <- data.frame(colData(se)) design <- stats::model.matrix(~ GROUP, data = dat) assay(se) <- limma::voom( - counts = assay(se), design = design, plot = FALSE + counts = assay(se), design = design, plot = FALSE )$E ``` @@ -283,15 +294,15 @@ Perform GSEA and display the results: ```{r, message=FALSE} gsea <- EnrichmentBrowser::sbea( - method = 'gsea', se = se, gs = aer_sigs_g, perm = 1000, - alpha = 0.1 + method = 'gsea', se = se, gs = aer_sigs_g, perm = 1000, + alpha = 0.1 ) gsea_tbl <- as.data.frame(gsea$res.tbl) |> - mutate( - GENE.SET = ifelse(PVAL < 0.05, paste0(GENE.SET, ' *'), GENE.SET), - PVAL = round(PVAL, 3), - ) |> - dplyr::rename(BUG.SET = GENE.SET) + mutate( + GENE.SET = ifelse(PVAL < 0.05, paste0(GENE.SET, ' *'), GENE.SET), + PVAL = round(PVAL, 3), + ) |> + dplyr::rename(BUG.SET = GENE.SET) knitr::kable(gsea_tbl) ```