Merge remote-tracking branch 'wirawara/master'

Conflicts: README.md
BgeeDB · Oct 29, 2016 · c0e2dec · c0e2dec
2 parents b853855 + 3d54d69
commit c0e2dec
Show file tree

Hide file tree

Showing 32 changed files with 1,451 additions and 916 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,32 +1,22 @@
 Package: BgeeDB
 Type: Package
-Title: Annotation and gene expression data from Bgee database
-Version: 0.99.6
-Date: 2016-03-15
+Title: Annotation and gene expression data retrieval from Bgee database
+Version: 1.99.0
+Date: 2016-10-06
 Author: Andrea Komljenovic [aut, cre], Julien Roux [aut, cre]
 Maintainer: Andrea Komljenovic  <andreakomljenovic@gmail.com>, Frederic Bastian <bgee@sib.swiss>
 Description: A package for the annotation and gene expression
     data download from Bgee database, and TopAnat analysis: GO-like enrichment of anatomical terms,
     mapped to genes by expression patterns.
-Depends:
-    R (>= 3.0),
-    topGO,
-    tidyr
-Imports:
-    data.table,
-    RCurl,
-    methods,
-    stats,
-    utils,
-    dplyr,
-    graph
+Depends: R (>= 3.3.0), topGO, tidyr
+Imports: data.table, RCurl, digest, methods, stats, utils, dplyr,
+        graph, Biobase
 License: GPL-2
 VignetteBuilder: knitr
-biocViews: Software, DataImport, Sequencing, GeneExpression, Microarray, GO
-Suggests:
-    knitr,
-    BiocStyle,
-    testthat,
-    rmarkdown
+biocViews: Software, DataImport, Sequencing, GeneExpression,
+        Microarray, GO, GeneSetEnrichment
+Suggests: knitr, BiocStyle, testthat, rmarkdown
 LazyLoad: yes
 RoxygenNote: 5.0.1
+NeedsCompilation: no
+Packaged: 2016-10-11 14:05:07 UTC; akomljen
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,17 +1,23 @@
 # Generated by roxygen2: do not edit by hand
 
 export(Bgee)
+export(formatData)
+export(getAnnotation)
+export(getData)
+export(listBgeeRelease)
 export(listBgeeSpecies)
 export(loadTopAnatData)
 export(makeTable)
 export(topAnat)
 exportClasses(Bgee)
+import(Biobase)
 import(RCurl)
 import(data.table)
+import(digest)
 import(graph)
 import(methods)
 import(stats)
 import(topGO)
 import(utils)
 importFrom(dplyr,"%>%")
-importFrom(tidyr,spread)
+importFrom(tidyr,spread_)
diff --git a/R/Bgee.R b/R/Bgee.R
diff --git a/R/formatData.R b/R/formatData.R
@@ -0,0 +1,194 @@
+#' @title Format RNA-seq or Affymetrix data downloaded from Bgee.
+#'
+#' @description This function formats the data downloaded with the getData() function into an object of the Bioconductor "expressionSet" Class.
+#'
+#' @param myBgeeObject A Reference Class Bgee object, notably specifying the targeted species and data type.
+#'
+#' @param data A list of data frames including data from multiple experiments, or a data frame including data from a single experiment.
+#'
+#' @param stats A character indicating what expression values should be used in the formatted data expressionSet object matrix.
+#'  \itemize{
+#'    \item{"rpkm" for RNA-seq}
+#'    \item{"counts" for RNA-seq}
+#'    \item{"tpm" for RNA-seq (Bgee release 14 and above)}
+#'    \item{"intensities" for Affymetrix microarrays}
+#'  }
+#'
+#' @param callType A character indicating whether intensities should be displayed only for present (i.e., expressed) genes, present high quality genes, or all genes (default).
+#'  \itemize{
+#'    \item{"present"}
+#'    \item{"present high quality"}
+#'    \item{"all"}
+#'  }
+#'
+#' @return If data was a list of data frames from multiple experiments, returns a list of ExpressionSet objects. If data was a data frame from a single experiment, returns an ExpressionSet object.
+#'
+#' @author Andrea Komljenovic and Julien Roux.
+#'
+#' @examples{
+#'   bgee <- Bgee$new(species = "Mus_musculus", dataType = "rna_seq")
+#'   dataMouseGSE30617 <- getData(bgee, experimentId = "GSE30617")
+#'   dataMouseGSE30617.rpkm <- formatData(bgee, dataMouseGSE30617, callType = "present", stats = "rpkm")
+#' }
+#'
+#' @importFrom dplyr %>%
+#' @importFrom tidyr spread_
+#' @export
+
+formatData = function(myBgeeObject, data, stats = NULL, callType = "all"){
+
+  ## check that fields of Bgee object are not empty
+  if (length(myBgeeObject$quantitativeData) == 0 | length(myBgeeObject$dataType) == 0){
+    stop("ERROR: there seems to be a problem with the input Bgee class object, some fields are empty. Please check that the object is valid.")
+  }
+
+  ## Check that download of data is possible for targeted species and data type
+  if (myBgeeObject$quantitativeData != TRUE){
+    stop("ERROR: formatting quantitative data is not possible for the species and data type of the input Bgee class object.")
+  }
+
+  if (length(stats) == 0){
+    if (myBgeeObject$dataType == "affymetrix"){
+      cat("\nWARNING: stats parameter set to \"intensities\" for the next steps.\n")
+      stats <- "intensities"
+    } else if (myBgeeObject$dataType == "rna_seq"){
+      stop("Please specify the stats parameters. Should be set to \"rpkm\", \"counts\" or \"tpm\"")
+    }
+  } else if (myBgeeObject$dataType == "affymetrix" & stats != "intensities"){
+    cat("\nWARNING: For Affymetrix microarray data, stats parameter can only be set to \"intensities\". This will be used for the next steps.\n")
+    stats <- "intensities"
+  } else if (myBgeeObject$dataType == "rna_seq" & myBgeeObject$release == "13_2" & !(stats %in% c('rpkm', 'counts'))){
+    stop("Choose whether data formatting should create a matrix of RPKMs or read counts, with stats option set as \"rpkm\" or \"counts\"")
+  } else if (myBgeeObject$dataType == "rna_seq" & !(stats %in% c('rpkm', 'counts', 'tpm'))){
+    stop("Choose whether data formatting should create a matrix of RPKMs, TPMs or read counts, with stats option set as \"rpkm\", \"tpm\" or \"counts\"")
+  }
+
+  if(!(callType %in% c('present','present high quality','all'))){
+    stop("Choose between displaying intensities for present genes, present high quality genes or all genes, e.g., 'present', 'present high quality', 'all' ")
+  }
+
+  if(length(data) == 1) data[[1]] else data
+
+  cat("\nExtracting expression data matrix...\n")
+  if(stats  == "rpkm"){
+    columns <- c("Library.ID", "Gene.ID", "RPKM")
+    expr <- .extract.data(data, columns, callType)
+  } else if(stats  == "tpm"){
+    columns <- c("Library.ID", "Gene.ID", "TPM")
+    expr <- .extract.data(data, columns, callType)
+  } else if (stats == "counts"){
+    columns <- c("Library.ID", "Gene.ID", "Read.count")
+    expr <- .extract.data(data, columns, callType)
+  } else {
+    columns <- c("Chip.ID", "Probeset.ID", "Log.of.normalized.signal.intensity", "Gene.ID")
+    expr <- .extract.data(data, columns, callType)
+  }
+  ## one data matrix
+  if(is.data.frame(expr$assayData)){
+
+    ## sort objects to have samples in the same order
+    expr$assayData <- expr$assayData[, order(names(expr$assayData))]
+    expr$pheno <- expr$pheno[order(sampleNames(expr$pheno))]
+
+    ## create ExpressionSet object
+    eset <- new('ExpressionSet',
+                exprs=as.matrix(expr$assayData),
+                phenoData=expr$pheno,
+                featureData=expr$features)
+  } else if(is.list(expr$assayData)){
+    ## multiple data matrices
+    eset <- mapply(function(x,y,z){
+      ## sort objects to have samples in the same order
+      x <- x[, order(names(x))]
+      y <- y[order(sampleNames(y))]
+
+      ## create ExpressionSet object
+      new('ExpressionSet',
+          exprs=as.matrix(x),
+          phenoData=y,
+          featureData=z)
+    },
+    expr$assayData, expr$pheno, expr$features)
+  }
+  return(eset)
+}
+
+## Take in unformatted data downloaded from Bgee and outputs a list of expression matrices, phenotype annotations, feature annotations, and calls.
+.extract.data <- function(data, columns, callType){
+  ## if multiple data frames (multiple experiments or chips)
+  if(class(data) == "list"){
+    calls <- lapply(data, function(x) .calling(x, callType, columns[3]))
+    expr <- lapply(calls,
+                   function(x) {
+                     ## subset the data to keep relevant columns
+                     xt <- x[, columns[1:3]]
+                     ## from sample and feature columns, create a matrix with features as rows and samples as columns
+                     xtt <- xt %>% spread_(columns[1], columns[3])
+                     rownames(xtt) <- xtt[, columns[2]]
+                     ## Remove feature column to keep only data
+                     xtt[,-1, drop = FALSE]
+                   }
+    )
+    cat("\nExtracting features information...\n")
+    features <- mapply(.extract.data.feature, calls, expr, rep(list(columns), times=length(calls)))
+    cat("\nExtracting samples information...\n")
+    phenos <- mapply(.extract.data.pheno, calls, rep(list(columns[1]), times=length(calls)))
+  } else {
+    ## if only a single dataframe
+    calls <- .calling(data, callType, columns[3])
+    ## subset the data to keep relevant columns
+    xt <- calls[, columns[1:3]]
+    xtt <- xt %>% spread_(columns[1], columns[3])
+    rownames(xtt) <- xtt[, columns[2]]
+    ## Remove feature column to keep only data
+    expr <- xtt[,-1, drop = FALSE]
+
+    cat("\nExtracting features information...\n")
+    features <- .extract.data.feature( calls, expr, columns)
+    cat("\nExtracting samples information...\n")
+    phenos <- .extract.data.pheno( calls, columns[1])
+  }
+  return(list(assayData = expr, pheno = phenos, features = features, calls = calls))
+}
+
+# Extract feature data (probesets or genes)
+.extract.data.feature <- function(calls, expr, columns){
+  ## RNA-seq
+  if(length(columns) == 3){
+    fdata <- calls[match(rownames(expr), calls[, columns[2]]), columns[2], drop = FALSE]
+  }
+  ## Affymetrix, 4 columns
+  else if(length(columns) == 4){
+    fdata <- calls[match(rownames(expr), calls[, columns[2]]), columns[c(2,4)], drop = FALSE]
+  }
+  rownames(fdata) <- fdata[, columns[2]]
+  fdata <- as(fdata, "AnnotatedDataFrame")
+  return(fdata)
+}
+
+# Extract annotation of samples
+.extract.data.pheno <- function(calls, column){
+  phdata <- calls[, c(column, "Anatomical.entity.ID", "Anatomical.entity.name", "Stage.ID", "Stage.name")]
+  phdata <- phdata[!duplicated(phdata[, column]), ]
+  rownames(phdata) <- phdata[, column]
+  phdata <- as.data.frame(phdata)
+  metadata <- data.frame(labelDescription=colnames(phdata),
+                         row.names=colnames(phdata))
+  phenodata <- new("AnnotatedDataFrame",
+                   data=phdata,
+                   varMetadata=metadata
+  )
+  return(phenodata)
+}
+
+.calling <- function(x, callType, column){
+  ## check data type
+  if(callType == "present"){
+    cat("  Keeping only present genes.\n")
+    x[(x$Detection.flag == "absent"), column] <- NA
+  } else if (callType == "present high quality"){
+    cat("  Keeping only present high quality genes.\n")
+    x[which(x$Detection.flag == "absent" | x$Detection.quality == "poor quality"), column] <- NA
+  }
+  return(x)
+}
diff --git a/R/getAnnotation.R b/R/getAnnotation.R
@@ -0,0 +1,83 @@
+#' @title Retrieve Bgee experiments annotation for targeted species and data type.
+#'
+#' @description This function loads the annotation of experiments and samples of quantitative expression datasets (rna_seq, affymetrix) that are available from Bgee.
+#'
+#' @param myBgeeObject A Reference Class Bgee object, notably specifying the targeted species and data type.
+
+#' @return A list of two elements, including a data frame of the annotation of experiments for chosen species (field "experiment.annotation") and a data frame of the annotation of chips/libraries from these experiments (field "sample.annotation").
+#'
+#' @author Andrea Komljenovic and Julien Roux.
+#'
+#' @examples{
+#'   bgee <- Bgee$new(species = "Mus_musculus", dataType = "rna_seq")
+#'   myAnnotation <- getAnnotation(bgee)
+#' }
+#'
+#' @export
+
+getAnnotation = function(myBgeeObject){
+
+  ## check that the Bgee object is valid
+  if (length(myBgeeObject$quantitativeData) == 0 ){
+    stop("ERROR: there seems to be a problem with the input Bgee class object, some fields are empty. Please check that the object was correctly built.")
+  } else {
+    ## Check that download of quantitative data is possible for targeted species and data type
+    ## Try to output error message that helps a bit the user
+    if (myBgeeObject$quantitativeData != TRUE){
+      if (length(myBgeeObject$dataType) > 1){
+        stop("ERROR: downloading annotation files is only possible if a single data type (\"rna_seq\" or \"affymetrix\") is specified in the input Bgee class object.")
+      } else if (length(myBgeeObject$dataType) == 1 & (myBgeeObject$dataType == "rna_seq" | myBgeeObject$dataType == "affymetrix")){
+        stop("ERROR: downloading annotation files is not possible for the species and data type specified in the input Bgee class object. Maybe the specified data type is not available for the targeted species in Bgee? See listBgeeSpecies() for details on data types availability for each species.")
+      } else {
+        stop("ERROR: downloading annotation files is not possible for the species and data type specified in the input Bgee class object.")
+      }
+    } else if (length(myBgeeObject$annotationUrl) == 0 | length(myBgeeObject$dataType) == 0 | length(myBgeeObject$pathToData) == 0){
+      stop("ERROR: there seems to be a problem with the input Bgee class object, some fields are empty. Please check that the object was correctly built.")
+    }
+  }
+
+  ## Get name of annotation file from the URL field of Bgee object
+  annotationFile <- basename(myBgeeObject$annotationUrl)
+
+  ## To get the names of experiment and sample files, we start from the annotationFile
+  if (myBgeeObject$dataType == "affymetrix"){
+    annotationExperiments <- gsub("_chips", "", annotationFile)
+  } else if (myBgeeObject$dataType == "rna_seq"){
+    annotationExperiments <- gsub("_libraries", "", annotationFile)
+  }
+  annotationExperiments <- gsub("zip", "tsv", annotationExperiments)
+  annotationSamples     <- gsub("_experiments", "", annotationFile)
+  annotationSamples     <- gsub("zip", "tsv", annotationSamples)
+
+  ## Check if file is already in cache. If so, skip download step
+  if (file.exists(file.path(myBgeeObject$pathToData, annotationExperiments)) && file.exists(file.path(myBgeeObject$pathToData, annotationSamples))){
+    cat(paste0("\nNOTE: annotation files for this species were found in the download directory ", myBgeeObject$pathToData,
+               ". Data will not be redownloaded.\n"))
+  } else {
+    cat("\nDownloading annotation files...\n")
+    success <- download.file(myBgeeObject$annotationUrl,
+                             destfile=file.path(myBgeeObject$pathToData, annotationFile),
+                             mode='wb')
+    if (success != 0){
+      stop("ERROR: Download from FTP was not successful.")
+    }
+    unzip(file.path(myBgeeObject$pathToData, annotationFile), exdir=myBgeeObject$pathToData)
+    cat("\nSaved annotation files in", myBgeeObject$pathToData, "folder.\n")
+    ## Clean directory and remove .zip file
+    file.remove(file.path(myBgeeObject$pathToData, annotationFile))
+    ## Test if extracted files are OK
+    if (!(file.exists(file.path(myBgeeObject$pathToData, annotationExperiments)) & file.exists(file.path(myBgeeObject$pathToData, annotationSamples)))){
+      stop("ERROR: extraction of annotation files from downloaded zip file went wrong.")
+    }
+  }
+
+  ## Read the 2 annotation files
+  myAnnotation <- list(sample.annotation=as.data.frame(fread(file.path(myBgeeObject$pathToData, annotationSamples))),
+                       experiment.annotation=as.data.frame(fread(file.path(myBgeeObject$pathToData, annotationExperiments)))
+  )
+  ## remove spaces in headers
+  for (i in 1:length(myAnnotation)){
+    names(myAnnotation[[i]]) <- make.names(names(myAnnotation[[i]]))
+  }
+  return(myAnnotation)
+}