Skip to content

Commit

Permalink
Merge remote-tracking branch 'wirawara/master'
Browse files Browse the repository at this point in the history
Conflicts:
	README.md
  • Loading branch information
fbastian committed Oct 29, 2016
2 parents b853855 + 3d54d69 commit c0e2dec
Show file tree
Hide file tree
Showing 32 changed files with 1,451 additions and 916 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
32 changes: 11 additions & 21 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,32 +1,22 @@
Package: BgeeDB
Type: Package
Title: Annotation and gene expression data from Bgee database
Version: 0.99.6
Date: 2016-03-15
Title: Annotation and gene expression data retrieval from Bgee database
Version: 1.99.0
Date: 2016-10-06
Author: Andrea Komljenovic [aut, cre], Julien Roux [aut, cre]
Maintainer: Andrea Komljenovic <andreakomljenovic@gmail.com>, Frederic Bastian <bgee@sib.swiss>
Description: A package for the annotation and gene expression
data download from Bgee database, and TopAnat analysis: GO-like enrichment of anatomical terms,
mapped to genes by expression patterns.
Depends:
R (>= 3.0),
topGO,
tidyr
Imports:
data.table,
RCurl,
methods,
stats,
utils,
dplyr,
graph
Depends: R (>= 3.3.0), topGO, tidyr
Imports: data.table, RCurl, digest, methods, stats, utils, dplyr,
graph, Biobase
License: GPL-2
VignetteBuilder: knitr
biocViews: Software, DataImport, Sequencing, GeneExpression, Microarray, GO
Suggests:
knitr,
BiocStyle,
testthat,
rmarkdown
biocViews: Software, DataImport, Sequencing, GeneExpression,
Microarray, GO, GeneSetEnrichment
Suggests: knitr, BiocStyle, testthat, rmarkdown
LazyLoad: yes
RoxygenNote: 5.0.1
NeedsCompilation: no
Packaged: 2016-10-11 14:05:07 UTC; akomljen
8 changes: 7 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
# Generated by roxygen2: do not edit by hand

export(Bgee)
export(formatData)
export(getAnnotation)
export(getData)
export(listBgeeRelease)
export(listBgeeSpecies)
export(loadTopAnatData)
export(makeTable)
export(topAnat)
exportClasses(Bgee)
import(Biobase)
import(RCurl)
import(data.table)
import(digest)
import(graph)
import(methods)
import(stats)
import(topGO)
import(utils)
importFrom(dplyr,"%>%")
importFrom(tidyr,spread)
importFrom(tidyr,spread_)
492 changes: 199 additions & 293 deletions R/Bgee.R

Large diffs are not rendered by default.

194 changes: 194 additions & 0 deletions R/formatData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
#' @title Format RNA-seq or Affymetrix data downloaded from Bgee.
#'
#' @description This function formats the data downloaded with the getData() function into an object of the Bioconductor "expressionSet" Class.
#'
#' @param myBgeeObject A Reference Class Bgee object, notably specifying the targeted species and data type.
#'
#' @param data A list of data frames including data from multiple experiments, or a data frame including data from a single experiment.
#'
#' @param stats A character indicating what expression values should be used in the formatted data expressionSet object matrix.
#' \itemize{
#' \item{"rpkm" for RNA-seq}
#' \item{"counts" for RNA-seq}
#' \item{"tpm" for RNA-seq (Bgee release 14 and above)}
#' \item{"intensities" for Affymetrix microarrays}
#' }
#'
#' @param callType A character indicating whether intensities should be displayed only for present (i.e., expressed) genes, present high quality genes, or all genes (default).
#' \itemize{
#' \item{"present"}
#' \item{"present high quality"}
#' \item{"all"}
#' }
#'
#' @return If data was a list of data frames from multiple experiments, returns a list of ExpressionSet objects. If data was a data frame from a single experiment, returns an ExpressionSet object.
#'
#' @author Andrea Komljenovic and Julien Roux.
#'
#' @examples{
#' bgee <- Bgee$new(species = "Mus_musculus", dataType = "rna_seq")
#' dataMouseGSE30617 <- getData(bgee, experimentId = "GSE30617")
#' dataMouseGSE30617.rpkm <- formatData(bgee, dataMouseGSE30617, callType = "present", stats = "rpkm")
#' }
#'
#' @importFrom dplyr %>%
#' @importFrom tidyr spread_
#' @export

formatData = function(myBgeeObject, data, stats = NULL, callType = "all"){

## check that fields of Bgee object are not empty
if (length(myBgeeObject$quantitativeData) == 0 | length(myBgeeObject$dataType) == 0){
stop("ERROR: there seems to be a problem with the input Bgee class object, some fields are empty. Please check that the object is valid.")
}

## Check that download of data is possible for targeted species and data type
if (myBgeeObject$quantitativeData != TRUE){
stop("ERROR: formatting quantitative data is not possible for the species and data type of the input Bgee class object.")
}

if (length(stats) == 0){
if (myBgeeObject$dataType == "affymetrix"){
cat("\nWARNING: stats parameter set to \"intensities\" for the next steps.\n")
stats <- "intensities"
} else if (myBgeeObject$dataType == "rna_seq"){
stop("Please specify the stats parameters. Should be set to \"rpkm\", \"counts\" or \"tpm\"")
}
} else if (myBgeeObject$dataType == "affymetrix" & stats != "intensities"){
cat("\nWARNING: For Affymetrix microarray data, stats parameter can only be set to \"intensities\". This will be used for the next steps.\n")
stats <- "intensities"
} else if (myBgeeObject$dataType == "rna_seq" & myBgeeObject$release == "13_2" & !(stats %in% c('rpkm', 'counts'))){
stop("Choose whether data formatting should create a matrix of RPKMs or read counts, with stats option set as \"rpkm\" or \"counts\"")
} else if (myBgeeObject$dataType == "rna_seq" & !(stats %in% c('rpkm', 'counts', 'tpm'))){
stop("Choose whether data formatting should create a matrix of RPKMs, TPMs or read counts, with stats option set as \"rpkm\", \"tpm\" or \"counts\"")
}

if(!(callType %in% c('present','present high quality','all'))){
stop("Choose between displaying intensities for present genes, present high quality genes or all genes, e.g., 'present', 'present high quality', 'all' ")
}

if(length(data) == 1) data[[1]] else data

cat("\nExtracting expression data matrix...\n")
if(stats == "rpkm"){
columns <- c("Library.ID", "Gene.ID", "RPKM")
expr <- .extract.data(data, columns, callType)
} else if(stats == "tpm"){
columns <- c("Library.ID", "Gene.ID", "TPM")
expr <- .extract.data(data, columns, callType)
} else if (stats == "counts"){
columns <- c("Library.ID", "Gene.ID", "Read.count")
expr <- .extract.data(data, columns, callType)
} else {
columns <- c("Chip.ID", "Probeset.ID", "Log.of.normalized.signal.intensity", "Gene.ID")
expr <- .extract.data(data, columns, callType)
}
## one data matrix
if(is.data.frame(expr$assayData)){

## sort objects to have samples in the same order
expr$assayData <- expr$assayData[, order(names(expr$assayData))]
expr$pheno <- expr$pheno[order(sampleNames(expr$pheno))]

## create ExpressionSet object
eset <- new('ExpressionSet',
exprs=as.matrix(expr$assayData),
phenoData=expr$pheno,
featureData=expr$features)
} else if(is.list(expr$assayData)){
## multiple data matrices
eset <- mapply(function(x,y,z){
## sort objects to have samples in the same order
x <- x[, order(names(x))]
y <- y[order(sampleNames(y))]

## create ExpressionSet object
new('ExpressionSet',
exprs=as.matrix(x),
phenoData=y,
featureData=z)
},
expr$assayData, expr$pheno, expr$features)
}
return(eset)
}

## Take in unformatted data downloaded from Bgee and outputs a list of expression matrices, phenotype annotations, feature annotations, and calls.
.extract.data <- function(data, columns, callType){
## if multiple data frames (multiple experiments or chips)
if(class(data) == "list"){
calls <- lapply(data, function(x) .calling(x, callType, columns[3]))
expr <- lapply(calls,
function(x) {
## subset the data to keep relevant columns
xt <- x[, columns[1:3]]
## from sample and feature columns, create a matrix with features as rows and samples as columns
xtt <- xt %>% spread_(columns[1], columns[3])
rownames(xtt) <- xtt[, columns[2]]
## Remove feature column to keep only data
xtt[,-1, drop = FALSE]
}
)
cat("\nExtracting features information...\n")
features <- mapply(.extract.data.feature, calls, expr, rep(list(columns), times=length(calls)))
cat("\nExtracting samples information...\n")
phenos <- mapply(.extract.data.pheno, calls, rep(list(columns[1]), times=length(calls)))
} else {
## if only a single dataframe
calls <- .calling(data, callType, columns[3])
## subset the data to keep relevant columns
xt <- calls[, columns[1:3]]
xtt <- xt %>% spread_(columns[1], columns[3])
rownames(xtt) <- xtt[, columns[2]]
## Remove feature column to keep only data
expr <- xtt[,-1, drop = FALSE]

cat("\nExtracting features information...\n")
features <- .extract.data.feature( calls, expr, columns)
cat("\nExtracting samples information...\n")
phenos <- .extract.data.pheno( calls, columns[1])
}
return(list(assayData = expr, pheno = phenos, features = features, calls = calls))
}

# Extract feature data (probesets or genes)
.extract.data.feature <- function(calls, expr, columns){
## RNA-seq
if(length(columns) == 3){
fdata <- calls[match(rownames(expr), calls[, columns[2]]), columns[2], drop = FALSE]
}
## Affymetrix, 4 columns
else if(length(columns) == 4){
fdata <- calls[match(rownames(expr), calls[, columns[2]]), columns[c(2,4)], drop = FALSE]
}
rownames(fdata) <- fdata[, columns[2]]
fdata <- as(fdata, "AnnotatedDataFrame")
return(fdata)
}

# Extract annotation of samples
.extract.data.pheno <- function(calls, column){
phdata <- calls[, c(column, "Anatomical.entity.ID", "Anatomical.entity.name", "Stage.ID", "Stage.name")]
phdata <- phdata[!duplicated(phdata[, column]), ]
rownames(phdata) <- phdata[, column]
phdata <- as.data.frame(phdata)
metadata <- data.frame(labelDescription=colnames(phdata),
row.names=colnames(phdata))
phenodata <- new("AnnotatedDataFrame",
data=phdata,
varMetadata=metadata
)
return(phenodata)
}

.calling <- function(x, callType, column){
## check data type
if(callType == "present"){
cat(" Keeping only present genes.\n")
x[(x$Detection.flag == "absent"), column] <- NA
} else if (callType == "present high quality"){
cat(" Keeping only present high quality genes.\n")
x[which(x$Detection.flag == "absent" | x$Detection.quality == "poor quality"), column] <- NA
}
return(x)
}
83 changes: 83 additions & 0 deletions R/getAnnotation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#' @title Retrieve Bgee experiments annotation for targeted species and data type.
#'
#' @description This function loads the annotation of experiments and samples of quantitative expression datasets (rna_seq, affymetrix) that are available from Bgee.
#'
#' @param myBgeeObject A Reference Class Bgee object, notably specifying the targeted species and data type.

#' @return A list of two elements, including a data frame of the annotation of experiments for chosen species (field "experiment.annotation") and a data frame of the annotation of chips/libraries from these experiments (field "sample.annotation").
#'
#' @author Andrea Komljenovic and Julien Roux.
#'
#' @examples{
#' bgee <- Bgee$new(species = "Mus_musculus", dataType = "rna_seq")
#' myAnnotation <- getAnnotation(bgee)
#' }
#'
#' @export

getAnnotation = function(myBgeeObject){

## check that the Bgee object is valid
if (length(myBgeeObject$quantitativeData) == 0 ){
stop("ERROR: there seems to be a problem with the input Bgee class object, some fields are empty. Please check that the object was correctly built.")
} else {
## Check that download of quantitative data is possible for targeted species and data type
## Try to output error message that helps a bit the user
if (myBgeeObject$quantitativeData != TRUE){
if (length(myBgeeObject$dataType) > 1){
stop("ERROR: downloading annotation files is only possible if a single data type (\"rna_seq\" or \"affymetrix\") is specified in the input Bgee class object.")
} else if (length(myBgeeObject$dataType) == 1 & (myBgeeObject$dataType == "rna_seq" | myBgeeObject$dataType == "affymetrix")){
stop("ERROR: downloading annotation files is not possible for the species and data type specified in the input Bgee class object. Maybe the specified data type is not available for the targeted species in Bgee? See listBgeeSpecies() for details on data types availability for each species.")
} else {
stop("ERROR: downloading annotation files is not possible for the species and data type specified in the input Bgee class object.")
}
} else if (length(myBgeeObject$annotationUrl) == 0 | length(myBgeeObject$dataType) == 0 | length(myBgeeObject$pathToData) == 0){
stop("ERROR: there seems to be a problem with the input Bgee class object, some fields are empty. Please check that the object was correctly built.")
}
}

## Get name of annotation file from the URL field of Bgee object
annotationFile <- basename(myBgeeObject$annotationUrl)

## To get the names of experiment and sample files, we start from the annotationFile
if (myBgeeObject$dataType == "affymetrix"){
annotationExperiments <- gsub("_chips", "", annotationFile)
} else if (myBgeeObject$dataType == "rna_seq"){
annotationExperiments <- gsub("_libraries", "", annotationFile)
}
annotationExperiments <- gsub("zip", "tsv", annotationExperiments)
annotationSamples <- gsub("_experiments", "", annotationFile)
annotationSamples <- gsub("zip", "tsv", annotationSamples)

## Check if file is already in cache. If so, skip download step
if (file.exists(file.path(myBgeeObject$pathToData, annotationExperiments)) && file.exists(file.path(myBgeeObject$pathToData, annotationSamples))){
cat(paste0("\nNOTE: annotation files for this species were found in the download directory ", myBgeeObject$pathToData,
". Data will not be redownloaded.\n"))
} else {
cat("\nDownloading annotation files...\n")
success <- download.file(myBgeeObject$annotationUrl,
destfile=file.path(myBgeeObject$pathToData, annotationFile),
mode='wb')
if (success != 0){
stop("ERROR: Download from FTP was not successful.")
}
unzip(file.path(myBgeeObject$pathToData, annotationFile), exdir=myBgeeObject$pathToData)
cat("\nSaved annotation files in", myBgeeObject$pathToData, "folder.\n")
## Clean directory and remove .zip file
file.remove(file.path(myBgeeObject$pathToData, annotationFile))
## Test if extracted files are OK
if (!(file.exists(file.path(myBgeeObject$pathToData, annotationExperiments)) & file.exists(file.path(myBgeeObject$pathToData, annotationSamples)))){
stop("ERROR: extraction of annotation files from downloaded zip file went wrong.")
}
}

## Read the 2 annotation files
myAnnotation <- list(sample.annotation=as.data.frame(fread(file.path(myBgeeObject$pathToData, annotationSamples))),
experiment.annotation=as.data.frame(fread(file.path(myBgeeObject$pathToData, annotationExperiments)))
)
## remove spaces in headers
for (i in 1:length(myAnnotation)){
names(myAnnotation[[i]]) <- make.names(names(myAnnotation[[i]]))
}
return(myAnnotation)
}
Loading

0 comments on commit c0e2dec

Please sign in to comment.