Skip to content

Commit

Permalink
Fixes to SCTransform for new Assay requierments
Browse files Browse the repository at this point in the history
  • Loading branch information
mojaveazure committed Apr 11, 2019
1 parent a19ca58 commit 15823a5
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 141 deletions.
274 changes: 138 additions & 136 deletions R/preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -882,6 +882,95 @@ Read10X_h5 <- function(filename, use.names = TRUE, unique.features = TRUE) {
}
}

#' Normalize raw data to fractions
#'
#' Normalize count data to relative counts per cell by dividing by the total
#' per cell. Optionally use a scale factor, e.g. for counts per million (CPM)
#' use \code{scale.factor = 1e6}.
#'
#' @param data Matrix with the raw count data
#' @param scale.factor Scale the result. Default is 1
#' @param verbose Print progress
#' @param ... Ignored
#' @return Returns a matrix with the relative counts
#'
#' @import Matrix
#' @importFrom methods as
#'
#' @export
#'
#' @examples
#' mat <- matrix(data = rbinom(n = 25, size = 5, prob = 0.2), nrow = 5)
#' mat
#' mat_norm <- RelativeCounts(data = mat)
#' mat_norm
#'
RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE, ...) {
if (class(x = data) == "data.frame") {
data <- as.matrix(x = data)
}
if (class(x = data) != "dgCMatrix") {
data <- as(object = data, Class = "dgCMatrix")
}
if (verbose) {
cat("Performing relative-counts-normalization\n", file = stderr())
}
norm.data <- data
norm.data@x <- norm.data@x / rep.int(colSums(norm.data), diff(norm.data@p)) * scale.factor
return(norm.data)
}

#' Sample UMI
#'
#' Downsample each cell to a specified number of UMIs. Includes
#' an option to upsample cells below specified UMI as well.
#'
#' @param data Matrix with the raw count data
#' @param max.umi Number of UMIs to sample to
#' @param upsample Upsamples all cells with fewer than max.umi
#' @param verbose Display the progress bar
#'
#' @import Matrix
#' @importFrom methods as
#'
#' @return Matrix with downsampled data
#'
#' @export
#'
#' @examples
#' counts = as.matrix(x = GetAssayData(object = pbmc_small, assay = "RNA", slot = "counts"))
#' downsampled = SampleUMI(data = counts)
#' head(x = downsampled)
#'
SampleUMI <- function(
data,
max.umi = 1000,
upsample = FALSE,
verbose = FALSE
) {
data <- as(object = data, Class = "dgCMatrix")
if (length(x = max.umi) == 1) {
return(
RunUMISampling(
data = data,
sample_val = max.umi,
upsample = upsample,
display_progress = verbose
)
)
} else if (length(x = max.umi) != ncol(x = data)) {
stop("max.umi vector not equal to number of cells")
}
new_data = RunUMISamplingPerCell(
data = data,
sample_val = max.umi,
upsample = upsample,
display_progress = verbose
)
dimnames(new_data) <- dimnames(data)
return(new_data)
}

#' Use regularized negative binomial regression to normalize UMI count data
#'
#' This function calls sctransform::vst. The sctransform package is available at
Expand Down Expand Up @@ -931,7 +1020,7 @@ SCTransform <- function(
vars.to.regress = NULL,
do.scale = FALSE,
do.center = TRUE,
clip.range = c(-sqrt(ncol(object[[assay]])/30), sqrt(ncol(object[[assay]])/30)),
clip.range = c(-sqrt(x = ncol(x = object[[assay]]) / 30), sqrt(x = ncol(x = object[[assay]]) / 30)),
conserve.memory = FALSE,
return.only.var.genes = TRUE,
seed.use = 1448145,
Expand All @@ -948,87 +1037,97 @@ SCTransform <- function(
assay.obj <- GetAssay(object = object, assay = assay)
umi <- GetAssayData(object = assay.obj, slot = 'counts')
cell.attr <- slot(object = object, name = 'meta.data')

vst.args <- list(...)
# check for batch_var in meta data
if ('batch_var' %in% names(vst.args)) {
if (!(vst.args[['batch_var']] %in% colnames(cell.attr))) {
if ('batch_var' %in% names(x = vst.args)) {
if (!(vst.args[['batch_var']] %in% colnames(x = cell.attr))) {
stop('batch_var not found in seurat object meta data')
}
}

# check for latent_var in meta data
if ('latent_var' %in% names(vst.args)) {
if ('latent_var' %in% names(x = vst.args)) {
known.attr <- c('umi', 'gene', 'log_umi', 'log_gene', 'umi_per_gene', 'log_umi_per_gene')
if (!all(vst.args[['latent_var']] %in% c(colnames(cell.attr), known.attr))) {
stop('latent_var values are not from the set of cell attributes sctransform calculates
by default and cannot be found in seurat object meta data')
if (!all(vst.args[['latent_var']] %in% c(colnames(x = cell.attr), known.attr))) {
stop('latent_var values are not from the set of cell attributes sctransform calculates by default and cannot be found in seurat object meta data')
}
}

if (any(c('cell_attr', 'show_progress', 'return_cell_attr', 'return_gene_attr', 'return_corrected_umi') %in% names(vst.args))) {
warning(paste('the following arguments will be ignored because they are set within this function:',
paste(c('cell_attr', 'show_progress', 'return_cell_attr', 'return_gene_attr',
'return_corrected_umi'), collapse = ', ')))
if (any(c('cell_attr', 'show_progress', 'return_cell_attr', 'return_gene_attr', 'return_corrected_umi') %in% names(x = vst.args))) {
warning(
'the following arguments will be ignored because they are set within this function:',
paste(
c(
'cell_attr',
'show_progress',
'return_cell_attr',
'return_gene_attr',
'return_corrected_umi'
),
collapse = ', '
),
call. = FALSE,
immediate. = TRUE
)
}
vst.args[['umi']] <- umi
vst.args[['cell_attr']] <- cell.attr
vst.args[['show_progress']] <- verbose
vst.args[['return_cell_attr']] <- TRUE
vst.args[['return_gene_attr']] <- TRUE
vst.args[['return_corrected_umi']] <- do.correct.umi

residual.type <- vst.args[['residual_type']] %||% 'pearson'
res.clip.range <- vst.args[['res_clip_range']] %||% c(-sqrt(ncol(umi)), sqrt(ncol(umi)))

res.clip.range <- vst.args[['res_clip_range']] %||% c(-sqrt(x = ncol(x = umi)), sqrt(x = ncol(x = umi)))
if (conserve.memory) {
return.only.var.genes <- TRUE
}

if (conserve.memory) {
vst.args[['residual_type']] <- 'none'
vst.out <- do.call(sctransform::vst, vst.args)
feature.variance <- sctransform::get_residual_var(vst_out = vst.out,
umi = umi,
residual_type = residual.type,
res_clip_range = res.clip.range)
vst.out <- do.call(what = sctransform::vst, args = vst.args)
feature.variance <- sctransform::get_residual_var(
vst_out = vst.out,
umi = umi,
residual_type = residual.type,
res_clip_range = res.clip.range
)
vst.out$gene_attr$residual_variance <- NA_real_
vst.out$gene_attr[names(feature.variance), 'residual_variance'] <- feature.variance
vst.out$gene_attr[names(x = feature.variance), 'residual_variance'] <- feature.variance
} else {
vst.out <- do.call(sctransform::vst, vst.args)
vst.out <- do.call(what = sctransform::vst, args = vst.args)
feature.variance <- setNames(
object = vst.out$gene_attr$residual_variance,
nm = rownames(x = vst.out$gene_attr)
)
}

if (verbose) {
message('Determine variable features')
}
feature.variance <- sort(x = feature.variance, decreasing = TRUE)
if (!is.null(x = variable.features.n)) {
top.features <- names(x = feature.variance)[1:min(variable.features.n, length(feature.variance))]
top.features <- names(x = feature.variance)[1:min(variable.features.n, length(x = feature.variance))]
} else {
top.features <- names(x = feature.variance)[feature.variance >= variable.features.rv.th]
}
if (verbose) {
message('Set ', length(x = top.features), ' variable features')
}

if(conserve.memory) {
if (conserve.memory) {
# actually get the residuals this time
if (verbose) {
message("Return only variable features for scale.data slot of the output assay")
}
vst.out$y <- sctransform::get_residuals(vst_out = vst.out,
umi = umi[top.features, ],
residual_type = residual.type,
res_clip_range = res.clip.range)
vst.out$y <- sctransform::get_residuals(
vst_out = vst.out,
umi = umi[top.features, ],
residual_type = residual.type,
res_clip_range = res.clip.range
)
if (do.correct.umi & residual.type == 'pearson') {
vst.out$umi_corrected <- sctransform::correct_counts(vst.out, umi = umi, show_progress = verbose)
vst.out$umi_corrected <- sctransform::correct_counts(
x = vst.out,
umi = umi,
show_progress = verbose
)
}
}

# create output assay and put (corrected) umi counts in count slot
if (do.correct.umi & residual.type == 'pearson') {
if (verbose) {
Expand All @@ -1040,30 +1139,26 @@ SCTransform <- function(
}
# set the variable genes
VariableFeatures(object = assay.out) <- top.features

# put log1p transformed counts in data
assay.out <- SetAssayData(
object = assay.out,
slot = 'data',
new.data = log1p(GetAssayData(object = assay.out, slot = 'counts'))
new.data = log1p(x = GetAssayData(object = assay.out, slot = 'counts'))
)

if (return.only.var.genes & !conserve.memory) {
scale.data <- vst.out$y[top.features, ]
} else {
scale.data <- vst.out$y
}

# clip the residuals
scale.data[scale.data < clip.range[1]] <- clip.range[1]
scale.data[scale.data > clip.range[2]] <- clip.range[2]

# 2nd regression
scale.data <- ScaleData(
scale.data,
features = NULL,
vars.to.regress = vars.to.regress,
latent.data = cell.attr[, vars.to.regress, drop=FALSE],
latent.data = cell.attr[, vars.to.regress, drop = FALSE],
model.use = 'linear',
use.umi = FALSE,
do.scale = do.scale,
Expand All @@ -1073,20 +1168,16 @@ SCTransform <- function(
min.cells.to.block = 3000,
verbose = verbose
)

assay.out <- SetAssayData(
object = assay.out,
slot = 'scale.data',
new.data = scale.data
)

# save vst output (except y) in @misc slot
vst.out$y <- NULL
assay.out@misc <- list(vst.out = vst.out)
Misc(object = assay.out, slot = 'vst.out') <- vst.out
# also put gene attributes in meta.features
#assay.out[[names(x = vst.out$gene_attr)]] <- vst.out$gene_attr # use this when Paul has implemented automatic padding
assay.out@meta.features <- vst.out$gene_attr # hacky workaround to make vignettes compile

assay.out[[names(x = vst.out$gene_attr)]] <- vst.out$gene_attr
object[[new.assay.name]] <- assay.out
if (verbose) {
message(paste("Set default assay to", new.assay.name))
Expand All @@ -1096,95 +1187,6 @@ SCTransform <- function(
return(object)
}

#' Normalize raw data to fractions
#'
#' Normalize count data to relative counts per cell by dividing by the total
#' per cell. Optionally use a scale factor, e.g. for counts per million (CPM)
#' use \code{scale.factor = 1e6}.
#'
#' @param data Matrix with the raw count data
#' @param scale.factor Scale the result. Default is 1
#' @param verbose Print progress
#' @param ... Ignored
#' @return Returns a matrix with the relative counts
#'
#' @import Matrix
#' @importFrom methods as
#'
#' @export
#'
#' @examples
#' mat <- matrix(data = rbinom(n = 25, size = 5, prob = 0.2), nrow = 5)
#' mat
#' mat_norm <- RelativeCounts(data = mat)
#' mat_norm
#'
RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE, ...) {
if (class(x = data) == "data.frame") {
data <- as.matrix(x = data)
}
if (class(x = data) != "dgCMatrix") {
data <- as(object = data, Class = "dgCMatrix")
}
if (verbose) {
cat("Performing relative-counts-normalization\n", file = stderr())
}
norm.data <- data
norm.data@x <- norm.data@x / rep.int(colSums(norm.data), diff(norm.data@p)) * scale.factor
return(norm.data)
}

#' Sample UMI
#'
#' Downsample each cell to a specified number of UMIs. Includes
#' an option to upsample cells below specified UMI as well.
#'
#' @param data Matrix with the raw count data
#' @param max.umi Number of UMIs to sample to
#' @param upsample Upsamples all cells with fewer than max.umi
#' @param verbose Display the progress bar
#'
#' @import Matrix
#' @importFrom methods as
#'
#' @return Matrix with downsampled data
#'
#' @export
#'
#' @examples
#' counts = as.matrix(x = GetAssayData(object = pbmc_small, assay = "RNA", slot = "counts"))
#' downsampled = SampleUMI(data = counts)
#' head(x = downsampled)
#'
SampleUMI <- function(
data,
max.umi = 1000,
upsample = FALSE,
verbose = FALSE
) {
data <- as(object = data, Class = "dgCMatrix")
if (length(x = max.umi) == 1) {
return(
RunUMISampling(
data = data,
sample_val = max.umi,
upsample = upsample,
display_progress = verbose
)
)
} else if (length(x = max.umi) != ncol(x = data)) {
stop("max.umi vector not equal to number of cells")
}
new_data = RunUMISamplingPerCell(
data = data,
sample_val = max.umi,
upsample = upsample,
display_progress = verbose
)
dimnames(new_data) <- dimnames(data)
return(new_data)
}

#' Subset a Seurat Object based on the Barcode Distribution Inflection Points
#'
#' This convenience function subsets a Seurat object based on calculated inflection points.
Expand Down
Loading

0 comments on commit 15823a5

Please sign in to comment.