Fixes to SCTransform for new Assay requierments

ws2260 · Apr 11, 2019 · 15823a5 · 15823a5
1 parent a19ca58
commit 15823a5
Show file tree

Hide file tree

Showing 2 changed files with 142 additions and 141 deletions.
diff --git a/R/preprocessing.R b/R/preprocessing.R
@@ -882,6 +882,95 @@ Read10X_h5 <- function(filename, use.names = TRUE, unique.features = TRUE) {
   }
 }
 
+#' Normalize raw data to fractions
+#'
+#' Normalize count data to relative counts per cell by dividing by the total
+#' per cell. Optionally use a scale factor, e.g. for counts per million (CPM)
+#' use \code{scale.factor = 1e6}.
+#'
+#' @param data Matrix with the raw count data
+#' @param scale.factor Scale the result. Default is 1
+#' @param verbose Print progress
+#' @param ... Ignored
+#' @return Returns a matrix with the relative counts
+#'
+#' @import Matrix
+#' @importFrom methods as
+#'
+#' @export
+#'
+#' @examples
+#' mat <- matrix(data = rbinom(n = 25, size = 5, prob = 0.2), nrow = 5)
+#' mat
+#' mat_norm <- RelativeCounts(data = mat)
+#' mat_norm
+#'
+RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE, ...) {
+  if (class(x = data) == "data.frame") {
+    data <- as.matrix(x = data)
+  }
+  if (class(x = data) != "dgCMatrix") {
+    data <- as(object = data, Class = "dgCMatrix")
+  }
+  if (verbose) {
+    cat("Performing relative-counts-normalization\n", file = stderr())
+  }
+  norm.data <- data
+  norm.data@x <- norm.data@x / rep.int(colSums(norm.data), diff(norm.data@p)) * scale.factor
+  return(norm.data)
+}
+
+#' Sample UMI
+#'
+#' Downsample each cell to a specified number of UMIs. Includes
+#' an option to upsample cells below specified UMI as well.
+#'
+#' @param data Matrix with the raw count data
+#' @param max.umi Number of UMIs to sample to
+#' @param upsample Upsamples all cells with fewer than max.umi
+#' @param verbose Display the progress bar
+#'
+#' @import Matrix
+#' @importFrom methods as
+#'
+#' @return Matrix with downsampled data
+#'
+#' @export
+#'
+#' @examples
+#' counts = as.matrix(x = GetAssayData(object = pbmc_small, assay = "RNA", slot = "counts"))
+#' downsampled = SampleUMI(data = counts)
+#' head(x = downsampled)
+#'
+SampleUMI <- function(
+  data,
+  max.umi = 1000,
+  upsample = FALSE,
+  verbose = FALSE
+) {
+  data <- as(object = data, Class = "dgCMatrix")
+  if (length(x = max.umi) == 1) {
+    return(
+      RunUMISampling(
+        data = data,
+        sample_val = max.umi,
+        upsample = upsample,
+        display_progress = verbose
+      )
+    )
+  } else if (length(x = max.umi) != ncol(x = data)) {
+    stop("max.umi vector not equal to number of cells")
+  }
+  new_data = RunUMISamplingPerCell(
+    data = data,
+    sample_val = max.umi,
+    upsample = upsample,
+    display_progress = verbose
+  )
+  dimnames(new_data) <- dimnames(data)
+  return(new_data)
+}
+
 #' Use regularized negative binomial regression to normalize UMI count data
 #'
 #' This function calls sctransform::vst. The sctransform package is available at
@@ -931,7 +1020,7 @@ SCTransform <- function(
   vars.to.regress = NULL,
   do.scale = FALSE,
   do.center = TRUE,
-  clip.range = c(-sqrt(ncol(object[[assay]])/30), sqrt(ncol(object[[assay]])/30)),
+  clip.range = c(-sqrt(x = ncol(x = object[[assay]]) / 30), sqrt(x = ncol(x = object[[assay]]) / 30)),
   conserve.memory = FALSE,
   return.only.var.genes = TRUE,
   seed.use = 1448145,
@@ -948,87 +1037,97 @@ SCTransform <- function(
   assay.obj <- GetAssay(object = object, assay = assay)
   umi <- GetAssayData(object = assay.obj, slot = 'counts')
   cell.attr <- slot(object = object, name = 'meta.data')
-
   vst.args <- list(...)
   # check for batch_var in meta data
-  if ('batch_var' %in% names(vst.args)) {
-    if (!(vst.args[['batch_var']] %in% colnames(cell.attr))) {
+  if ('batch_var' %in% names(x = vst.args)) {
+    if (!(vst.args[['batch_var']] %in% colnames(x = cell.attr))) {
       stop('batch_var not found in seurat object meta data')
     }
   }
-
   # check for latent_var in meta data
-  if ('latent_var' %in% names(vst.args)) {
+  if ('latent_var' %in% names(x = vst.args)) {
     known.attr <- c('umi', 'gene', 'log_umi', 'log_gene', 'umi_per_gene', 'log_umi_per_gene')
-    if (!all(vst.args[['latent_var']] %in% c(colnames(cell.attr), known.attr))) {
-      stop('latent_var values are not from the set of cell attributes sctransform calculates
-           by default and cannot be found in seurat object meta data')
+    if (!all(vst.args[['latent_var']] %in% c(colnames(x = cell.attr), known.attr))) {
+      stop('latent_var values are not from the set of cell attributes sctransform calculates by default and cannot be found in seurat object meta data')
     }
   }
-
-  if (any(c('cell_attr', 'show_progress', 'return_cell_attr', 'return_gene_attr', 'return_corrected_umi') %in% names(vst.args))) {
-    warning(paste('the following arguments will be ignored because they are set within this function:',
-                  paste(c('cell_attr', 'show_progress', 'return_cell_attr', 'return_gene_attr',
-                          'return_corrected_umi'), collapse = ', ')))
+  if (any(c('cell_attr', 'show_progress', 'return_cell_attr', 'return_gene_attr', 'return_corrected_umi') %in% names(x = vst.args))) {
+    warning(
+      'the following arguments will be ignored because they are set within this function:',
+      paste(
+        c(
+          'cell_attr',
+          'show_progress',
+          'return_cell_attr',
+          'return_gene_attr',
+          'return_corrected_umi'
+        ),
+        collapse = ', '
+      ),
+      call. = FALSE,
+      immediate. = TRUE
+    )
   }
   vst.args[['umi']] <- umi
   vst.args[['cell_attr']] <- cell.attr
   vst.args[['show_progress']] <- verbose
   vst.args[['return_cell_attr']] <- TRUE
   vst.args[['return_gene_attr']] <- TRUE
   vst.args[['return_corrected_umi']] <- do.correct.umi
-
   residual.type <- vst.args[['residual_type']] %||% 'pearson'
-  res.clip.range <- vst.args[['res_clip_range']] %||% c(-sqrt(ncol(umi)), sqrt(ncol(umi)))
-
+  res.clip.range <- vst.args[['res_clip_range']] %||% c(-sqrt(x = ncol(x = umi)), sqrt(x = ncol(x = umi)))
   if (conserve.memory) {
     return.only.var.genes <- TRUE
   }
-
   if (conserve.memory) {
     vst.args[['residual_type']] <- 'none'
-    vst.out <- do.call(sctransform::vst, vst.args)
-    feature.variance <- sctransform::get_residual_var(vst_out = vst.out,
-                                                      umi = umi,
-                                                      residual_type = residual.type,
-                                                      res_clip_range = res.clip.range)
+    vst.out <- do.call(what = sctransform::vst, args = vst.args)
+    feature.variance <- sctransform::get_residual_var(
+      vst_out = vst.out,
+      umi = umi,
+      residual_type = residual.type,
+      res_clip_range = res.clip.range
+    )
     vst.out$gene_attr$residual_variance <- NA_real_
-    vst.out$gene_attr[names(feature.variance), 'residual_variance'] <- feature.variance
+    vst.out$gene_attr[names(x = feature.variance), 'residual_variance'] <- feature.variance
   } else {
-    vst.out <- do.call(sctransform::vst, vst.args)
+    vst.out <- do.call(what = sctransform::vst, args = vst.args)
     feature.variance <- setNames(
       object = vst.out$gene_attr$residual_variance,
       nm = rownames(x = vst.out$gene_attr)
     )
   }
-
   if (verbose) {
     message('Determine variable features')
   }
   feature.variance <- sort(x = feature.variance, decreasing = TRUE)
   if (!is.null(x = variable.features.n)) {
-    top.features <- names(x = feature.variance)[1:min(variable.features.n, length(feature.variance))]
+    top.features <- names(x = feature.variance)[1:min(variable.features.n, length(x = feature.variance))]
   } else {
     top.features <- names(x = feature.variance)[feature.variance >= variable.features.rv.th]
   }
   if (verbose) {
     message('Set ', length(x = top.features), ' variable features')
   }
-
-  if(conserve.memory) {
+  if (conserve.memory) {
     # actually get the residuals this time
     if (verbose) {
       message("Return only variable features for scale.data slot of the output assay")
     }
-    vst.out$y <- sctransform::get_residuals(vst_out = vst.out,
-                                            umi = umi[top.features, ],
-                                            residual_type = residual.type,
-                                            res_clip_range = res.clip.range)
+    vst.out$y <- sctransform::get_residuals(
+      vst_out = vst.out,
+      umi = umi[top.features, ],
+      residual_type = residual.type,
+      res_clip_range = res.clip.range
+    )
     if (do.correct.umi & residual.type == 'pearson') {
-      vst.out$umi_corrected <- sctransform::correct_counts(vst.out, umi = umi, show_progress = verbose)
+      vst.out$umi_corrected <- sctransform::correct_counts(
+        x = vst.out,
+        umi = umi,
+        show_progress = verbose
+      )
     }
   }
-
   # create output assay and put (corrected) umi counts in count slot
   if (do.correct.umi & residual.type == 'pearson') {
     if (verbose) {
@@ -1040,30 +1139,26 @@ SCTransform <- function(
   }
   # set the variable genes
   VariableFeatures(object = assay.out) <- top.features
-
   # put log1p transformed counts in data
   assay.out <- SetAssayData(
     object = assay.out,
     slot = 'data',
-    new.data = log1p(GetAssayData(object = assay.out, slot = 'counts'))
+    new.data = log1p(x = GetAssayData(object = assay.out, slot = 'counts'))
   )
-
   if (return.only.var.genes & !conserve.memory) {
     scale.data <- vst.out$y[top.features, ]
   } else {
     scale.data <- vst.out$y
   }
-
   # clip the residuals
   scale.data[scale.data < clip.range[1]] <- clip.range[1]
   scale.data[scale.data > clip.range[2]] <- clip.range[2]
-
   # 2nd regression
   scale.data <- ScaleData(
     scale.data,
     features = NULL,
     vars.to.regress = vars.to.regress,
-    latent.data = cell.attr[, vars.to.regress, drop=FALSE],
+    latent.data = cell.attr[, vars.to.regress, drop = FALSE],
     model.use = 'linear',
     use.umi = FALSE,
     do.scale = do.scale,
@@ -1073,20 +1168,16 @@ SCTransform <- function(
     min.cells.to.block = 3000,
     verbose = verbose
   )
-
   assay.out <- SetAssayData(
     object = assay.out,
     slot = 'scale.data',
     new.data = scale.data
   )
-
   # save vst output (except y) in @misc slot
   vst.out$y <- NULL
-  assay.out@misc <- list(vst.out = vst.out)
+  Misc(object = assay.out, slot = 'vst.out') <- vst.out
   # also put gene attributes in meta.features
-  #assay.out[[names(x = vst.out$gene_attr)]] <- vst.out$gene_attr  # use this when Paul has implemented automatic padding
-  assay.out@meta.features <- vst.out$gene_attr  # hacky workaround to make vignettes compile
-
+  assay.out[[names(x = vst.out$gene_attr)]] <- vst.out$gene_attr
   object[[new.assay.name]] <- assay.out
   if (verbose) {
     message(paste("Set default assay to", new.assay.name))
@@ -1096,95 +1187,6 @@ SCTransform <- function(
   return(object)
 }
 
-#' Normalize raw data to fractions
-#'
-#' Normalize count data to relative counts per cell by dividing by the total
-#' per cell. Optionally use a scale factor, e.g. for counts per million (CPM)
-#' use \code{scale.factor = 1e6}.
-#'
-#' @param data Matrix with the raw count data
-#' @param scale.factor Scale the result. Default is 1
-#' @param verbose Print progress
-#' @param ... Ignored
-#' @return Returns a matrix with the relative counts
-#'
-#' @import Matrix
-#' @importFrom methods as
-#'
-#' @export
-#'
-#' @examples
-#' mat <- matrix(data = rbinom(n = 25, size = 5, prob = 0.2), nrow = 5)
-#' mat
-#' mat_norm <- RelativeCounts(data = mat)
-#' mat_norm
-#'
-RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE, ...) {
-  if (class(x = data) == "data.frame") {
-    data <- as.matrix(x = data)
-  }
-  if (class(x = data) != "dgCMatrix") {
-    data <- as(object = data, Class = "dgCMatrix")
-  }
-  if (verbose) {
-    cat("Performing relative-counts-normalization\n", file = stderr())
-  }
-  norm.data <- data
-  norm.data@x <- norm.data@x / rep.int(colSums(norm.data), diff(norm.data@p)) * scale.factor
-  return(norm.data)
-}
-
-#' Sample UMI
-#'
-#' Downsample each cell to a specified number of UMIs. Includes
-#' an option to upsample cells below specified UMI as well.
-#'
-#' @param data Matrix with the raw count data
-#' @param max.umi Number of UMIs to sample to
-#' @param upsample Upsamples all cells with fewer than max.umi
-#' @param verbose Display the progress bar
-#'
-#' @import Matrix
-#' @importFrom methods as
-#'
-#' @return Matrix with downsampled data
-#'
-#' @export
-#'
-#' @examples
-#' counts = as.matrix(x = GetAssayData(object = pbmc_small, assay = "RNA", slot = "counts"))
-#' downsampled = SampleUMI(data = counts)
-#' head(x = downsampled)
-#'
-SampleUMI <- function(
-  data,
-  max.umi = 1000,
-  upsample = FALSE,
-  verbose = FALSE
-) {
-  data <- as(object = data, Class = "dgCMatrix")
-  if (length(x = max.umi) == 1) {
-    return(
-      RunUMISampling(
-        data = data,
-        sample_val = max.umi,
-        upsample = upsample,
-        display_progress = verbose
-      )
-    )
-  } else if (length(x = max.umi) != ncol(x = data)) {
-    stop("max.umi vector not equal to number of cells")
-  }
-  new_data = RunUMISamplingPerCell(
-    data = data,
-    sample_val = max.umi,
-    upsample = upsample,
-    display_progress = verbose
-  )
-  dimnames(new_data) <- dimnames(data)
-  return(new_data)
-}
-
 #' Subset a Seurat Object based on the Barcode Distribution Inflection Points
 #'
 #' This convenience function subsets a Seurat object based on calculated inflection points.