Question

batch correction: cds from seurat

0

Entering edit mode

20 days ago

sooni ▴ 20

Hello.

I want to integrate the 7 Seurat objects and then convert them to cds objects to perform batch correction. First of all, this is the code I ran before changing to cds.

donor1 <- Read10X_h5("GSM5723843_Donor1_raw_feature_bc_matrix.h5")
donor2 <- Read10X_h5("GSM5723844_Donor2_raw_feature_bc_matrix.h5")
donor3 <- Read10X_h5("GSM5723845_Donor3_raw_feature_bc_matrix.h5")
donor4 <- Read10X_h5("GSM5723846_Donor4_raw_feature_bc_matrix.h5")
donor5 <- Read10X_h5("GSM5723847_Donor5_raw_feature_bc_matrix.h5")
donor6 <- Read10X_h5("GSM5723848_Donor6_raw_feature_bc_matrix.h5")
donor7 <- Read10X_h5("GSM5723849_Donor7_raw_feature_bc_matrix.h5")

donor1 <- CreateSeuratObject(donor1, project = "Donor1", min.cells = 3,
                             min.features = 200)
donor2 <- CreateSeuratObject(donor2, project = "Donor2", min.cells = 3,
                             min.features = 200)
donor3 <- CreateSeuratObject(donor3, project = "Donor3", min.cells = 3,
                             min.features = 200)
donor4 <- CreateSeuratObject(donor4, project = "Donor4", min.cells = 3,
                             min.features = 200)
donor5 <- CreateSeuratObject(donor5, project = "Donor5", min.cells = 3,
                             min.features = 200)
donor6 <- CreateSeuratObject(donor6, project = "Donor6", min.cells = 3,
                             min.features = 200)
donor7 <- CreateSeuratObject(donor7, project = "Donor7", min.cells = 3,
                             min.features = 200)

donor1 <- PercentageFeatureSet(donor1, pattern = "^MT-", col.name = "percent.mt")
donor2 <- PercentageFeatureSet(donor2, pattern = "^MT-", col.name = "percent.mt")
donor3 <- PercentageFeatureSet(donor3, pattern = "^MT-", col.name = "percent.mt")
donor4 <- PercentageFeatureSet(donor4, pattern = "^MT-", col.name = "percent.mt")
donor5 <- PercentageFeatureSet(donor5, pattern = "^MT-", col.name = "percent.mt")
donor6 <- PercentageFeatureSet(donor6, pattern = "^MT-", col.name = "percent.mt")
donor7 <- PercentageFeatureSet(donor7, pattern = "^MT-", col.name = "percent.mt")

donors <- merge(donor1, y = c(donor2, donor3, donor4,
                              donor5, donor6, donor7))

VlnPlot(donors, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 3, pt.size = 0)

donors <- subset(donors, nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

donors.list <- SplitObject(donors, split.by = "orig.ident")

donors.list <- lapply(X = donors.list, FUN = function(x) {
  x <- NormalizeData(x)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

features <- SelectIntegrationFeatures(object.list = donors.list)
donors.anchors <- FindIntegrationAnchors(object.list = donors.list, 
                                         anchor.features = features)

saydonors.combined <- IntegrateData(anchorset = donors.anchors)

DefaultAssay(donors.combined) <- "integrated"

donors.combined <- ScaleData(donors.combined, verbose = FALSE)
donors.combined <- RunPCA(donors.combined, npcs = 30, verbose = FALSE)
donors.combined <- RunUMAP(donors.combined, reduction = "pca", dims = 1:30)
donors.combined <- FindNeighbors(donors.combined, reduction = "pca", dims = 1:30)

#### another way to convert seurat to cds ####
my.so <- donors.combined

# Project PC dimensions to whole data set
my.so <- ProjectDim(my.so, reduction = "pca")

expression_matrix <- my.so@assays$RNA@counts

cell_metadata <- my.so@meta.data
if (all.equal(colnames(expression_matrix), rownames(cell_metadata))) {
  print(sprintf("Cell identifiers match"))
} else {
  print(sprintf("Cell identifier mismatch - %i cells in expression matrix, %i cells in metadata",
                ncol(expression_matrix), nrow(cell_metadata)))
  print("If the counts are equal, sort differences will throw this error")
}

gene_annotation <- data.frame(gene_short_name = rownames(my.so@assays$RNA), 
                              row.names = rownames(my.so@assays$RNA))
if (all.equal(rownames(expression_matrix), rownames(gene_annotation))) {
  print(sprintf("Gene identifiers all match"))
} else {
  print(sprintf("Gene identifier mismatch - %i genes in expression matrix, %i gene in gene annotation",
                nrow(expression_matrix), nrow(gene_annotation)))
  print("If the counts are equal, sort differences will throw this error")
}

cell_metadata$Samples <- rep(NA)
cell_metadata <- cell_metadata %>%
  mutate(Samples = case_when(
    grepl("Donor1", orig.ident) ~ "Donor1",
    grepl("Donor2", orig.ident) ~ "Donor2",
    grepl("Donor3", orig.ident) ~ "Donor3",
    grepl("Donor4", orig.ident) ~ "Donor4",
    grepl("Donor5", orig.ident) ~ "Donor5",
    grepl("Donor6", orig.ident) ~ "Donor6",
    grepl("Donor7", orig.ident) ~ "Donor7",
    TRUE ~ NA_character_
  ))

my.cds <- new_cell_data_set(expression_matrix,
                            cell_metadata = cell_metadata,
                            gene_metadata = gene_annotation)

I would like to perform batch correction on the cds object created this way. The code that I batch corrected in the cds object is as follows.

my.cds <- preprocess_cds(my.cds, num_dim = 100)
reduce_my.cds <- reduce_dimension(my.cds)
plot_cells(reduce_my.cds, color_cells_by = "Samples", label_cell_groups = F,
           group_label_size = 5)

batch_my.cds <- align_cds(reduce_my.cds, num_dim = 100, alignment_group = "Samples")
batch_my.cds <- reduce_dimension(batch_my.cds)
plot_cells(batch_my.cds, color_cells_by = "Samples", label_cell_groups = F)

When UMAP is drawn after performing batch correction with the code above, the UMAP is drawn poorly. Each cluster is held together like a single lump.

enter image description here

Is my code wrong? If there is a site with useful code, please let me know.

Thank you for help!

batch-correction R seurat monocle3 cds • 261 views

ADD COMMENT • link updated 20 days ago by Ram 43k • written 20 days ago by sooni ▴ 20