-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathpreprocess.R
More file actions
176 lines (158 loc) · 6.78 KB
/
preprocess.R
File metadata and controls
176 lines (158 loc) · 6.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#' Pre-processing function for sex classification
#'
#' The purpose of this function is to process a single cell counts matrix into
#' the appropriate format for the \code{classifySex} function.
#'
#' This function will filter out cells that are unable to be classified due to
#' zero counts on all of the X-inactivation escape genes (which include
#' *XIST/Xist*) and all of the Y chromosome genes. If \code{qc=TRUE},
#' additional cells are removed as identified by the \code{perCellQCMetrics}
#' and \code{perCellQCFilters} functions from the \code{scuttle} package. The
#' resulting counts matrix is then log-normalised and scaled.
#'
#' @param x the counts matrix, rows are genes and columns are cells. Row names
#' must be gene symbols.
#' @param genome the genome the data arises from. Current options are
#' human: genome = "Hs" or mouse: genome = "Mm".
#' @param qc logical, indicates whether to perform additional quality control
#' on the cells. qc = TRUE will predict cells that pass quality control only
#' and the filtered cells will not be classified. qc = FALSE will predict
#' every cell except the cells with zero counts on both the sum of the X
#' genes (including *XIST/Xist*) and the sum of the Y genes. Default is TRUE.
#'
#' @return outputs a list object with the following components
#' \item{tcm.final }{A transposed count matrix where rows are cells and columns
#' are the features used for classification.}
#' \item{data.df }{The normalised and scaled \code{tcm.final} matrix.}
#' \item{discarded.cells }{Character vector of cell IDs for the cells that are
#' discarded when \code{qc=TRUE}; \code{NA} when \code{qc=FALSE}.}
#' \item{zero.cells }{Character vector of cell IDs for the cells that can not
#' be classified as male/female due to zero counts on all of the X and Y
#' chromosome genes used for classification; \code{NA} if there are no such
#' cells.}
#'
#' @importFrom AnnotationDbi select
#' @importFrom stringr str_to_title
#' @importFrom scuttle perCellQCMetrics
#' @importFrom scuttle quickPerCellQC
#' @importFrom org.Hs.eg.db org.Hs.eg.db
#' @importFrom org.Mm.eg.db org.Mm.eg.db
#' @export preprocess
#'
#' @examples
#'
#' library(speckle)
#' library(SingleCellExperiment)
#' library(CellBench)
#' library(org.Hs.eg.db)
#'
#' # Get data from CellBench library
#' sc_data <- load_sc_data()
#' sc_10x <- sc_data$sc_10x
#'
#' # Get counts matrix in correct format with gene symbol as rownames
#' # rather than ENSEMBL ID.
#' counts <- counts(sc_10x)
#' ann <- select(org.Hs.eg.db, keys=rownames(sc_10x),
#' columns=c("ENSEMBL","SYMBOL"), keytype="ENSEMBL")
#' m <- match(rownames(counts), ann$ENSEMBL)
#' rownames(counts) <- ann$SYMBOL[m]
#'
#' # Preprocess data
#' pro.data <- preprocess(counts, genome="Hs", qc = TRUE)
#'
#' # Look at counts on XIST and superY
#' plot(pro.data$tcm.final$XIST, pro.data$tcm.final$superY)
#'
#' # Cells that are identified as low quality
#' pro.data$discarded.cells
#'
#' # Cells with zero counts on XIST and all Y genes
#' pro.data$zero.cells
#'
preprocess <- function(x, genome = "Hs", qc = TRUE) {
    # Overview: collapse the counts matrix into three per-cell features
    # (XIST, superX, superY), optionally drop low-quality cells, drop cells
    # with no X/Y signal at all, then log-normalise what is left.
    #
    # Arguments:
    #   x      counts matrix, genes in rows (gene symbols as row names) and
    #          cells in columns.
    #   genome kept for interface compatibility; it is not read below because
    #          gene symbols are simply upper-cased before matching.
    #   qc     if TRUE, cells flagged by scuttle's per-cell QC are removed.
    #
    # Returns a list with tcm.final, data.df, discarded.cells, zero.cells.
    #
    # The previous defaults (`genome = genome`, `qc = qc`) referred to
    # themselves, so omitting `qc` failed with "promise already under
    # evaluation". `qc` now defaults to TRUE as documented.

    # Validate row names before rewriting them.
    if (is.null(row.names(x))) {
        stop("Missing rownames for the input count matrix.\n",
             "Please use gene symbols as rownames.")
    }
    # Upper-case the symbols so that mouse-style names (e.g. Xist) match the
    # upper-case gene lists below.
    row.names(x) <- toupper(row.names(x))

    # Cell IDs are used as keys throughout, so they must exist and be unique.
    if (length(unique(colnames(x))) != ncol(x)) {
        message("Cell names are missing/duplicated. ",
                "Cells are renamed to cell1 - cell", ncol(x))
        colnames(x) <- paste0("cell", seq_len(ncol(x)))
    }

    # genes located in the X chromosome that have been reported to escape
    # X-inactivation
    # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
    Xgenes <- c("ARHGAP4", "STS", "ARSD", "ARSL", "AVPR2", "BRS3", "S100G",
                "CHM", "CLCN4", "DDX3X", "EIF1AX", "EIF2S3", "GPM6B",
                "GRPR", "HCFC1", "L1CAM", "MAOA", "MYCLP1", "NAP1L3",
                "GPR143", "CDK16", "PLXNB3", "PRKX", "RBBP7", "RENBP",
                "RPS4X", "TRAPPC2", "SH3BGRL", "TBL1X", "UBA1", "KDM6A",
                "XG", "XIST", "ZFX", "PUDP", "PNPLA4", "USP9X", "KDM5C",
                "SMC1A", "NAA10", "OFD1", "IKBKG", "PIR", "INE2", "INE1",
                "AP1S2", "GYG2", "MED14", "RAB9A", "ITM2A", "MORF4L2",
                "CA5B", "SRPX2", "GEMIN8", "CTPS2", "CLTRN", "NLGN4X",
                "DUSP21", "ALG13", "SYAP1", "SYTL4", "FUNDC1", "GAB3",
                "RIBC1", "FAM9C", "CA5BP1")
    # genes belonging to the male-specific region of chromosome Y
    # (unique genes)
    # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
    Ygenes <- c("AMELY", "DAZ1", "PRKY", "RBMY1A1", "RBMY1HP", "RPS4Y1",
                "SRY", "TSPY1", "UTY", "ZFY", "KDM5D", "USP9Y", "DDX3Y",
                "PRY", "XKRY", "BPY2", "VCY", "CDY1", "EIF1AY", "TMSB4Y",
                "CDY2A", "NLGN4Y", "PCDH11Y", "HSFY1", "TGIF2LY", "TBL1Y",
                "RPS4Y2", "HSFY2", "CDY2B", "TXLNGY", "CDY1B", "DAZ3",
                "DAZ2", "DAZ4")

    # build artificial genes: only genes present in the data can contribute
    Xgene.set <- Xgenes[Xgenes %in% row.names(x)]
    Ygene.set <- Ygenes[Ygenes %in% row.names(x)]

    # 3 x n.cells table initialised to zero; a feature whose genes are all
    # absent from the data therefore stays at zero for every cell.
    cm.new <- as.data.frame(matrix(0, nrow = 3, ncol = ncol(x)))
    row.names(cm.new) <- c("XIST", "superX", "superY")
    colnames(cm.new) <- colnames(x)
    if ("XIST" %in% row.names(x)) {
        cm.new["XIST", ] <- x["XIST", ]
    }
    if (length(Xgene.set) > 0) {
        cm.new["superX", ] <- colSums(x[Xgene.set, , drop = FALSE])
    }
    if (length(Ygene.set) > 0) {
        cm.new["superY", ] <- colSums(x[Ygene.set, , drop = FALSE])
    }

    ##########################################################################
    # Pre-processing
    # perform simple QC
    discarded.cells <- NA
    if (isTRUE(as.logical(qc))) {
        qcstats <- scuttle::perCellQCMetrics(x)
        qcfilter <- scuttle::perCellQCFilters(qcstats)
        discard <- as.logical(qcfilter$discard)
        # save the discarded cells; index the names directly, because
        # subsetting the matrix first drops to a vector (and loses the
        # name) when exactly one cell is discarded
        discarded.cells <- colnames(x)[discard]
        # cm.new only contains cells that pass the quality control;
        # drop = FALSE keeps a data frame even if a single cell survives
        cm.new <- cm.new[, !discard, drop = FALSE]
    }

    tcm.final <- as.data.frame(t(cm.new))

    # Do Not Classify: cells with no counts on either the X or the Y feature
    zero.cells <- NA
    dnc <- tcm.final$superY == 0 & tcm.final$superX == 0
    if (any(dnc)) {
        zero.cells <- row.names(tcm.final)[dnc]
        message(length(zero.cells), " cell/s are unable to be classified\n",
                "due to an abundance of zeroes on X and Y chromosome genes\n")
    }
    tcm.final <- tcm.final[!dnc, , drop = FALSE]
    cm.new <- cm.new[, !dnc, drop = FALSE]

    # keep a copy of library size, computed on the full counts matrix for
    # the retained cells only
    cm.lib.size <- colSums(x[, colnames(cm.new), drop = FALSE], na.rm = TRUE)

    # log-normalisation performed for each cell
    # scaling performed for each gene
    normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE,
                                           prior.count = 0.5,
                                           lib.size = cm.lib.size))
    data.df <- as.data.frame(t(normsca.cm))
    # data.frame() may mangle cell IDs, so restore them from tcm.final
    row.names(data.df) <- row.names(tcm.final)

    list(tcm.final = tcm.final, data.df = data.df,
         discarded.cells = discarded.cells,
         zero.cells = zero.cells)
}