the permuted data is real data. just real data which has been subsampled with N samples for Y iterations to estimate population parameters for a random distribution of N samples. effectively that is the baseline distribution for a random process.
i recently wrote an R program that does just this. you'll have to tinker with it so it works for your data, but it's someplace to start.
library(rsgcc)
library(SNFtool)
args<-commandArgs(TRUE)
mrna_file <- args[1]
omics_file <- args[2]
group_name <- args[3]
query <- args[4]
try(mirna_file <- args[5])
ifis.na(mirna_file)){
try(n_permute <- args[5])
}else{
try(n_permute <- args[6])
}
ifis.na(n_permute)){n_permute=2000}
make_unique <- function(input.data) {
require(org.Hs.eg.db)
input.data <- data.frame(cbind(input.data,
entrez=as.numeric(mapIds(org.Hs.eg.db,
keys=gsub("\\..*","",rownames(input.data)),
column="ENTREZID", keytype="ENSEMBL",multiVals="first"))))
input.data <- input.data[!is.na(input.data$entrez),]
id.dup <- input.data[duplicated(input.data$entrez)
| duplicated(input.data$entrez,
fromLast = TRUE),ncol(input.data)]
data.dup <- as.matrix(input.data[duplicated(input.data$entrez) |
duplicated(input.data$entrez, fromLast = TRUE),])
ezid.dup <- unique(id.dup)
data.unique <- input.data[!input.data$entrez %in% id.dup,]
rownames(data.unique) <- data.unique$entrez
data.unique$entrez = NULL
data.dup2 <- lapply(ezid.dup, function(x) {
expr <- data.dup[id.dup == x,]
if(is.matrix(expr)){
sd.values <- apply(expr[,1:ncol(input.data)-1], 1, sd)
mean.values <- apply (expr[,1:ncol(input.data)-1], 1, mean)
CV.values <- sd.values/mean.values
CV.values <- gsub("NaN","0",CV.values)
expr <- expr[which(CV.values == max(CV.values))[[1]],]
} else {
expr
}
})
merger <- data.frame(do.call(rbind, data.dup2))
rownames(merger) <- merger$entrez
merger$entrez = NULL
eset <- as.matrix(rbind(data.unique, merger))
return(eset)
}
run_tests <- function(data, feature_type, n){
output = NULL
for (x in 2:dim(data)[[1]]){
#print(paste("current step",x-1,"out of",dim(data)[[1]]-1,sep=" "))
corpair <- cor.pair(c(1,x), GEMatrix = as.matrix(data),
rowORcol = "row", cormethod = "PCC", pernum = n,
sigmethod = "two.sided")
direction_corr="neg"
if(corpair$cor > 0){
direction_corr="pos"
}
output <- rbind(output, c(rownames(data)[[1]],
rownames(data)[[x]], feature_type, corpair$cor,
abs(corpair$cor), direction_corr, corpair$pvalue))
}
colnames(output) <- c("gene","feature","feature_type",
"correl", "abs_correl", "direction", "pvalue_2k_permutation")
write.table(output, file=paste(gene_name_query, "_", feature_type, "_",
group_name, ".csv",sep=""), row.names = FALSE, sep="\t")
}
mrna <- read.table(mrna_file, header=TRUE, row.names=1, sep="\t")
mrna <- make_unique(mrna)
genes <- mapIds(org.Hs.eg.db, keys=rownames(mrna),
column="SYMBOL", keytype="ENTREZID",multiVals="first")
rownames(mrna) <- genes
gene_name_query <- mapIds(org.Hs.eg.db, keys=gsub("\\..*","",query),
column="SYMBOL", keytype="ENSEMBL",multiVals="first")
query_row_num <- which(rownames(mrna) == gene_name_query)
group <- as.factor(unlist(read.table(omics_file, header=FALSE, sep="\t")))
if(group_name != "all"){
mrna <- mrna[,which(group == group_name)]
}
query_data <- data.frame(mrna[query_row_num,])
colnames(query_data) <- rownames(mrna)[[query_row_num]]
query_data <- t(query_data)
ifis.na(mirna_file)){
data <- rbind(query_data, mrna[-c(query_row_num),])
run_tests(data,"mrna",n_permute)
}else{
mirna <- read.table(mirna_file, header=TRUE, row.names=1, sep="\t")
if(group_name != "all"){
mirna <- mirna[,which(group == group_name)]
}
data <- rbind(query_data, mirna)
tdata <- t(data)
ntdata <- standardNormalization(tdata)
data <- t(ntdata)
run_tests(data,"mirna", as.numeric(n_permute))
}
If this isn't helpful, I can put a working example up on github. But that won't be until next week. Just let me know.