Off topic:negative correlations in subsets of random dataset
0
0
Entering edit mode
3.3 years ago

To illustrated the issue, I have written the code below which first generates a dataset of 4000 cases of 6 variables, each with values randomly selected from 5 levels (0 to 4).Then these 6 variables are summed for each row. Then the dataset is splitted based on quartiles of this total(column name is"t"). Next, Qgraph package in r generates first 4 graphs, each for a quartile then the last graph for the complete original dataset. The whole dataset's graph predictably shows no edges or a couple with very weak values(correlation matrix is printed in the console) but almost all nodes in quartile graphs are connected by significant negative correlations. I have simulated this(1000 rounds) and the result is exactly the same, just the random couple of edges in whole dataset graph disappear. Is there a problem with the code? If not can somebody help me with an explanation for this phenomenon dataset of random variables with no correlations but subsets based on sum of variables have significant negative correlations between variables, which pass even bonferroni correction)?

library(qgraph)
long.analys=function(cols=NULL,fraction=0.25,db=data.frame()){

ct=list()
corct=list()
qgMDD=list()

#assigning db argument of the function to da.n which remains the main
#dataset in function
da.n=data.frame(db,stringsAsFactors = False)
#adds column id which here is set to 1 but could be used if cases have
#identifier for downstream functions
da.n$id=1 #to clean up NAs if any is present da.n=na.omit(da.n) #to generate total score da.n$t=rowSums(da.n[,c(1:ncol(da.n))-1])
#to specify columns to enter correlation matrix generation
nc=ncol(da.n)-2
clr=vector()
dbs=da.n$t #to specify node colors with best contrst if less than 10 variable and use #random colors if more variables if(nc<=10){ clr=c("#c9ddff","#c9faff","#c9ffd2","#ebffc9","#e499f7", "#f49fa6","#f1ef5e","#f2b25e","#f46950","#ffc9eb") }else{ clr<<- colorRampPalette(c("red", "orange", "blue"),space = "Lab")(nc)} # to stratify dataset based on totalscore. if fraction=0.25 sample is #stratified into quartiles, and if fraction=1 the whole sample enters #analysis s=quantile(dbs,na.rm=TRUE,probs = seq(0, 1, by= fraction)) # to determine the best layout for output graphs s1=length(s) s2=as.integer(sqrt(s1)) par(mfrow=c(s2,ifelse(s1%%s2==0,s2,s2+1))) #to add a column named quantiles to dataset and prefill it with a number in #this case number of bins. This value will change in the next step da.n$quantile=length(s)

# this loop generates correlation matrices and graphs based on the
#percentiles defined in s list
for (j in 1:(length(s)-1)){
# defines operator "between" which is used as x %between% c(a,b). By
#default it includes any x
# greater or equal a and less than b. For the last bin it also includes
#value b
if (j==(length(s)-1)){
%between%= function(x,rng) x>=rng[1] & x<=rng[2]
}else{%between%= function(x,rng) x>=rng[1] & x<rng[2]}
#column nc+3 is "quantile", This line assigns the group
#number to each case for example if fraction=0.25, quartile that case
#belongs to is assigned

da.n[which(da.n$t%between%c(s[[j]],s[[j+1]])),nc+3]=rep( j,length(which(da.n$t %between% c(s[[j]],s[[j+1]]))))
#sample is splitted based on values of s and each subset is stored in list
#ct.
ct[[j]]= da.n[da.n\$t%between%c(s[[j]],s[[j+1]]),c(1:nc)]
#some columns might have the same value in all cells or standard deviation
#of zero, they generate NAs which propagate in correlation matrix
#and cause error in graph generation. They are identified and deleted from
#ct and color vector so that each variable remains related to the same
#color
# in graphs
columnsds<<- as.vector(apply(ct[[j]],2,function(cl)( sd(cl,na.rm=TRUE))))

sdzero<<- which(columnsds==0)

if(length(sdzero)>0){
ct[[j]]=  ct[[j]][,-(sdzero)]
}
# cl2 is a vector which contains colors in original clr vector minus those
# that their corresponding variable has standard deviation of zero
cl2<<-NULL
if (length(sdzero)>0){
cl2<<-clr[-(sdzero)]
}else{
cl2<<-clr}
#building network graphs which are stored in list qgMDD. First subsets
#with only one or zero variable
#left after deleting variables with standard deviation of zero are
#filtered out and their qgmdd is set to null
if(ncol(ct[[j]])<2){qgMDD[[j]]=NULL}else{
#then correlation matrices are build for other subsets and based on it
#qgraphs are build and stored in qgMDD list

corct[[j]]= cor(ct[[j]], method="spearman",use="pairwise.complete.obs")
#node size represented by vs is the mean of each variable in the subset
vs=apply(ct[[j]],2,function(x){mean(x,na.rm = TRUE)})
#qgraph is rendered and stored in qgMDD
qgMDD[[j]]=qgraph(input=corct[[j]],
graph ="assosciation",edge.labels=FALSE,color=cl2,
vsize=vs*7,groups=c(1:length(ct[[j]])),threshold="bonferroni",
diag=FALSE,label.cex=0.7,label.color= "black",vTrans=200,
border.color="gray", sampleSize=nrow(ct[[j]]),
aspect=TRUE,label.scale=FALSE,plot=TRUE,rescale=TRUE,
labels=colnames(ct[[j]]),layout="spring",title=paste0('Group: ',j ,",
Cases: ",nrow(ct[[j]])," Score: ",s[[j]],"-",s[[j+1]],"V#
",ncol(ct[[j]])))
#correlation matrix for each subset is printed in console
print(paste0("correlation matrix for ",ifelse(fraction==1,"the whole
dataset", paste0("quartile ",j)) ))
print(corct[[j]])
}
}
}
#end of function definition

######################################################################
#Simulation:
#Creating sample dataset composed of 4000 row of 6 variables with 5 levels
#from 0 to 4 randomly
s.df=data.frame(replicate(6,sample(0:4,4000,rep=TRUE)))
#Generating quartile graphs in first page of output
w.s=long.analys(cols=colnames(s.df),db=s.df,fraction=0.25)
#Generating Whole data Graph in second page of output. Please note that
#any significant correlation(edge)
#in the whole graph is extremely weak as shown in correlation
#matrix in console and disappears after simulation

w.s=long.analys(cols=colnames(s.df),db=s.df,fraction=1)

w.s=long.analys(cols=colnames(s.df),db=s.df,fraction=1)

R • 1.1k views