Entering edit mode
2.5 years ago
pt.taklifi
▴
60
Hello everyone, I'm working with two cfDNA VCF files and I want to get the union of the two. For reporting allele frequencies, if a variant is present in both files, I want to report the mean allele frequency (AF) in the union data. My data looks like this:
structure(list(chrom = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1"), position = c(133365L, 133700L, 487723L, 722645L, 722646L,
722715L), ref = c("G", "A", "C", "C", "A", "C"), var = c("C",
"C", "T", "G", "G", "T"), normal_reads1 = c(18L, 39L, 44L, 14L,
14L, 30L), normal_reads2 = c(1L, 0L, 3L, 1L, 1L, 1L), normal_var_freq = c("5.26%",
"0%", "6.38%", "6.67%", "6.67%", "3.23%"), normal_gt = c("G",
"A", "C", "C", "A", "C"), tumor_reads1 = c(250L, 587L, 474L,
227L, 229L, 236L), tumor_reads2 = c(90L, 169L, 124L, 233L, 232L,
73L), tumor_var_freq = c("26.47%", "22.35%", "20.74%", "50.65%",
"50.33%", "23.62%"), tumor_gt = c("S", "M", "Y", "S", "R", "Y"
), somatic_status = c("Somatic", "Somatic", "Somatic", "Somatic",
"Somatic", "Somatic"), variant_p_value = c(1, 1, 1, 1, 1, 1),
somatic_p_value = c(0.0258864919345273, 6.89638075337064e-05,
0.00860896789389251, 0.000501434470602199, 0.00054506217282518,
0.00353389771754141), tumor_reads1_plus = c(194L, 378L, 275L,
199L, 199L, 231L), tumor_reads1_minus = c(56L, 209L, 199L,
28L, 30L, 5L), tumor_reads2_plus = c(76L, 152L, 70L, 129L,
127L, 47L), tumor_reads2_minus = c(14L, 17L, 54L, 104L, 105L,
26L), normal_reads1_plus = c(10L, 15L, 11L, 9L, 9L, 17L),
normal_reads1_minus = c(8L, 24L, 33L, 5L, 5L, 13L), normal_reads2_plus = c(1L,
0L, 1L, 1L, 1L, 0L), normal_reads2_minus = c(0L, 0L, 2L,
0L, 0L, 1L)), row.names = c(5L, 6L, 9L, 10L, 11L, 12L), class = "data.frame")
and
structure(list(chrom = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1"), position = c(131736L, 487723L, 722645L, 722646L, 722941L,
2056450L), ref = c("C", "C", "C", "A", "C", "C"), var = c("T",
"T", "G", "G", "A", "T"), normal_reads1 = c(53L, 44L, 14L, 14L,
52L, 259L), normal_reads2 = c(11L, 3L, 1L, 1L, 2L, 1L), normal_var_freq = c("17.19%",
"6.38%", "6.67%", "6.67%", "3.7%", "0.38%"), normal_gt = c("C",
"C", "C", "A", "C", "C"), tumor_reads1 = c(111L, 102L, 145L,
145L, 368L, 6L), tumor_reads2 = c(47L, 36L, 138L, 138L, 93L,
2L), tumor_var_freq = c("29.75%", "26.09%", "48.76%", "48.76%",
"20.17%", "25%"), tumor_gt = c("Y", "Y", "S", "R", "M", "Y"),
somatic_status = c("Somatic", "Somatic", "Somatic", "Somatic",
"Somatic", "Somatic"), variant_p_value = c(1, 1, 1, 1, 1,
1), somatic_p_value = c(0.0366182859602205, 0.00219935827265179,
0.000900078636086993, 0.000900078636086993, 0.000915489373568878,
0.00231250606812068), tumor_reads1_plus = c(88L, 78L, 145L,
144L, 305L, 6L), tumor_reads1_minus = c(23L, 24L, 0L, 1L,
63L, 0L), tumor_reads2_plus = c(43L, 33L, 30L, 30L, 89L,
2L), tumor_reads2_minus = c(4L, 3L, 108L, 108L, 4L, 0L),
normal_reads1_plus = c(25L, 11L, 9L, 9L, 31L, 220L), normal_reads1_minus = c(28L,
33L, 5L, 5L, 21L, 39L), normal_reads2_plus = c(7L, 1L, 1L,
1L, 2L, 1L), normal_reads2_minus = c(4L, 2L, 0L, 0L, 0L,
0L)), row.names = c(2L, 7L, 8L, 9L, 10L, 13L), class = "data.frame")
I'm working in R and the data is in "list" format. I know I can subset each cfDNA sample into common and private variants and then calculate the allele frequency for each variant, but I was wondering if there is a more efficient way to do this task.