Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.
The iSCORE-PD_design.csv file is a comma-separated text file starting with a header and with one cell line per line with the following columns:
## Load the experimental design (sample -> edit-type group) and display it.
samplesDT <- as.data.table(read.csv("./supporting_files/iSCORE-PD_design.csv"))
datatable(samplesDT)
getSubsetFilter <- function(vcf, samples, qual) {
  ## Build a logical filter over the VCF rows keeping sites where at least one
  ## of `samples` carries a non-reference genotype AND the site quality passes.
  ##
  ## vcf:     a VariantAnnotation VCF object
  ## samples: character vector of sample (column) names to inspect
  ## qual:    minimum QUAL score for a site to be kept
  ##
  ## `drop = FALSE` keeps the genotype matrix two-dimensional even when a
  ## single sample is requested, so the row-wise apply() below stays valid.
  subset <- as.data.frame(geno(vcf)$GT[, samples, drop = FALSE])
  ## A site is "absent" when every genotype is missing or homozygous reference.
  filter <- !apply(subset, 1, function(x) all(x %in% c("./.", "0/0", "./0", "0/."))) & qual(vcf) >= qual
  return(filter)
}
getUniqueSnps <- function(vcf.file, samplesDT, qual = 40) {
  ## For every non-parental group, return the SNPs carried by that group and
  ## absent from every other cell line (the group's "universe").
  ##
  ## vcf.file:  path to a (bgzipped) VCF file
  ## samplesDT: data.table with at least `samples`, `group` and `meta` columns
  ## qual:      minimum QUAL score for a SNP to be considered
  ## Returns a named list with one VCF subset per group.
  ## Read the VCF
  vcf <- readVcf(vcf.file)
  groups <- samplesDT[group != "parental", unique(group), ] # nolint: object_usage_linter.
  ## Roll through the groups and get their unique SNPs
  results <- lapply(groups, function(currGroup) {
    ## Get the sample names of the group under analysis
    currSamples <- samplesDT[currGroup == group, samples] # nolint: object_usage_linter.
    currUniverse <- samplesDT[currGroup != group, samples] # nolint: object_usage_linter.
    ## Remove the meta samples for the EWTs...
    ## Samples sharing this group's `meta` tag are dropped from the universe so
    ## edits shared within a meta-family do not mask group-specific SNPs.
    if (!is.na(unique(samplesDT[group == currGroup, meta]))) { # nolint: object_usage_linter.
      metaSamples <- samplesDT[meta %in% samplesDT[group == currGroup, meta], samples] # nolint: object_usage_linter.
      currUniverse <- currUniverse[!currUniverse %in% metaSamples]
    }
    groupSnp_f <- getSubsetFilter(vcf, currSamples, qual)
    universeSnps_f <- getSubsetFilter(vcf, currUniverse, qual)
    ## Subset the vcf to the SNPs present in the group but not in the universe
    uniqueSnps <- vcf[groupSnp_f & !universeSnps_f]
    return(uniqueSnps)
  })
  names(results) <- groups
  return(results)
}
Now that we have the underlying algorithm, it’s time to roll over every VCF file for each chromosome (for a gain of speed, we’ll do that using parallel computing) and we will massage the data to return something consumable.
## Only run the computation if the cached data files are not present
resultToSave <- file.path("data", "unique_SNPs_vs_all.Rds")
vcfToSave <- file.path("data", "unique_SNPs_vs_all.vcf")
if (file.exists(resultToSave) && file.exists(vcfToSave)) {
  ## Reuse the cached per-group results and merged VCF
  groupUniqueSnps <- readRDS(resultToSave)
  allUniqueVCF <- readVcf(vcfToSave)
} else {
  ## One VCF per chromosome; process them in parallel
  vcf.files <- list.files("SNV", "\\.vcf\\.gz$", full = TRUE)
  rawResults <- mclapply(vcf.files,
    getUniqueSnps,
    samplesDT = samplesDT,
    qual = 30,
    mc.cores = detectCores(),
    mc.preschedule = FALSE
  )
  ## rawResults is per-chromosome lists of per-group VCFs; stack the
  ## chromosomes for each group
  groupUniqueSnps <- mclapply(names(rawResults[[1]]), function(group) {
    res <- do.call(rbind, lapply(rawResults, "[[", group))
    return(res)
  }, mc.cores = detectCores(), mc.preschedule = FALSE)
  names(groupUniqueSnps) <- names(rawResults[[1]])
  dir.create("data", showWarnings = FALSE)
  saveRDS(groupUniqueSnps, resultToSave)
  ## Merge all groups into one deduplicated VCF and cache it for later steps
  allUniqueVCF <- unique(do.call(rbind, groupUniqueSnps))
  writeVcf(allUniqueVCF, vcfToSave)
}
## Build a long-format genotype table of the merged unique SNPs, restricted to
## the edited (non-parental) samples
vcf <- allUniqueVCF
currSamples <- samplesDT[group != "parental", samples]
dt <- cbind(
  allele = rownames(geno(vcf)$GT),
  data.table(geno(vcf)$GT)[, ..currSamples]
)
geno <- melt(
  dt,
  id.vars = "allele",
  measure.vars = currSamples,
  variable.name = "sample",
  value.name = "geno"
)
## Drop missing and homozygous-reference genotypes
geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]
samples <- geno[, unique(sample)]
## Build one short code per sample: "A".."Z", then "AA".."ZZ", and so on.
## BUG FIX: the previous version always ran floor(n/26)+1 batches, so when
## length(samples) was an exact multiple of 26 the final batch was
## sapply(character(0), ...) == list(), and c() of a character vector with a
## list silently turned `codes` into a list, breaking `codes[i]` below.
codes <- do.call(c, lapply(seq_len(ceiling(length(samples) / 26)), function(i) {
  ## Size of this batch: 26 for full batches, the remainder for the last one
  end <- min(26, length(samples) - 26 * (i - 1))
  sapply(LETTERS[seq_len(end)], function(x) paste(rep(x, i), collapse = ""))
}))
## Tag each genotype row with its sample's code (in place via data.table `:=`)
void <- lapply(seq_along(codes), function(i) {
  geno[sample == samples[i], sub := codes[i]]
})
## Collapse each SNP to the dash-separated combination of sample codes that
## carry it, then order combinations by how many samples share the SNP
combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")
combn <- unique(combi)
combn <- combn[order(elementNROWS(strsplit(combn, "-")))]
combi <- factor(combi, levels = combn)
dt <- data.table(Combination = combi)
## Count SNPs per combination and record how many samples each involves
dt1 <- dt[!is.na(Combination), .N, by = Combination]
dt1[, nSample := elementNROWS(strsplit(as.character(Combination), "-"))]
dt1 <- dt1[order(factor(Combination, levels = combn))]
legend <- data.table(Code = codes, Sample = samples)
cat("\n")
cat("#### Number of SNPs shared among all edited cell lines\n")
With the unique SNPs gathered in a single VCF, we can now compute sample-to-sample distances and draw a tree of the cell lines.
## Compute pairwise sample distances from the merged VCF and render the
## resulting tree of cell lines, with a scale bar and a distance axis.
vcfDistances <- fastreeR::vcf2dist(inputFile = vcfToSave, threads = 2)
newickTree <- fastreeR::dist2tree(inputDist = vcfDistances)
plot(ape::read.tree(text = newickTree), direction = "rightwards", cex = 0.8)
ape::add.scale.bar()
ape::axisPhylo(side = 3)
What type of genomic features are affected by the SNPs in each group?
## Annotate each group's unique SNPs with the genomic feature they fall in,
## manually recomputing intronic hits to work around a locateVariants() bug.
allGroup <- mclapply(groupUniqueSnps, function(vcf) {
  ## Keep only seqlevels known to the annotation DB before locating variants
  seqlevels(vcf) <- seqlevels(vcf)[seqlevels(vcf) %in% seqlevels(txdb)]
  all <- locateVariants(vcf, txdb, AllVariants())
  ## UPDATE 02/28/25
  ## There's a bug with locateVariants(): intronic variants are wrong, so we
  ## drop them and recode them manually below.
  all.noInt <- all[all$LOCATION != "intron"]
  ## Adapted from the VariantAnnotation repo, variantLocation() method
  subject <- intronsByTranscript(txdb)
  ## Remove transcripts with no intron ranges, that's the bug
  subject <- subject[elementNROWS(subject) > 0]
  query <- rowRanges(vcf)
  vtype <- "intron"
  ignore.strand <- FALSE
  ## Factor constructor matching locateVariants()' LOCATION levels
  .location <-
    function(length = 0, value = NA) {
      levels <- c(
        "spliceSite", "intron", "fiveUTR", "threeUTR",
        "coding", "intergenic", "promoter"
      )
      factor(rep(value, length), levels = levels)
    }
  map <- mapToTranscripts(unname(query), subject,
    ignore.strand = ignore.strand
  )
  if (length(map) > 0) {
    xHits <- map$xHits
    txHits <- map$transcriptsHits
    tx <- names(subject)[txHits]
    if (!is.null(tx)) {
      txid <- tx
    } else {
      txid <- NA_integer_
    }
    ## FIXME: cdsid is expensive
    cdsid <- IntegerList(integer(0))
    ss <- runValue(strand(subject)[txHits])
    if (any(elementNROWS(ss) > 1L)) {
      ## BUG FIX: the "*" fallback used to be clobbered by an unconditional
      ## reassignment right after this branch; keep it in an if/else instead.
      warning(
        "'subject' has multiple strands per list element; ",
        "setting strand to '*'"
      )
      sstrand <- Rle("*", length(txHits))
    } else {
      sstrand <- unlist(ss, use.names = FALSE)
    }
    res <- GRanges(
      seqnames = seqnames(query)[xHits],
      ranges = IRanges(ranges(query)[xHits]),
      strand = sstrand,
      LOCATION = .location(length(xHits), vtype),
      LOCSTART = start(map),
      LOCEND = end(map),
      QUERYID = xHits,
      TXID = txid,
      CDSID = cdsid,
      GENEID = NA_character_,
      PRECEDEID = CharacterList(character(0)),
      FOLLOWID = CharacterList(character(0))
    )
    ## Map transcript IDs to gene IDs for the recomputed intronic hits
    res$GENEID <- select(
      txdb, as.character(res$TXID),
      "GENEID", "TXID"
    )$GENEID
    ## Add back the intronic locations
    all <- c(all.noInt, res)
  } else {
    ## BUG FIX: `res` used to be dereferenced even when `map` was empty
    all <- all.noInt
  }
  ## Reorder the GRanges by genomic position
  all[order(all)]
}, mc.cores = detectCores(), mc.preschedule = FALSE)
## Tally how many SNPs fall in each genomic feature class, per group, and
## pivot the result to one row per group for display.
featuresPerGroup <- rbindlist(lapply(names(allGroup), function(grp) {
  counts <- as.data.frame(table(allGroup[[grp]]$LOCATION))
  names(counts) <- c("feature", "count")
  counts$group <- grp
  setDT(counts)
}))
datatable(dcast(featuresPerGroup, group ~ feature, value.var = "count"))
## For each group: annotate every SNP with the gene/feature it hits and the
## combination of samples carrying it, plus a code-to-sample legend.
rawResult <- lapply(names(allGroup), function(currGroup) {
  ## Get the combination of samples for each SNPs
  vcf <- groupUniqueSnps[[currGroup]]
  currSamples <- samplesDT[group == currGroup, samples]
  dt <- cbind(
    allele = rownames(geno(vcf)$GT),
    data.table(geno(vcf)$GT)[, ..currSamples]
  )
  geno <- melt(
    dt,
    id.vars = "allele",
    measure.vars = currSamples,
    variable.name = "sample",
    value.name = "geno"
  )
  ## Drop missing / homozygous-reference genotypes
  geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]
  samples <- geno[, unique(sample)]
  codes <- LETTERS[seq_along(samples)]
  ## Tag each genotype row with its sample code (in place via `:=`)
  void <- lapply(seq_along(codes), function(i) {
    geno[sample == samples[i], sub := codes[i]]
  })
  ## NOTE(review): `combn` below is computed but never used in this block --
  ## confirm whether it can be removed.
  combn <- unlist(lapply(do.call(
    c,
    lapply(seq_along(codes), combn, x = codes, simplify = FALSE)
  ), paste, collapse = "-"))
  combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")
  all <- allGroup[[currGroup]]
  ## One sub-table per feature class: gene symbols hit, SNP locations and the
  ## sample combination carrying each SNP
  dt <- do.call(rbind, lapply(levels(all$LOCATION), function(feature) {
    f <- all$LOCATION == feature
    gid <- all[f]$GENEID
    snpID <- names(all)[f] ## UPDATE 12/20/24
    snpLocation <- paste0(seqnames(all[f]), ":", start(all[f]), "-", end(all[f])) ## UPDATE 12/20/24
    ## Keep only SNPs mapped to a gene
    f2 <- !is.na(gid)
    gid <- gid[f2]
    snpID <- snpID[f2]
    snpLocation <- snpLocation[f2] ## UPDATE 12/20/24
    dt <- data.table(select(org.Hs.eg.db, gid, c("SYMBOL", "GENENAME", "GENETYPE"), "ENTREZID"))
    dt$group <- currGroup
    dt$feature <- feature
    dt$location <- snpLocation
    dt$combi <- combi[snpID]
    unique(dt[, ENTREZID := NULL])
  }))
  legend <- data.table(
    group = currGroup,
    sample = samples,
    code = codes
  )
  list(legend = legend, dt = dt)
})
## Render one section per group: genes hit by SNPs outside promoters/introns.
for (grp in names(rawResult)) {
  cat("\n#### List of Genes with SNPs in", grp, "edited cell lines\n")
  filtered <- rawResult[[grp]]$dt[!feature %in% c("promoter", "intron")]
  print(tagList(datatable(filtered)))
  cat("\n")
}
## Collect the non-NA gene IDs hit outside introns/promoters/intergenic
## regions across every group, then rank gene symbols by how many groups
## hit them.
gid <- unlist(lapply(allGroup, function(all) {
  hits <- unique(all[!all$LOCATION %in% c("intron", "promoter", "intergenic")]$GENEID)
  hits[!is.na(hits)]
}))
ez2id <- select(org.Hs.eg.db, gid, "SYMBOL", "ENTREZID")
tt <- table(ez2id$SYMBOL)
tt <- tt[order(tt, decreasing = TRUE)]
datatable(as.data.frame(tt))
## For each group: predict the coding consequence of its unique SNPs and
## report gene, REF/ALT, genotype, position and carrier combination.
results2 <- mclapply(names(groupUniqueSnps), function(currGroup) {
  ## Get the combination of samples for each SNPs
  currSamples <- samplesDT[group == currGroup, samples]
  vcf <- groupUniqueSnps[[currGroup]][, currSamples]
  dt1 <- cbind(
    allele = rownames(geno(vcf)$GT),
    data.table(geno(vcf)$GT)[, ..currSamples]
  )
  geno <- melt(
    dt1,
    id.vars = "allele",
    measure.vars = currSamples,
    variable.name = "sample",
    value.name = "geno"
  )
  ## Drop missing / homozygous-reference genotypes
  geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]
  samples <- geno[, unique(sample)]
  codes <- LETTERS[seq_along(samples)]
  void <- lapply(seq_along(codes), function(i) {
    geno[sample == samples[i], sub := codes[i]]
  })
  combn <- unlist(lapply(do.call(
    c,
    lapply(seq_along(codes), combn, x = codes, simplify = FALSE)
  ), paste, collapse = "-"))
  combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")
  ## Restrict to seqlevels known to the annotation DB, then locate coding
  ## variants and predict their amino-acid consequences
  seqlevels(vcf) <- seqlevels(vcf)[seqlevels(vcf) %in% seqlevels(txdb)]
  cds <- locateVariants(vcf, txdb, CodingVariants())
  aa <- predictCoding(vcf[cds$QUERYID], txdb, Hsapiens)
  aa$GENEID <- select(txdb, aa$TXID, "GENEID", "TXID")$GENEID
  ## Deduplicate hits that repeat the same range and gene
  aa <- aa[!(duplicated(aa) & (duplicated(aa$GENEID) | is.na(aa$GENEID)))]
  if (length(aa) > 0) {
    ## Comma-separated list of the non-reference genotypes seen at each site
    gt <- apply(data.table(geno(vcf[names(aa)])$GT)[, ..currSamples], 1, function(x) {
      paste(unique(x[!x %in% c("0/0", "./.", "./0", "0/.")]), collapse = ",")
    })
    dt <- data.table(
      REF = as.character(aa$REF),
      ALT = sapply(aa$ALT, paste, collapse = ","),
      GT = gt,
      location = paste0(seqnames(aa), ":", start(aa), "-", end(aa)), ## UPDATE 12/20/24
      consequence = aa$CONSEQUENCE,
      combi = as.character(combi[names(aa)])
    )
    dt <- cbind(select(org.Hs.eg.db, aa$GENEID, c("SYMBOL", "GENENAME"), "ENTREZID"), dt)
    setDT(dt)
    unique(dt[, ENTREZID := NULL])
  } else {
    ## NOTE(review): this empty schema lacks the GT column present in the
    ## non-empty branch above -- confirm whether it should be added.
    dt <- data.table(
      SYMBOL = character(),
      GENENAME = character(),
      REF = character(),
      ALT = character(),
      location = character(), ## UPDATE 12/20/24
      consequence = character(),
      combi = character()
    )
  }
}, mc.cores = detectCores(), mc.preschedule = FALSE)
names(results2) <- names(groupUniqueSnps)
## Drop groups with no coding impact at all
results2 <- results2[elementNROWS(results2) > 0]
## NOTE(review): this writes the rawResult legends again (same content as
## SNP_annotations_grouping_keys.csv) -- confirm that reusing them for the
## CDS impact tables is intended.
write.csv(do.call(rbind, lapply(rawResult, "[[", "legend")), file.path("data", "CDS_impact_grouping_keys.csv"))
## Stack all groups into one table, tagging each row with its group name
results3 <- cbind(do.call(rbind, results2),
  group = rep(names(results2), sapply(results2, nrow))
)
write.csv(results3, file.path("data", "CDS_impact.csv"))
## Render one section per group with its coding-impact table.
for (grp in names(results2)) {
  cat("\n#### Coding sequence impact in", grp, "edited cell lines\n")
  print(tagList(datatable(results2[[grp]])))
  cat("\n")
}
## Sanity check: the genes flagged as "coding" in the annotation tables must
## be exactly the genes reported in the coding-impact tables, per group.
sampleNames <- unique(c(names(rawResult), names(results2)))
all(sapply(sampleNames, function(grp) {
  codingGenes <- rawResult[[grp]][["dt"]][feature == "coding"]$SYMBOL
  impactGenes <- results2[[grp]]$SYMBOL
  ## setequal() == both sets contain the same elements
  setequal(codingGenes, impactGenes)
}))
## [1] TRUE
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8
## [5] LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C.UTF-8
## [9] LC_ADDRESS=C.UTF-8 LC_TELEPHONE=C.UTF-8
## [11] LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C.UTF-8
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gridExtra_2.3
## [2] ape_5.8-1
## [3] fastreeR_1.12.0
## [4] ggplot2_3.5.2
## [5] htmltools_0.5.8.1
## [6] DT_0.33
## [7] org.Hs.eg.db_3.21.0
## [8] BSgenome.Hsapiens.UCSC.hg38_1.4.5
## [9] BSgenome_1.76.0
## [10] rtracklayer_1.68.0
## [11] BiocIO_1.18.0
## [12] TxDb.Hsapiens.UCSC.hg38.knownGene_3.21.0
## [13] GenomicFeatures_1.60.0
## [14] AnnotationDbi_1.70.0
## [15] data.table_1.17.0
## [16] VariantAnnotation_1.54.0
## [17] Rsamtools_2.24.0
## [18] Biostrings_2.76.0
## [19] XVector_0.48.0
## [20] SummarizedExperiment_1.38.0
## [21] Biobase_2.68.0
## [22] GenomicRanges_1.60.0
## [23] GenomeInfoDb_1.44.0
## [24] IRanges_2.42.0
## [25] S4Vectors_0.46.0
## [26] MatrixGenerics_1.20.0
## [27] matrixStats_1.5.0
## [28] BiocGenerics_0.54.0
## [29] generics_0.1.3
## [30] httpgd_2.0.4
##
## loaded via a namespace (and not attached):
## [1] farver_2.1.2 blob_1.2.4 R.utils_2.13.0
## [4] bitops_1.0-9 fastmap_1.2.0 RCurl_1.98-1.17
## [7] GenomicAlignments_1.44.0 XML_3.99-0.18 digest_0.6.37
## [10] lifecycle_1.0.4 KEGGREST_1.48.0 RSQLite_2.3.9
## [13] magrittr_2.0.3 compiler_4.5.0 rlang_1.1.6
## [16] sass_0.4.10 tools_4.5.0 yaml_2.3.10
## [19] knitr_1.50 S4Arrays_1.8.0 htmlwidgets_1.6.4
## [22] bit_4.6.0 curl_6.2.2 DelayedArray_0.34.1
## [25] RColorBrewer_1.1-3 abind_1.4-8 BiocParallel_1.42.0
## [28] unigd_0.1.3 withr_3.0.2 R.oo_1.27.0
## [31] grid_4.5.0 scales_1.4.0 cli_3.6.5
## [34] rmarkdown_2.29 crayon_1.5.3 httr_1.4.7
## [37] rjson_0.2.23 DBI_1.2.3 cachem_1.1.0
## [40] stringr_1.5.1 restfulr_0.0.15 vctrs_0.6.5
## [43] Matrix_1.7-3 jsonlite_2.0.0 bit64_4.6.0-1
## [46] crosstalk_1.2.1 systemfonts_1.2.2 jquerylib_0.1.4
## [49] glue_1.8.0 codetools_0.2-20 stringi_1.8.7
## [52] rJava_1.0-11 gtable_0.3.6 UCSC.utils_1.4.0
## [55] tibble_3.2.1 pillar_1.10.2 GenomeInfoDbData_1.2.14
## [58] R6_2.6.1 evaluate_1.0.3 lattice_0.22-7
## [61] R.methodsS3_1.8.2 png_0.1-8 memoise_2.0.1
## [64] bslib_0.9.0 Rcpp_1.0.14 nlme_3.1-168
## [67] SparseArray_1.8.0 xfun_0.52 pkgconfig_2.0.3
Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.
The iSCORE-PD_cells_grouped_by_editing_methods.csv file is a comma-separated text file starting with a header and with one cell line per line with the following column header:
## Set a table of sample to group from one of the VCF file
vcf.files <- list.files("SNV", "\\.vcf\\.gz$", full = TRUE)
## Read Hanqin csv file for the samples
samplesDT <- setDT(read.csv("supporting_files/iSCORE-PD_cells_grouped_by_editing_methods.csv"))
## Rename the grouping column to the `group` name used downstream
samplesDT[, group := editing_group]
samplesDT[, editing_group := NULL]
## Remove the Talen edited cell line
samplesDT <- samplesDT[group != "TALEN", ]
## Print the table of samples
datatable(samplesDT)
getSubsetFilter <- function(vcf, samples, qual) {
  ## Build a logical filter over the VCF rows keeping sites where at least one
  ## of `samples` carries a non-reference genotype AND the site quality passes.
  ##
  ## vcf:     a VariantAnnotation VCF object
  ## samples: character vector of sample (column) names to inspect
  ## qual:    minimum QUAL score for a site to be kept
  ##
  ## `drop = FALSE` keeps the genotype matrix two-dimensional even when a
  ## single sample is requested, so the row-wise apply() below stays valid.
  subset <- as.data.frame(geno(vcf)$GT[, samples, drop = FALSE])
  ## A site is "absent" when every genotype is missing or homozygous reference.
  filter <- !apply(subset, 1, function(x) all(x %in% c("./.", "0/0", "./0", "0/."))) & qual(vcf) >= qual
  return(filter)
}
getUniqueSnps <- function(vcf.file, samplesDT, qual = 40) {
  ## For every non-parental editing group, keep the SNPs present in that
  ## group's cell lines but absent from the parental lines.
  ##
  ## vcf.file:  path to a (bgzipped) VCF file
  ## samplesDT: data.table with at least `samples` and `group` columns
  ## qual:      minimum QUAL score for a SNP to be considered
  ## Returns a named list with one VCF subset per group.
  vcf <- readVcf(vcf.file)
  groups <- samplesDT[group != "parental", unique(group), ] # nolint: object_usage_linter.
  results <- lapply(groups, function(currGroup) {
    edited <- samplesDT[group == currGroup, samples] # nolint: object_usage_linter.
    parental <- samplesDT[group == "parental", samples] # nolint: object_usage_linter.
    ## Present in the edited lines, absent from the parental lines
    keep <- getSubsetFilter(vcf, edited, qual) &
      !getSubsetFilter(vcf, parental, qual)
    vcf[keep]
  })
  names(results) <- groups
  results
}
Now that we have the underlying algorithm, it’s time to roll over every VCF file for each chromosome (for a gain of speed, we’ll do that using parallel computing) and we will massage the data to return something consumable.
## Only run the computation if the cached data files are not present
resultToSave <- file.path("data", "edit_type_unique_SNPs_vs_all.Rds")
vcfToSave <- file.path("data", "edit_type_unique_SNPs_vs_all.vcf")
if (file.exists(resultToSave) && file.exists(vcfToSave)) {
  groupUniqueSnps <- readRDS(resultToSave)
} else {
  ## Process the per-chromosome VCFs in parallel
  rawResults <- mclapply(vcf.files,
    getUniqueSnps,
    samplesDT = samplesDT,
    qual = 30,
    mc.cores = detectCores(),
    mc.preschedule = FALSE
  )
  ## Stack the chromosomes for each group
  groupUniqueSnps <- mclapply(names(rawResults[[1]]), function(group) {
    res <- do.call(rbind, lapply(rawResults, "[[", group))
    return(res)
  }, mc.cores = detectCores(), mc.preschedule = FALSE)
  names(groupUniqueSnps) <- names(rawResults[[1]])
  ## Merge all groups into one deduplicated VCF and cache everything
  allUniqueVCF <- unique(do.call(rbind, groupUniqueSnps))
  dir.create("data", showWarnings = FALSE)
  saveRDS(groupUniqueSnps, resultToSave)
  writeVcf(allUniqueVCF, vcfToSave)
}
The unique SNPs are now in a series of ExpandedVCF objects, one for each group. Let’s create a CompactedVCF with these SNPs.
Let’s reformat the table to display counts of SNPs per group per chromosome.
## Count unique SNPs per group and per chromosome
dt <- data.table(
  group = rep(names(groupUniqueSnps), sapply(groupUniqueSnps, length)),
  chr = do.call(c, lapply(groupUniqueSnps, function(x) as.character(seqnames(x))))
)
counts <- dt[, .N, by = .(group, chr)]
dt2 <- dcast(counts, group ~ chr, value.var = "N")
## Row totals across the chromosome columns
## NOTE(review): sum(.SD) yields NA for a group missing any chromosome after
## dcast -- confirm whether `fill = 0` was intended in the dcast above.
dt2[, total := sum(.SD), by = group]
## Add the number of cell lines per group, then the mean SNPs per line
dt2 <- dt2[samplesDT[group != "parental", .N, by = group], on = .(group)]
dt2[, mean := round(total / N)]
dt2
## Show EWT groups first, then the rest, with chromosome columns sorted
dt2 <- dt2[
  order(factor(group,
    levels = c(
      grep("EWT", unique(group), value = TRUE),
      grep("EWT", unique(group), value = TRUE, invert = TRUE)
    )
  )),
  c("group", "N", "total", "mean", sort(grep("chr.+", colnames(dt2), value = TRUE)))
]
dt2
## [1] "group" "N" "total" "mean" "chr1" "chr10" "chr11" "chr12" "chr13"
## [10] "chr14" "chr15" "chr16" "chr17" "chr18" "chr19" "chr2" "chr20" "chr21"
## [19] "chr22" "chr3" "chr4" "chr5" "chr6" "chr7" "chr8" "chr9" "chrX"
## [28] "chrY"
## For each group: bar plot of how many SNPs are shared by each combination of
## samples, with a code-to-sample legend rendered alongside.
pps <- lapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  currSamples <- samplesDT[group == currGroup, samples]
  dt <- cbind(
    allele = rownames(geno(vcf)$GT),
    data.table(geno(vcf)$GT)[, ..currSamples]
  )
  geno <- melt(
    dt,
    id.vars = "allele",
    measure.vars = currSamples,
    variable.name = "sample",
    value.name = "geno"
  )
  ## Drop missing / homozygous-reference genotypes
  geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]
  samples <- geno[, unique(sample)]
  ## One code per sample: single letters, then doubled letters past 26
  if (length(samples) > 26) {
    codes <- c(
      LETTERS[seq_along(samples)[1:26]],
      sapply(LETTERS[seq_along(samples[27:length(samples)])], function(x) paste(rep(x, 2), collapse = ""))
    )
  } else {
    codes <- LETTERS[seq_along(samples)]
  }
  void <- lapply(seq_along(codes), function(i) {
    geno[sample == samples[i], sub := codes[i]]
  })
  ## Dash-separated combination of carrier codes per SNP, ordered by size
  combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")
  combn <- unique(combi)
  combn <- combn[order(elementNROWS(strsplit(combn, "-")))]
  combi <- factor(combi, levels = combn)
  df <- as.data.frame(table(combi))
  p_bar <- ggplot(df, aes(combi, Freq)) +
    geom_col() +
    labs(title = paste("Sample", currGroup)) +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.9, hjust = 1))
  legend <- data.frame(legend = paste(codes, samples, sep = " = "))
  p_tab <- tableGrob(unname(legend),
    theme = ttheme_default(base_size = 8),
    rows = NULL
  )
  ## Bar plot on the left, legend table on the right
  grid.arrange(grobs = list(p_bar, p_tab), ncol = 2, widths = c(3, 1))
})
## Same combination analysis as above but returning tables (legend + counts)
## instead of plots, computed in parallel.
pps <- mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  currSamples <- samplesDT[group == currGroup, samples]
  dt <- cbind(
    allele = rownames(geno(vcf)$GT),
    data.table(geno(vcf)$GT)[, ..currSamples]
  )
  geno <- melt(
    dt,
    id.vars = "allele",
    measure.vars = currSamples,
    variable.name = "sample",
    value.name = "geno"
  )
  ## Drop missing / homozygous-reference genotypes
  geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]
  samples <- geno[, unique(sample)]
  ## One code per sample: single letters, then doubled letters past 26
  if (length(samples) > 26) {
    codes <- c(
      LETTERS[seq_along(samples)[1:26]],
      sapply(LETTERS[seq_along(samples[27:length(samples)])], function(x) paste(rep(x, 2), collapse = ""))
    )
  } else {
    codes <- LETTERS[seq_along(samples)]
  }
  void <- lapply(seq_along(codes), function(i) {
    geno[sample == samples[i], sub := codes[i]]
  })
  ## Dash-separated combination of carrier codes per SNP, ordered by size
  combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")
  combn <- unique(combi)
  combn <- combn[order(elementNROWS(strsplit(combn, "-")))]
  combi <- factor(combi, levels = combn)
  dt <- data.table(Combination = combi)
  dt1 <- dt[!is.na(Combination), .N, by = Combination]
  dt1[, nSample := elementNROWS(strsplit(as.character(Combination), "-"))]
  dt1 <- dt1[order(factor(Combination, levels = combn))]
  legend <- data.table(Code = codes, Sample = samples)
  list(legend = legend, dt1 = dt1)
})
## Render the legend and the per-combination SNP counts for every group.
names(pps) <- names(groupUniqueSnps)
for (grp in names(pps)) {
  cat("\n#### Number of SNPs shared among the", grp, "edited cell lines\n")
  cat("##### Id to sample legend\n")
  print(tagList(datatable(pps[[grp]]$legend)))
  cat("##### Number of SNPs per combination for", grp, "\n")
  print(tagList(datatable(pps[[grp]]$dt1)))
  cat("\n")
}
With the per-edit-type unique SNPs gathered in a single VCF, we can again compute sample-to-sample distances and draw a tree of the cell lines.
## Compute pairwise sample distances from the edit-type VCF and render the
## resulting tree of cell lines, with a scale bar and a distance axis.
vcfDistances <- fastreeR::vcf2dist(inputFile = vcfToSave, threads = 2)
newickTree <- fastreeR::dist2tree(inputDist = vcfDistances)
plot(ape::read.tree(text = newickTree), direction = "rightwards", cex = 0.8)
ape::add.scale.bar()
ape::axisPhylo(side = 3)
Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.
The iSCORE-PD_design.csv file is a comma-separated text file starting with a header and with one cell line per line with the following columns:
## Reload the design table and the per-group unique-SNP results cached by the
## earlier analysis; this chunk requires the cache to exist.
samplesDT <- setDT(read.csv("./supporting_files/iSCORE-PD_design.csv"))
datatable(samplesDT)
## Only run if the cached data file is not present
resultToSave <- file.path("data", "unique_SNPs_vs_all.Rds")
if (file.exists(resultToSave)) {
  groupUniqueSnps <- readRDS(resultToSave)
} else {
  ## BUG FIX: the previous message ran the words into the path
  ## ("Cant find file/data/...") -- add a separator and fix the typo.
  stop("Can't find file: ", resultToSave)
}
Frank S. suggested that off-target effects could be revealed by new polymorphisms being created near each other, because CRISPR would tend to affect nearby locations more frequently.
## Gene ranges from the annotation DB (single-strand genes only; see message)
genes <- genes(txdb)
## 30 genes were dropped because they have exons located on both strands of the
## same reference sequence or on more than one reference sequence, so cannot be
## represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a GRangesList
## object, or use suppressMessages() to suppress this message.
## For each group/sample: observed inter-SNP distances per chromosome versus a
## 1000-iteration bootstrap of uniformly placed positions.
dists <- mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  ## If not dealing with EWT or control cell lines, find the current gene location and remove the SNPs from the group
  if (!grepl("EWT|S_Group", currGroup)) {
    gene <- unique(sub("-.+", "", grep("EWT|parental|S_Group", currGroup, value = TRUE, invert = TRUE)))
    ## NOTE(review): `$ENTREZ` relies on partial matching of ENTREZID, and `ez`
    ## is left undefined when the gene is in neither SYMBOL nor ALIAS -- the
    ## genes[ez] call below would then error. Confirm all group genes resolve.
    if (gene %in% keys(org.Hs.eg.db, "SYMBOL")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "SYMBOL")$ENTREZ
    } else if (gene %in% keys(org.Hs.eg.db, "ALIAS")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "ALIAS")$ENTREZ
    }
    suppressWarnings(ol <- findOverlaps(vcf, genes[ez]))
    ## NOTE(review): condition is length(queryHits(ol) > 0), i.e. the length of
    ## a logical vector; it behaves like length(queryHits(ol)) > 0 by accident.
    if (length(queryHits(ol) > 0)) vcf <- vcf[-queryHits(ol)]
  }
  df <- do.call(rbind, lapply(samplesDT[group == currGroup, samples], function(currSample) {
    ## Keep only sites this sample actually carries
    x <- geno(vcf)$GT[, currSample]
    f <- !x %in% c("./.", "0/0", "./0", "0/.")
    subVcf <- vcf[f]
    ## Successive-start distances within each chromosome
    dist <- unlist(tapply(start(subVcf), seqnames(subVcf), function(start) start[-1] - start[-length(start)]))
    dist <- dist[elementNROWS(dist) != 0]
    chrCounts <- table(seqnames(subVcf))
    chrCounts <- chrCounts[chrCounts != 0]
    ## Bootstrap: same number of positions per chromosome, drawn uniformly
    bootstrap <- unlist(lapply(seq_len(1000), function(i) {
      unlist(lapply(names(chrCounts), function(chr) {
        start <- sort(sample(seqlengths(subVcf)[chr], chrCounts[chr]))
        dist <- start[-1] - start[-length(start)]
      }))
    }))
    df <- data.frame(
      distance = c(dist, bootstrap),
      sample = c(rep("Sample", length(dist)), rep("Bootstrap", length(bootstrap)))
    )
  }))
  df
}, mc.cores = detectCores(), mc.preschedule = FALSE)
names(dists) <- names(groupUniqueSnps)
## Drop empty results (elementNROWS on a data.frame is its column count)
dists <- dists[elementNROWS(dists) > 0]
## One log10-distance histogram per group, observed vs bootstrap facets
void <- lapply(names(dists), function(group) {
  df <- dists[[group]]
  df <- df[!is.na(df$distance), ]
  p <- ggplot(df, aes(log10(distance), color = sample)) +
    geom_histogram(binwidth = 0.05) +
    facet_grid(rows = vars(sample), scales = "free_y") +
    labs(
      title = paste("Distance between SNPs for", group, "samples")
    )
  print(p)
})
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
The Kolmogorov-Smirnov test can tell us if two samples are coming from the same distribution by looking at the distances between ECDFs.
## One-sided KS test per group: are observed inter-SNP distances
## stochastically smaller than the bootstrap distances?
dt <- data.table(do.call(rbind, lapply(dists, function(df) {
  setDT(df)
  ks <- ks.test(
    df[sample == "Sample", distance],
    df[sample == "Bootstrap", distance],
    alternative = "greater"
  )
  c("D^+" = sprintf("%.3f", ks$statistic), p.value = sprintf("%.3f", ks$p.value))
})))
datatable(cbind(group = names(dists), dt), options = list(pageLength = 25))
## Per group: fraction of SNP pairs within 100 bp, observed vs bootstrap,
## and the observed/bootstrap enrichment ratio
dt <- data.table(do.call(rbind, lapply(names(dists), function(group) {
  dist <- setDT(dists[[group]])
  fracSample <- dist[distance <= 100 & sample == "Sample", .N] / dist[sample == "Sample", .N]
  fracBS <- dist[distance <= 100 & sample == "Bootstrap", .N] / dist[sample == "Bootstrap", .N]
  c(
    group = group,
    Count = dist[distance <= 100 & sample == "Sample", .N],
    "Sample prop" = sprintf("%.2e", fracSample),
    "Bootstrap prop" = sprintf("%.2e", fracBS),
    Enrichment = round(fracSample / fracBS)
  )
})))
datatable(dt, options = list(pageLength = 25))
## Same summary as above but for SNP pairs within 1,000 bp.
dt <- data.table(do.call(rbind, lapply(names(dists), function(group) {
  dist <- setDT(dists[[group]])
  ## BUG FIX: the count used a 1,000 bp threshold while the proportions kept
  ## the 100 bp threshold from the previous chunk; use 1,000 bp consistently.
  fracSample <- dist[distance <= 1000 & sample == "Sample", .N] / dist[sample == "Sample", .N]
  fracBS <- dist[distance <= 1000 & sample == "Bootstrap", .N] / dist[sample == "Bootstrap", .N]
  c(
    group = group,
    Count = dist[distance <= 1000 & sample == "Sample", .N],
    "Sample prop" = sprintf("%.2e", fracSample),
    "Bootstrap prop" = sprintf("%.2e", fracBS),
    Enrichment = round(fracSample / fracBS)
  )
})))
datatable(dt, options = list(pageLength = 25))
Let’s look at the sequences around the SNPs located within 100 bp of each other.
## Gene ranges from the annotation DB (single-strand genes only; see message)
genes <- genes(txdb)
## 30 genes were dropped because they have exons located on both strands of the
## same reference sequence or on more than one reference sequence, so cannot be
## represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a GRangesList
## object, or use suppressMessages() to suppress this message.
colorize <- function(x, color) {
  ## Wrap `x` in the color markup matching the current knitr output format;
  ## plain text is returned unchanged for any other output.
  if (knitr::is_latex_output()) {
    return(sprintf("\\textcolor{%s}{%s}", color, x))
  }
  if (knitr::is_html_output()) {
    return(sprintf(
      "<span style='color: %s; font-family: monospace, monospace;'>%s</span>", color,
      x
    ))
  }
  x
}
## For each group: build a table of SNP pairs less than 100 bp apart, showing
## the +/-15 bp sequence context (SNP side highlighted) and the genotypes.
results <- mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  ## NOTE(review): the guard and the grep pattern differ from the earlier
  ## chunk (L619-620 used "EWT|parental|S_Group") -- confirm the intended
  ## filtering is the same here.
  if (!grepl("EWT|S_Group", currGroup)) {
    gene <- unique(sub("-.+", "", grep("EWT|parental", currGroup, value = TRUE, invert = TRUE)))
    if (gene %in% keys(org.Hs.eg.db, "SYMBOL")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "SYMBOL")$ENTREZ
    } else if (gene %in% keys(org.Hs.eg.db, "ALIAS")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "ALIAS")$ENTREZ
    }
    ## Remove SNPs overlapping the edited gene itself
    suppressWarnings(ol <- findOverlaps(vcf, genes[ez]))
    if (length(queryHits(ol) > 0)) vcf <- vcf[-queryHits(ol)]
  }
  chrCounts <- table(seqnames(vcf))
  chrCounts <- chrCounts[chrCounts > 0]
  df <- do.call(rbind, lapply(names(chrCounts), function(chr) {
    subVcf <- vcf[seqnames(vcf) == chr]
    starts <- start(subVcf)
    ## Index f marks the first SNP of each pair closer than 100 bp
    df <- do.call(rbind, lapply(which(starts[-1] - starts[-length(starts)] <= 100), function(f) {
      ## 30 bp of reference sequence centered on each SNP of the pair
      seqs <- getSeq(Hsapiens, GRanges(chr, IRanges(start(subVcf[c(f, f + 1)]) - 15, width = 30)))
      refs <- ref(subVcf[c(f, f + 1)])
      alts <- sapply(alt(subVcf[c(f, f + 1)]), paste, collapse = ",")
      currSamples <- samplesDT[group == currGroup, samples]
      gt1 <- geno(subVcf[f])$GT[, samplesDT[group == currGroup, samples]]
      gt2 <- geno(subVcf[f + 1])$GT[, samplesDT[group == currGroup, samples]]
      gts <- c(gt1[!gt1 %in% c("./.", "0/0", "./0", "0/.")], gt2[!gt2 %in% c("./.", "0/0", "./0", "0/.")])
      ## Upstream 15 bp in blue, SNP-containing half in red
      seqs <- paste0(
        colorize(as.character(subseq(seqs, 1, 15)), "blue"),
        colorize(as.character(subseq(seqs, 16, width(seqs))), "red")
      )
      data.frame(
        distance = start(subVcf[f + 1]) - start(subVcf[f]),
        CHR = chr,
        start = start(subVcf[f]),
        REF1 = refs[1],
        ALT1 = alts[1],
        REF2 = refs[2],
        ALT2 = alts[2],
        seq1 = seqs[1],
        seq2 = seqs[2],
        GT = paste(unique(gts), collapse = "|"),
        sample = paste(unique(names(gts)), collapse = "\n")
      )
    }))
  }))
  return(df)
}, mc.cores = detectCores(), mc.preschedule = FALSE)
names(results) <- names(groupUniqueSnps)
## Render one section per group with its close-SNP sequence table.
## BUG FIX: corrected "seperated" -> "separated" in the rendered heading.
for (currGroup in names(results)) {
  cat("\n")
  cat("#### Sequences around SNPs separated by less than 100 bp in ", currGroup, "edited cell lines\n")
  print(
    tagList(
      ## escape = FALSE so the colorized HTML sequence markup is rendered
      datatable(results[[currGroup]], escape = FALSE)
    )
  )
  cat("\n")
}
Perhaps, looking at the distance of SNPs relative to the edited gene might reveal a stronger signal.
## Gene ranges from the annotation DB (single-strand genes only; see message)
genes <- genes(txdb)
## 30 genes were dropped because they have exons located on both strands of the
## same reference sequence or on more than one reference sequence, so cannot be
## represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a GRangesList
## object, or use suppressMessages() to suppress this message.
## For each group: distances of the unique SNPs to the edited gene on the
## gene's chromosome, versus a 1000-iteration uniform bootstrap.
dists <- mclapply(names(groupUniqueSnps), function(currGroup) {
  ## Skip the groups with no single edited gene
  if (grepl("OTHERS|S_Group", currGroup)) {
    return(data.frame())
  }
  vcf <- groupUniqueSnps[[currGroup]]
  ## If not dealing with EWT, find the current gene location and remove the SNPs from the group
  currGroup2 <- sub("(EWT-|^)(.+)", "\\2", currGroup)
  gene <- sub("(.+)-.+", "\\1", currGroup2)
  ## NOTE(review): `$ENTREZ` relies on partial matching of ENTREZID, and `ez`
  ## stays undefined when the gene resolves through neither SYMBOL nor ALIAS.
  if (gene %in% keys(org.Hs.eg.db, "SYMBOL")) {
    ez <- select(org.Hs.eg.db, gene, "ENTREZID", "SYMBOL")$ENTREZ
  } else if (gene %in% keys(org.Hs.eg.db, "ALIAS")) {
    ez <- select(org.Hs.eg.db, gene, "ENTREZID", "ALIAS")$ENTREZ
  }
  suppressWarnings(ol <- findOverlaps(vcf, genes[ez]))
  if (length(queryHits(ol) > 0)) vcf <- vcf[-queryHits(ol)]
  ## Restrict to the edited gene's chromosome
  currChr <- as.character(seqnames(genes[ez]))
  subVCF <- vcf[seqnames(vcf) == currChr]
  if (length(subVCF) < 2) {
    return(data.frame())
  }
  starts <- start(subVCF)
  ## Distance to the nearest gene boundary, upstream or downstream
  f1 <- starts < start(genes[ez])
  dist <- c(start(genes[ez]) - starts[f1], starts[!f1] - end(genes[ez]))
  ## bootstrap
  ## NOTE(review): `target` is computed but never used; the bootstrap samples
  ## from the whole chromosome instead -- confirm whether that was intended.
  target <- c(1:start(genes[ez]), end(genes[ez]):seqlengths(vcf)[currChr])
  bootstrap <- unlist(lapply(seq_len(1000), function(i) {
    starts <- sort(sample(seqlengths(vcf)[currChr], length(subVCF)))
    f1 <- starts < start(genes[ez])
    dist <- c(start(genes[ez]) - starts[f1], starts[!f1] - end(genes[ez]))
  }))
  df <- data.frame(
    distance = c(dist, bootstrap),
    sample = c(rep("Sample", length(dist)), rep("Bootstrap", length(bootstrap)))
  )
}, mc.cores = detectCores(), mc.preschedule = FALSE)
names(dists) <- names(groupUniqueSnps)
## One log10-distance histogram per group; empty groups just get a message
void <- lapply(names(dists), function(group) {
  df <- dists[[group]]
  if (nrow(df) == 0) {
    cat(group, "have less than 2 SNPs near the edited gene\n")
  } else {
    p <- ggplot(df, aes(log10(distance), color = sample)) +
      geom_histogram(binwidth = 0.05) +
      facet_grid(rows = vars(sample), scales = "free_y") +
      labs(
        title = paste("Distance between SNPs for", group, "samples")
      )
    print(p)
  }
})
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 13 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 21 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 79 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
## EWT-OTHERS have less than 2 SNPs near the edited gene
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 227 rows containing non-finite outside the scale range
## (`stat_bin()`).
## S_Group have less than 2 SNPs near the edited gene
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 15 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 31 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).
Ok, keeping this here: initially I looked at the distance between SNPs within a group of samples instead of within a single sample. The impact is that you might get SNPs that are close to each other across samples. If the assumption is that they are caused by an initial edit of a chromatid followed by subsequent CRISPR edits in the neighboring DNA, then the within-group comparison does not make sense. ***
## Re-derive the gene ranges from the TxDb (the original line was a no-op
## self-assignment `genes <- genes`, presumably meant to refresh `genes`)
genes <- GenomicFeatures::genes(txdb)
## Distance between consecutive group-unique SNPs within each chromosome,
## with a uniform per-chromosome bootstrap as reference distribution.
dists <- mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  ## If not dealing with EWT, find the current gene location and remove the
  ## group SNPs falling within it
  if (!grepl("EWT|S_Group", currGroup)) {
    gene <- unique(sub("-.+", "", grep("EWT|parental", currGroup, value = TRUE, invert = TRUE)))
    ez <- NULL
    if (gene %in% keys(org.Hs.eg.db, "SYMBOL")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "SYMBOL")$ENTREZID
    } else if (gene %in% keys(org.Hs.eg.db, "ALIAS")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "ALIAS")$ENTREZID
    }
    if (!is.null(ez)) {
      suppressWarnings(ol <- findOverlaps(vcf, genes[ez]))
      if (length(queryHits(ol)) > 0) vcf <- vcf[-queryHits(ol)]
    }
  }
  ## Within-chromosome distances between consecutive SNP starts; unlist()
  ## drops the chromosomes with fewer than 2 SNPs
  dist <- unlist(tapply(start(vcf), seqnames(vcf), function(start) start[-1] - start[-length(start)]))
  ## Bootstrap: same number of SNPs per chromosome, uniformly placed
  chrCounts <- table(seqnames(vcf))
  chrCounts <- chrCounts[chrCounts != 0]
  bootstrap <- unlist(lapply(seq_len(1000), function(i) {
    unlist(lapply(names(chrCounts), function(chr) {
      start <- sort(sample(seqlengths(vcf)[chr], chrCounts[chr]))
      start[-1] - start[-length(start)]
    }))
  }))
  data.frame(
    distance = c(dist, bootstrap),
    sample = c(rep("Sample", length(dist)), rep("Bootstrap", length(bootstrap)))
  )
}, mc.cores = detectCores(), mc.preschedule = FALSE)
names(dists) <- names(groupUniqueSnps)
## Plot observed vs bootstrapped inter-SNP distance distributions per group
void <- lapply(names(dists), function(currGroup) {
  plt <- ggplot(dists[[currGroup]], aes(log10(distance), color = sample)) +
    geom_histogram(binwidth = 0.05) +
    facet_grid(rows = vars(sample), scales = "free_y") +
    labs(title = paste("Distance between SNPs for", currGroup, "samples"))
  print(plt)
})
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8
## [5] LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C.UTF-8
## [9] LC_ADDRESS=C.UTF-8 LC_TELEPHONE=C.UTF-8
## [11] LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C.UTF-8
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gridExtra_2.3
## [2] ape_5.8-1
## [3] fastreeR_1.12.0
## [4] ggplot2_3.5.2
## [5] htmltools_0.5.8.1
## [6] DT_0.33
## [7] org.Hs.eg.db_3.21.0
## [8] BSgenome.Hsapiens.UCSC.hg38_1.4.5
## [9] BSgenome_1.76.0
## [10] rtracklayer_1.68.0
## [11] BiocIO_1.18.0
## [12] TxDb.Hsapiens.UCSC.hg38.knownGene_3.21.0
## [13] GenomicFeatures_1.60.0
## [14] AnnotationDbi_1.70.0
## [15] data.table_1.17.0
## [16] VariantAnnotation_1.54.0
## [17] Rsamtools_2.24.0
## [18] Biostrings_2.76.0
## [19] XVector_0.48.0
## [20] SummarizedExperiment_1.38.0
## [21] Biobase_2.68.0
## [22] GenomicRanges_1.60.0
## [23] GenomeInfoDb_1.44.0
## [24] IRanges_2.42.0
## [25] S4Vectors_0.46.0
## [26] MatrixGenerics_1.20.0
## [27] matrixStats_1.5.0
## [28] BiocGenerics_0.54.0
## [29] generics_0.1.3
## [30] httpgd_2.0.4
##
## loaded via a namespace (and not attached):
## [1] farver_2.1.2 blob_1.2.4 R.utils_2.13.0
## [4] bitops_1.0-9 fastmap_1.2.0 RCurl_1.98-1.17
## [7] GenomicAlignments_1.44.0 XML_3.99-0.18 digest_0.6.37
## [10] lifecycle_1.0.4 KEGGREST_1.48.0 RSQLite_2.3.9
## [13] magrittr_2.0.3 compiler_4.5.0 rlang_1.1.6
## [16] sass_0.4.10 tools_4.5.0 yaml_2.3.10
## [19] knitr_1.50 labeling_0.4.3 S4Arrays_1.8.0
## [22] htmlwidgets_1.6.4 bit_4.6.0 curl_6.2.2
## [25] DelayedArray_0.34.1 RColorBrewer_1.1-3 abind_1.4-8
## [28] BiocParallel_1.42.0 unigd_0.1.3 withr_3.0.2
## [31] R.oo_1.27.0 grid_4.5.0 scales_1.4.0
## [34] cli_3.6.5 rmarkdown_2.29 crayon_1.5.3
## [37] httr_1.4.7 rjson_0.2.23 DBI_1.2.3
## [40] cachem_1.1.0 stringr_1.5.1 restfulr_0.0.15
## [43] vctrs_0.6.5 Matrix_1.7-3 jsonlite_2.0.0
## [46] bit64_4.6.0-1 crosstalk_1.2.1 systemfonts_1.2.2
## [49] jquerylib_0.1.4 glue_1.8.0 codetools_0.2-20
## [52] stringi_1.8.7 rJava_1.0-11 gtable_0.3.6
## [55] UCSC.utils_1.4.0 tibble_3.2.1 pillar_1.10.2
## [58] GenomeInfoDbData_1.2.14 R6_2.6.1 evaluate_1.0.3
## [61] lattice_0.22-7 R.methodsS3_1.8.2 png_0.1-8
## [64] memoise_2.0.1 bslib_0.9.0 Rcpp_1.0.14
## [67] nlme_3.1-168 SparseArray_1.8.0 xfun_0.52
## [70] pkgconfig_2.0.3
Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.
The iSCORE-PD_cells_grouped_with_guides.csv file is a comma-separated text file starting with a header and with one cell line per line with the following column header:
## Set a table of sample to group from one of the VCF file
## One bgzipped VCF per chromosome under SNV/
vcf.files <- list.files("SNV", "\\.vcf\\.gz$", full = TRUE)
## Read Hanqin csv file for the samples (sample/group/meta/guide columns)
samplesDT <- setDT(read.csv("supporting_files/iSCORE-PD_cells_grouped_with_guides.csv"))
## Print the table of samples
datatable(samplesDT)
## Flag the VCF rows where at least one of `samples` carries a called variant
## (any genotype other than hom-ref or uncalled) and the site quality passes
## the `qual` cutoff. Returns a logical vector, one entry per VCF row.
getSubsetFilter <- function(vcf, samples, qual) {
  gt <- as.data.frame(geno(vcf)$GT[, samples])
  noVariant <- c("./.", "0/0", "./0", "0/.")
  hasVariant <- apply(gt, 1, function(row) any(!row %in% noVariant))
  hasVariant & qual(vcf) >= qual
}
## For each non-parental group, return the SNPs that are called in the group
## but never called in the rest of the cohort (the "universe").
##
## vcf.file:  path to a (per-chromosome) VCF
## samplesDT: data.table with at least `sample`, `group` and `meta` columns
## qual:      minimum VCF QUAL for a call to be considered
## Returns a named list of VCF objects, one per group.
getUniqueSnps <- function(vcf.file, samplesDT, qual = 40) {
  ## Read the VCF
  vcf <- readVcf(vcf.file)
  groups <- samplesDT[group != "parental", unique(group), ]
  ## Roll through the groups and get their unique SNPs
  results <- lapply(groups, function(currGroup) {
    ## Get the sample names of the group under analysis
    currSamples <- samplesDT[currGroup == group, sample]
    ## Universe = every sample outside the current group and the EWT groups
    currUniverse <- samplesDT[-grep(paste0("(", currGroup, "|EWT)"), group), sample]
    ## Remove the meta samples for the EWTs...
    ## NOTE(review): the `if` assumes `meta` has a single unique value per
    ## group; mixed NA/non-NA meta would give a length > 1 condition -- verify
    if (!is.na(unique(samplesDT[group == currGroup, meta]))) {
      metaSamples <- samplesDT[meta %in% samplesDT[group == currGroup, meta], sample]
      currUniverse <- currUniverse[!currUniverse %in% metaSamples]
    }
    groupSnp_f <- getSubsetFilter(vcf, currSamples, qual)
    universeSnps_f <- getSubsetFilter(vcf, currUniverse, qual)
    ## Subset the vcf to the group SNPS: called in the group AND never
    ## called in the universe
    uniqueSnps <- vcf[groupSnp_f & !universeSnps_f]
    return(uniqueSnps)
  })
  names(results) <- groups
  return(results)
}
Now that we have the underlying algorithm, it’s time to roll over every VCF file for each chromosome (for a gain of speed, we’ll do that using parallel computing) and we will massage the data to return something consumable
## Only run if the cached data files are not already present on disk
resultToSave <- file.path("data", "cas-offinder_unique_SNPs_vs_all.Rds")
vcfToSave <- file.path("data", "cas-offinder_unique_SNPs_vs_all.vcf")
if (file.exists(resultToSave) && file.exists(vcfToSave)) {
  groupUniqueSnps <- readRDS(resultToSave)
} else {
  ## One getUniqueSnps() call per chromosome VCF, in parallel
  rawResults <- mclapply(vcf.files,
    getUniqueSnps,
    samplesDT = samplesDT,
    qual = 30,
    mc.cores = detectCores(),
    mc.preschedule = FALSE
  )
  ## Re-assemble per-group results across chromosomes
  groupUniqueSnps <- mclapply(names(rawResults[[1]]), function(group) {
    res <- do.call(rbind, lapply(rawResults, "[[", group))
    return(res)
  }, mc.cores = detectCores(), mc.preschedule = FALSE)
  names(groupUniqueSnps) <- names(rawResults[[1]])
  ## Single de-duplicated VCF with every group-unique SNP
  allUniqueVCF <- unique(do.call(rbind, groupUniqueSnps))
  dir.create("data", showWarnings = FALSE)
  saveRDS(groupUniqueSnps, resultToSave)
  writeVcf(allUniqueVCF, vcfToSave)
}
## Guide RNA sequences in the tab-delimited format expected by cas-offinder
sgRNA <- setDT(read.delim("./supporting_files/sgRNA.txt", header = FALSE))
datatable(sgRNA) %>%
  formatStyle(c("V1"), fontFamily = "monospace, monospace")
This is how cas-offinder was run on the GPU based instance in us-east-1 zone
cas-offinder sgRNA.txt G cas-offinder-out.txt
Display only the first 50 results to avoid bloating the HTML report
## cas-offinder output: PAM pattern, hit location, matched sequence, strand,
## mismatch count and guide id
casOff <- setDT(read.table("./supporting_files/cas-offinder-out.txt"))
setnames(casOff, c("PAM", "chr", "start", "seq", "strand", "mismatch", "id"))
## Shift the reported position toward the expected cut site (~3 bp from the
## PAM). NOTE(review): assumes cas-offinder reports the start of the matched
## sequence on the given strand -- verify against the cas-offinder docs
casOff[strand == "-", start := start + 3]
casOff[strand == "+", start := start + nchar(seq) - 3]
## 1-bp GRanges at the putative cut sites, keeping the hit annotations
casOffGR <- GRanges(
  seqnames = casOff[, chr],
  IRanges(casOff[, start], width = 1),
  strand = casOff[, strand],
  data.frame(seq = casOff[, seq], guide = casOff[, id], mismatch = casOff[, mismatch], PAM = casOff[, PAM])
)
datatable(head(casOff, 50)) %>%
  formatStyle(c("PAM", "seq"), fontFamily = "monospace, monospace")
genes <- genes(txdb)
## Gene symbols targeted by the CRISPR edits (group names look like "GENE-edit")
targetGenes <- unique(sub("-.+", "", grep("EWT|parental", names(groupUniqueSnps), value = TRUE, invert = TRUE)))
## Map symbols to Entrez ids, with an ALIAS fallback for symbols that are not
## primary HGNC symbols. Guarded: select() errors on an empty key vector.
symbol2ez <- select(org.Hs.eg.db, targetGenes, "ENTREZID", "SYMBOL")
missing_f <- is.na(symbol2ez$ENTREZID)
if (any(missing_f)) {
  symbol2ez$ENTREZID[missing_f] <- select(
    org.Hs.eg.db,
    symbol2ez$SYMBOL[missing_f], "ENTREZID", "ALIAS"
  )$ENTREZID
}
## Which cas-offinder hits fall within a targeted gene (any strand)?
ol <- findOverlaps(casOffGR, genes[symbol2ez$ENTREZID], ignore.strand = TRUE)
dt <- setDT(as.data.frame(ol))
dt$guide <- casOffGR[queryHits(ol)]$guide
dt$overlappingGene <- symbol2ez$SYMBOL[subjectHits(ol)]
datatable(dt[order(dt$guide), .(guide, overlappingGene)], options = list(pageLength = 10))
## Long format: one row per (cell line, guide slot)
dt.m1 <- melt(samplesDT,
  measure.vars = c("guide1", "guide2", "guide3"),
  variable.name = "guide",
  value.name = "guideId"
)
## Normalize guide ids: replace the first space with an underscore
dt.m1[, guideId := sub(" ", "_", guideId)]
## Derive the targeted gene from the group name ("GENE-edit")
dt.m1[grep("EWT|parental", group, invert = TRUE), gene := sub("-.+", "", group)]
gene2guide <- unique(dt.m1[, .(gene, group, guideId)])
gene2guide <- gene2guide[!is.na(gene)]
## For each cas-offinder hit, the gene its guide was designed against
dt$targetGene <- gene2guide[match(dt$guide, guideId), gene]
## Does any off-target hit of a guide fall in its own target gene?
datatable(dt[order(factor(guide)), .(casOffInTargetGene = any(unique(targetGene) %in% overlappingGene)), by = guide],
  options = list(pageLength = 25)
)
Let’s output the location, sequences and mismatches of the cas-offinder results falling within the targeted genes.
## cas-offinder hits that overlap the gene their own guide targets
subCasOffGR <- casOffGR[dt[overlappingGene == targetGene, queryHits]]
## Keep, per guide, only the hit(s) with the fewest mismatches
minMismatch_f <- unlist(tapply(subCasOffGR$mismatch, subCasOffGR$guide, function(x) x == min(x)))
## `ord` instead of `order` to avoid shadowing base::order
ord <- order(factor(subCasOffGR$guide))
datatable(
  data.table(
    guide = subCasOffGR$guide[ord],
    chr = as.character(seqnames(subCasOffGR))[ord],
    start = start(subCasOffGR)[ord],
    strand = as.character(strand(subCasOffGR))[ord],
    PAM = subCasOffGR$PAM[ord],
    seq = subCasOffGR$seq[ord],
    mismatch = subCasOffGR$mismatch[ord]
  )[minMismatch_f],
  options = list(pageLength = 25)
) %>%
  formatStyle(c("PAM", "seq"), fontFamily = "monospace, monospace")
The chance that a SNP falls in any 100 bp window is the same for every window: it’s a uniform distribution across the chromosome length. The rate at which a SNP should fall within the same 100 bp window as a predicted CRISPR off-target site is then one over the chromosome size divided by 100 bp, times the number of off-target sites on that chromosome (i.e. any one 100 bp window has the same probability of containing a SNP). Then, with the predicted rate, we can use the Poisson distribution to compute the probability of seeing a given number of observations.
## Window size (bp) for calling a SNP "close" to an off-target site
binSize <- 100
## Long format: one row per (cell line, guide slot)
dt.m1 <- melt(samplesDT,
  measure.vars = c("guide1", "guide2", "guide3"),
  variable.name = "guide",
  value.name = "guideId"
)
dt.m1[, guideId := sub(" ", "_", guideId)]
genes <- genes(txdb)
targetGenes <- unique(sub("-.+", "", grep("EWT|parental", names(groupUniqueSnps), value = TRUE, invert = TRUE)))
## Symbol -> Entrez id, with an ALIAS fallback for non-primary symbols.
## Guarded: select() errors on an empty key vector when nothing is missing.
symbol2ez <- select(org.Hs.eg.db, targetGenes, "ENTREZID", "SYMBOL")
missing_f <- is.na(symbol2ez$ENTREZID)
if (any(missing_f)) {
  symbol2ez$ENTREZID[missing_f] <- select(
    org.Hs.eg.db,
    symbol2ez$SYMBOL[missing_f], "ENTREZID", "ALIAS"
  )$ENTREZID
}
## Per group: count SNPs falling within binSize of a predicted off-target
## site of the group's guides, and the expected rate under a uniform model.
dd <- do.call(rbind, mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  targetedGene <- sub("-.+", "", subGroup <- sub("EWT-", "", currGroup))
  whithinTargetedGene <- findOverlaps(
    genes[symbol2ez$ENTREZID[symbol2ez$SYMBOL == targetedGene]],
    casOffGR,
    ignore.strand = TRUE
  )
  ## Exclude the on-target hits. Guarded: x[-integer(0)] returns an EMPTY
  ## object, so a gene with no internal hit must keep the full casOffGR.
  casOffOutsideGene <- if (length(subjectHits(whithinTargetedGene)) > 0) {
    casOffGR[-subjectHits(whithinTargetedGene)]
  } else {
    casOffGR
  }
  guides <- dt.m1[group == currGroup, unique(guideId)[unique(guideId) != ""]]
  if (length(guides) == 0) {
    return(data.table())
  }
  ## Get the chromosome names that have SNPs
  chrs <- table(seqnames(vcf))
  chrs <- names(chrs[chrs > 0])
  dd <- do.call(rbind, lapply(chrs, function(currChr) {
    subCasOffGR <- casOffOutsideGene[seqnames(casOffOutsideGene) == currChr & casOffOutsideGene$guide %in% guides]
    chrLength <- seqlengths(vcf)[currChr]
    ## Expected rate: (number of off-target sites) / (number of binSize
    ## windows on the chromosome)
    rate <- 1 / (chrLength / binSize) * sum(seqnames(subCasOffGR) == currChr)
    casOffStarts <- start(subCasOffGR)
    snpsStarts <- start(vcf[seqnames(vcf) == currChr])
    ## If no cas-offinder hits, return early
    if (length(casOffStarts) == 0) {
      return(data.table())
    }
    ## Distance from each off-target site to its closest SNP
    dists <- sapply(casOffStarts, function(start) min(abs(start - snpsStarts)))
    data.table(
      chr = currChr,
      close2offTarget = sum(dists <= binSize),
      predictedRate = rate
    )
  }))
  dd[, .(group = currGroup, events = sum(close2offTarget), rate = mean(predictedRate))]
}, mc.cores = detectCores(), mc.preschedule = FALSE))
## Drop groups for which no chromosome yielded a rate
dd <- dd[!is.na(rate)]
## Poisson upper-tail p-value for the observed number of close SNPs.
## NOTE(review): 1 - ppois(events, rate) is P(X > events); for the usual
## P(X >= events) this should be ppois(events - 1, rate, lower.tail = FALSE)
dd[events > 0, p.value.num := 1 - ppois(events, rate)]
dd[!is.na(p.value.num), p.value := format(p.value.num, digits = 3, scientific = TRUE)]
datatable(dd[, `:=`(rate = NULL, p.value.num = NULL)], options = list(pageLength = 25))
Ok, if this is true, doing this analysis with a CRISPR guide RNA that is not the one used for the edited cell line should not return significant values. Let’s try it.
binSize <- 100
dt.m1 <- melt(samplesDT,
  measure.vars = c("guide1", "guide2", "guide3"),
  variable.name = "guide",
  value.name = "guideId"
)
dt.m1[, guideId := sub(" ", "_", guideId)]
## Negative control: repeat the analysis 10 times with randomly drawn guides
## that were NOT used on the group; significant counts here would invalidate
## the test above.
dd <- do.call(rbind, lapply(seq_len(10), function(i) {
  dd <- do.call(rbind, mclapply(names(groupUniqueSnps), function(currGroup) {
    subGroup <- sub("EWT-", "", currGroup)
    guides <- dt.m1[group == currGroup, unique(guideId)[unique(guideId) != ""]]
    if (length(guides) == 0) {
      return(data.table())
    }
    vcf <- groupUniqueSnps[[currGroup]]
    ## randomly sample same number of guides not the original guides
    guides <- sample(dt.m1[, unique(guideId)[!unique(guideId) %in% c(guides, "")]], length(guides))
    ## Get the chromosome names that have SNPs
    chrs <- table(seqnames(vcf))
    chrs <- names(chrs[chrs > 0])
    dd <- do.call(rbind, lapply(chrs, function(currChr) {
      chrLength <- seqlengths(vcf)[currChr]
      ## BUGFIX: length(data.table) is the number of COLUMNS, not rows; use
      ## .N to count the cas-offinder hits on this chromosome
      rate <- 1 / (chrLength / binSize) * casOff[chr == currChr, .N]
      starts <- start(vcf[seqnames(vcf) == currChr])
      ## Distance from each SNP to the closest hit of the permuted guides
      dist <- sapply(starts, function(currStart) min(abs(casOff[chr == currChr & id %in% guides, start] - currStart)))
      data.table(
        chr = currChr,
        close2offTarget = sum(dist <= binSize),
        predictedRate = rate
      )
    }))
    dd[, .(group = currGroup, events = sum(close2offTarget), rate = mean(predictedRate))]
  }, mc.cores = detectCores(), mc.preschedule = FALSE))
  dd[, iter := paste0("iter_", i)]
}))
datatable(dcast(dd, group ~ iter, value.var = "events"), options = list(pageLength = 25))
binSize <- 100
dt.m1 <- melt(samplesDT,
  measure.vars = c("guide1", "guide2", "guide3"),
  variable.name = "guide",
  value.name = "guideId"
)
dt.m1[, guideId := sub(" ", "_", guideId)]
genes <- genes(txdb)
targetGenes <- unique(sub("-.+", "", grep("EWT|parental", names(groupUniqueSnps), value = TRUE, invert = TRUE)))
## Symbol -> Entrez id, with an ALIAS fallback for non-primary symbols.
## Guarded: select() errors on an empty key vector when nothing is missing.
symbol2ez <- select(org.Hs.eg.db, targetGenes, "ENTREZID", "SYMBOL")
missing_f <- is.na(symbol2ez$ENTREZID)
if (any(missing_f)) {
  symbol2ez$ENTREZID[missing_f] <- select(
    org.Hs.eg.db,
    symbol2ez$SYMBOL[missing_f], "ENTREZID", "ALIAS"
  )$ENTREZID
}
## Same setup as the rate analysis above, but output the detailed table of
## (off-target site, nearby SNP) pairs with genotypes of the group samples.
dd <- do.call(rbind, mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  targetedGene <- sub("-.+", "", subGroup <- sub("EWT-", "", currGroup))
  whithinTargetedGene <- findOverlaps(
    genes[symbol2ez$ENTREZID[symbol2ez$SYMBOL == targetedGene]],
    casOffGR,
    ignore.strand = TRUE
  )
  ## NOTE(review): if the targeted gene contains NO cas-offinder hit,
  ## casOffGR[-integer(0)] yields an EMPTY object instead of the full one --
  ## verify this case cannot happen here
  casOffOutsideGene <- casOffGR[-subjectHits(whithinTargetedGene)]
  guides <- dt.m1[group == currGroup, unique(guideId)[unique(guideId) != ""]]
  if (length(guides) == 0) {
    return(data.table())
  }
  ## Get the chromosome names that have SNPs
  chrs <- table(seqnames(vcf))
  chrs <- names(chrs[chrs > 0])
  dd <- do.call(rbind, lapply(chrs, function(currChr) {
    subCasOffGR <- casOffOutsideGene[seqnames(casOffOutsideGene) == currChr & casOffOutsideGene$guide %in% guides]
    casOffStarts <- start(subCasOffGR)
    snpsStarts <- start(vcf[seqnames(vcf) == currChr])
    ## If no cas-offinder hits, return early
    if (length(casOffStarts) == 0) {
      return(data.table())
    }
    ## Distance from each off-target site to its closest SNP...
    dists <- sapply(casOffStarts, function(start) min(abs(start - snpsStarts)))
    f.casOff <- dists <= binSize
    ## ...and the reciprocal: SNPs within binSize of an off-target site
    f.vcf <- sapply(snpsStarts, function(start) min(abs(start - casOffStarts)) <= binSize)
    if (sum(f.casOff) == 0) {
      return(data.table())
    }
    dt <- data.table(
      group = currGroup,
      guide = subCasOffGR$guide,
      chr = as.character(seqnames(subCasOffGR)),
      "cas-off start" = start(subCasOffGR),
      strand = as.character(strand(subCasOffGR)),
      PAM = subCasOffGR$PAM,
      seq = subCasOffGR$seq,
      mismatch = subCasOffGR$mismatch,
      dist = dists
    )[f.casOff]
    ## NOTE(review): the column binding below assumes sum(f.vcf) equals
    ## sum(f.casOff) (a 1-to-1 site/SNP pairing); a many-to-one match would
    ## make the lengths differ -- confirm
    dt$"SNP start" <- start(vcf[seqnames(vcf) == currChr][f.vcf])
    dt$REF <- as.character(ref(vcf[seqnames(vcf) == currChr][f.vcf]))
    dt$ALT <- sapply(alt(vcf[seqnames(vcf) == currChr][f.vcf]), paste0, collapse = ",")
    ## Genotypes of the group samples at the retained SNPs
    samples <- samplesDT[group == currGroup, sample]
    colFilt <- colnames(geno(vcf[seqnames(vcf) == currChr][f.vcf])$GT) %in% samples
    ## NOTE(review): with a single matching sample, [, colFilt] drops to a
    ## vector and the apply() calls below would fail -- confirm group sizes
    geno <- data.frame(geno(vcf[seqnames(vcf) == currChr][f.vcf])$GT)[, colFilt]
    dt$GT <- apply(geno, 1, function(x) paste0(unique(x[!x %in% c("0/0", "./.", "./0", "0/.")]), collapse = "\n"))
    dt$sample <- apply(geno, 1, function(x) paste0(names(x)[!x %in% c("0/0", "./.", "./0", "0/.")], collapse = "\n"))
    return(dt)
  }))
}, mc.cores = detectCores(), mc.preschedule = FALSE))
## Render the pairs table only when there is something to show
if (nrow(dd) > 0) {
  tbl <- datatable(dd[order(group)])
  formatStyle(tbl, c("PAM", "seq"), fontFamily = "monospace, monospace")
}
dd
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8
## [5] LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C.UTF-8
## [9] LC_ADDRESS=C.UTF-8 LC_TELEPHONE=C.UTF-8
## [11] LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C.UTF-8
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gridExtra_2.3
## [2] ape_5.8-1
## [3] fastreeR_1.12.0
## [4] ggplot2_3.5.2
## [5] htmltools_0.5.8.1
## [6] DT_0.33
## [7] org.Hs.eg.db_3.21.0
## [8] BSgenome.Hsapiens.UCSC.hg38_1.4.5
## [9] BSgenome_1.76.0
## [10] rtracklayer_1.68.0
## [11] BiocIO_1.18.0
## [12] TxDb.Hsapiens.UCSC.hg38.knownGene_3.21.0
## [13] GenomicFeatures_1.60.0
## [14] AnnotationDbi_1.70.0
## [15] data.table_1.17.0
## [16] VariantAnnotation_1.54.0
## [17] Rsamtools_2.24.0
## [18] Biostrings_2.76.0
## [19] XVector_0.48.0
## [20] SummarizedExperiment_1.38.0
## [21] Biobase_2.68.0
## [22] GenomicRanges_1.60.0
## [23] GenomeInfoDb_1.44.0
## [24] IRanges_2.42.0
## [25] S4Vectors_0.46.0
## [26] MatrixGenerics_1.20.0
## [27] matrixStats_1.5.0
## [28] BiocGenerics_0.54.0
## [29] generics_0.1.3
## [30] httpgd_2.0.4
##
## loaded via a namespace (and not attached):
## [1] farver_2.1.2 blob_1.2.4 R.utils_2.13.0
## [4] bitops_1.0-9 fastmap_1.2.0 RCurl_1.98-1.17
## [7] GenomicAlignments_1.44.0 XML_3.99-0.18 digest_0.6.37
## [10] lifecycle_1.0.4 KEGGREST_1.48.0 RSQLite_2.3.9
## [13] magrittr_2.0.3 compiler_4.5.0 rlang_1.1.6
## [16] sass_0.4.10 tools_4.5.0 yaml_2.3.10
## [19] knitr_1.50 labeling_0.4.3 S4Arrays_1.8.0
## [22] htmlwidgets_1.6.4 bit_4.6.0 curl_6.2.2
## [25] DelayedArray_0.34.1 RColorBrewer_1.1-3 abind_1.4-8
## [28] BiocParallel_1.42.0 unigd_0.1.3 withr_3.0.2
## [31] R.oo_1.27.0 grid_4.5.0 scales_1.4.0
## [34] cli_3.6.5 rmarkdown_2.29 crayon_1.5.3
## [37] httr_1.4.7 rjson_0.2.23 DBI_1.2.3
## [40] cachem_1.1.0 stringr_1.5.1 restfulr_0.0.15
## [43] vctrs_0.6.5 Matrix_1.7-3 jsonlite_2.0.0
## [46] bit64_4.6.0-1 crosstalk_1.2.1 systemfonts_1.2.2
## [49] jquerylib_0.1.4 glue_1.8.0 codetools_0.2-20
## [52] stringi_1.8.7 rJava_1.0-11 gtable_0.3.6
## [55] UCSC.utils_1.4.0 tibble_3.2.1 pillar_1.10.2
## [58] GenomeInfoDbData_1.2.14 R6_2.6.1 evaluate_1.0.3
## [61] lattice_0.22-7 R.methodsS3_1.8.2 png_0.1-8
## [64] memoise_2.0.1 bslib_0.9.0 Rcpp_1.0.14
## [67] nlme_3.1-168 SparseArray_1.8.0 xfun_0.52
## [70] pkgconfig_2.0.3
Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.
The iSCORE-PD_design.csv file is a comma-separated text file starting with a header and with one cell line per line with the following columns:
## Sample-to-group design table (one cell line per row)
samplesDT <- setDT(read.csv("./supporting_files/iSCORE-PD_design.csv"))
datatable(samplesDT)
Many SNPs in the samples have more than one GT — more than the number of unique SNPs. What’s going on?
Ok, Let’s look at chromosome 1 only
## Focus on chromosome 1 only
vcf <- readVcf(vcf.files[1])
groups <- samplesDT[!group %in% c("parental", "EWT"), unique(group), ]
## Get the sample names of the group under analysis.
## BUGFIX: samplesDT[-grep(...)] returns an EMPTY table when grep() finds no
## match (x[-integer(0)] is empty); !grepl() keeps the whole table instead.
currUniverse <- samplesDT[!grepl("EWT", group), samples]
asSNPs <- apply(geno(vcf)$GT[, currUniverse], 1, function(x) !all(x %in% c("./.", "0/0", "./0", "0/.")))
vcf_with_snps <- vcf[asSNPs & qual(vcf) >= 30]
asSameGT <- apply(geno(vcf_with_snps)$GT[, currUniverse], 1, function(x) length(unique(x)) == 1)
## Fraction of SNPs that have the same GT over all the samples
sum(asSameGT) / length(asSameGT)
## [1] 0.9096058
geno <- geno(vcf_with_snps[!asSameGT])$GT[, currUniverse]
## All genotypes expressible with up to 5 alleles (0-4)
lvls <- combn(0:4, 2, paste, collapse = "/")
lvls <- c(lvls, sapply(0:4, function(x) paste(rep(x, 2), collapse = "/")))
## Distinct fully-called genotypes per SNP.
## BUGFIX: x[-grep("\\.", x)] returns an EMPTY vector for rows where no
## genotype contains "."; x[!grepl("\\.", x)] keeps them all as intended
## (the original also had a stray trailing comma in the grep() call).
unique_geno <- apply(geno, 1, function(x) unique(x[!grepl("\\.", x)]))
## Fraction of SNPs where all genotypes have an uncalled allele
sum(!elementNROWS(unique_geno) > 1) / length(unique_geno)
## [1] 0.6940722
unique_geno <- unique_geno[elementNROWS(unique_geno) > 1]
length(unique_geno)
## [1] 9429
It turns out that a large fraction of the discordant GTs are transitions between Het and Homo.
## Per-SNP string of its distinct fully-called genotypes, e.g. "0/1|1/1"
dt1 <- data.table(
  snp = names(unique_geno),
  gts = sapply(unique_geno, function(x) paste(factor(x, levels = lvls), collapse = "|"))
)
## Count how many SNPs show each genotype combination
dt <- data.table(table(sapply(unique_geno, function(x) paste(factor(x, levels = lvls), collapse = "|"))))
setnames(dt, "V1", "gts")
dt[, gtCount := elementNROWS(strsplit(gts, "\\|"))]
dt[, alleleCount := unlist(lapply(strsplit(gts, "\\||/"), function(x) length(unique(x))))]
## Flag two-genotype, two-allele combinations that share an allele, i.e. a
## Het <-> Homo transition.
## BUGFIX: the default was "No" while the other branch produced "yes", so
## factor(levels = c("yes", "no")) silently mapped "No" to NA; also the
## lambda returned an invisible NULL (a list) when the test was FALSE. Use a
## consistent lower-case "no" and vapply() returning character(1).
dt[, homo2het := "no"]
dt[!(alleleCount > 2 | gtCount > 2), homo2het := vapply(strsplit(gts, "\\|"), function(y) {
  yss <- lapply(strsplit(y, "/"), unique)
  if (any(yss[[1]] %in% yss[[2]] | yss[[2]] %in% yss[[1]])) "yes" else "no"
}, character(1))]
dt[order(factor(homo2het, levels = c("yes", "no")))]
dt[homo2het == "yes", sum(N)] / dt[, sum(N)]
## [1] 0.5698377
Most likely, the source of these transitions is homologous DNA repair: during replication a break happened and was resected, then repaired using the homologous chromatid. Any Het SNP within the resected region will be converted to the reciprocal allele, converting the site from Het to Homo.
If the hypothesis is true, there is a greater likelihood of Het-to-Homo transitions for SNPs that are closer to each other. Let’s look at that.
## Distance from each SNP to the next one (the last SNP is measured against
## the chromosome end). Only valid for a single-chromosome VCF (chr1 here).
dist2nextDF <- function(vcf, snpIds, label) {
  start1 <- start(vcf[snpIds])
  start2 <- c(start1[-1], seqlengths(vcf)[unique(seqnames(vcf))])
  data.frame(dist2next = start2 - start1, type = label)
}
homo_2_het_snps <- dt1[gts %in% dt[homo2het == "yes", gts], snp]
not_homo_2_het_snps <- dt1[!gts %in% dt[homo2het == "yes", gts], snp]
## Compare inter-SNP distances between Het<->Homo SNPs and the rest
df <- rbind(
  dist2nextDF(vcf, homo_2_het_snps, "homo2het"),
  dist2nextDF(vcf, not_homo_2_het_snps, "notHomo2het")
)
p <- ggplot(df, aes(log10(dist2next), color = type)) +
  geom_density()
print(p)
Let’s take the SNPs that have samples with genotype transitions from Het to Homo and look at the read-depth coverage of the samples carrying the major GT (i.e. samples from the group with the most common of the two GTs) compared to the samples with the minor GT (i.e. samples from the group with the least common of the two GTs). Also, as a control, let’s compute the read coverage from SNPs where all the genotypes are identical (we’ll do some sampling here to reduce the size of the final data frame).
## For each Het<->Homo genotype combination seen at more than 10 SNPs, gather
## the deepVariant read depth (DP) for samples carrying the major GT, the
## minor GT, and, as a control, for randomly sampled concordant SNPs.
df <- do.call(rbind, mclapply(dt[homo2het == "yes" & N > 10, gts], function(currCase) {
  ## The two genotypes of the current combination, e.g. c("0/1", "1/1")
  gts <- strsplit(currCase, "\\|")[[1]]
  ## Restrict to the SNPs of this combination and the universe samples
  currVcf <- vcf[dt1[gts == currCase, snp], currUniverse]
  ## Hoist the matrices out of the per-SNP loops (the original re-extracted
  ## geno(currVcf)$GT/$DP on every iteration)
  gtMat <- geno(currVcf)$GT
  dpMat <- geno(currVcf)$DP
  ## Depth of the samples carrying the MORE frequent of the two genotypes
  majorDP <- unlist(lapply(seq_len(length(currVcf)), function(i) {
    major <- which.max(c(sum(gtMat[i, ] == gts[1]), sum(gtMat[i, ] == gts[2])))
    f <- gtMat[i, ] == gts[major]
    dpMat[i, f]
  }))
  ## Depth of the samples carrying the LESS frequent genotype
  ## (FIX: the original named this which.min() index "major"; renamed to
  ## "minor" for clarity and consistency with the AD analysis below)
  minorDP <- unlist(lapply(seq_len(length(currVcf)), function(i) {
    minor <- which.min(c(sum(gtMat[i, ] == gts[1]), sum(gtMat[i, ] == gts[2])))
    f <- gtMat[i, ] == gts[minor]
    dpMat[i, f]
  }))
  ## Control: SNPs where every universe sample shares one of the two GTs;
  ## sample as many of them as there are SNPs in the current combination
  f <- apply(geno(vcf[, currUniverse])$GT, 1, function(x) length(unique(x)) == 1 && unique(x) %in% gts)
  uniqueDP <- as.vector(geno(vcf[f][sample(sum(f), length(currVcf))])$DP)
  ## Long-format data frame consumed by the plots below
  df <- data.frame(
    DP = c(majorDP, minorDP, uniqueDP),
    sample = c(
      rep("Major GT", length(majorDP)),
      rep("Minor GT", length(minorDP)),
      rep("Unique GT", length(uniqueDP))
    )
  )
}, mc.preschedule = FALSE, mc.cores = detectCores()))
## Density of read depth per GT class, truncated at DP = 60 for readability
p <- ggplot(df, aes(DP, color = sample)) +
  geom_density() +
  xlim(0, 60) +
  labs(
    ## FIX: typos in the original labels ("squencing", "Read Deapth")
    title = "Distributions of sequencing depth (deepVariant DP) between samples from major, minor, or common GT",
    x = "Read Depth"
  )
print(p)
## Warning: Removed 3211 rows containing non-finite outside the scale range
## (`stat_density()`).
## Numeric summary of the same three distributions
tapply(df$DP, df$sample, summary)
## $`Major GT`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 18.00 25.00 25.24 31.00 247.00
##
## $`Minor GT`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 14.0 20.0 21.4 27.0 243.0
##
## $`Unique GT`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 25.00 30.00 30.55 35.00 275.00
Looking at the same SNPs as above but computing the ratio of read depth for one allele compared to the other allele.
## Same stratification as the DP analysis above, but using the deepVariant
## allele depths (AD): the value computed throughout is x[1] / sum(x), the
## fraction of reads supporting the first listed allele at each sample/SNP.
df <- do.call(rbind, mclapply(dt[homo2het == "yes" & N > 10, gts], function(currCase) {
## The two genotypes of the current combination, e.g. c("0/1", "1/1")
gts <- strsplit(currCase, "\\|")[[1]]
## Restrict to the SNPs of this combination and the universe samples
currVcf <- vcf[dt1[gts == currCase, snp], currUniverse]
## Allele-depth fraction for samples carrying the MORE frequent genotype
majorAD <- unlist(sapply(seq_len(length(currVcf)), function(i) {
major <- which.max(c(sum(geno(currVcf)$GT[i, ] == gts[1]), sum(geno(currVcf)$GT[i, ] == gts[2])))
f <- geno(currVcf)$GT[i, ] == gts[major]
## NOTE(review): AD is iterated with sapply(AD, ...), so it appears to be a
## list of per-allele depth vectors here — confirm against the VCF header
AD <- geno(currVcf)$AD[i, f]
sapply(AD, function(x) x[1] / sum(x))
}))
## Allele-depth fraction for samples carrying the LESS frequent genotype
minorAD <- unlist(sapply(seq_len(length(currVcf)), function(i) {
minor <- which.min(c(sum(geno(currVcf)$GT[i, ] == gts[1]), sum(geno(currVcf)$GT[i, ] == gts[2])))
f <- geno(currVcf)$GT[i, ] == gts[minor]
AD <- geno(currVcf)$AD[i, f]
sapply(AD, function(x) x[1] / sum(x))
}))
## Control: sample as many concordant SNPs (all universe samples share one of
## the two GTs) as there are SNPs in the current combination
f <- apply(geno(vcf[, currUniverse])$GT, 1, function(x) length(unique(x)) == 1 && unique(x) %in% gts)
sampleVcf <- vcf[f][sample(sum(f), length(currVcf))]
uniqueAD <- unlist(sapply(seq_len(length(sampleVcf)), function(i) {
AD <- geno(sampleVcf)$AD[i, ]
sapply(AD, function(x) x[1] / sum(x))
}))
## Long-format data frame consumed by the plots below
df <- data.frame(
AD = c(majorAD, minorAD, uniqueAD),
sample = c(
rep("Major AD", length(majorAD)),
rep("Minor AD", length(minorAD)),
rep("Unique AD", length(uniqueAD))
)
)
}, mc.preschedule = FALSE, mc.cores = detectCores()))
## Histogram of the allele-depth fraction per GT class
ggplot(df, aes(AD)) +
  geom_histogram() +
  facet_grid(rows = vars(sample), scales = "free") +
  labs(
    title = "Distributions of fraction of Allele Depth (deepVariant AD)\n between samples from major, minor, or common GT", # nolint: line_length_linter.
    ## FIX: the original x label said "Read Deapth"; the plotted quantity is
    ## the fraction AD[1] / sum(AD), not a raw read depth
    x = "Allele Depth fraction"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3839 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Numeric summary of the same three distributions
tapply(df$AD, df$sample, summary)
## $`Major AD`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.2222 0.4737 0.4538 0.6667 1.0000 3652
##
## $`Minor AD`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.2000 0.5385 0.5445 1.0000 1.0000 185
##
## $`Unique AD`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00000 0.07692 0.45455 0.37394 0.54167 1.00000 2
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8
## [5] LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C.UTF-8
## [9] LC_ADDRESS=C.UTF-8 LC_TELEPHONE=C.UTF-8
## [11] LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C.UTF-8
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gridExtra_2.3
## [2] ape_5.8-1
## [3] fastreeR_1.12.0
## [4] ggplot2_3.5.2
## [5] htmltools_0.5.8.1
## [6] DT_0.33
## [7] org.Hs.eg.db_3.21.0
## [8] BSgenome.Hsapiens.UCSC.hg38_1.4.5
## [9] BSgenome_1.76.0
## [10] rtracklayer_1.68.0
## [11] BiocIO_1.18.0
## [12] TxDb.Hsapiens.UCSC.hg38.knownGene_3.21.0
## [13] GenomicFeatures_1.60.0
## [14] AnnotationDbi_1.70.0
## [15] data.table_1.17.0
## [16] VariantAnnotation_1.54.0
## [17] Rsamtools_2.24.0
## [18] Biostrings_2.76.0
## [19] XVector_0.48.0
## [20] SummarizedExperiment_1.38.0
## [21] Biobase_2.68.0
## [22] GenomicRanges_1.60.0
## [23] GenomeInfoDb_1.44.0
## [24] IRanges_2.42.0
## [25] S4Vectors_0.46.0
## [26] MatrixGenerics_1.20.0
## [27] matrixStats_1.5.0
## [28] BiocGenerics_0.54.0
## [29] generics_0.1.3
## [30] httpgd_2.0.4
##
## loaded via a namespace (and not attached):
## [1] farver_2.1.2 blob_1.2.4 R.utils_2.13.0
## [4] bitops_1.0-9 fastmap_1.2.0 RCurl_1.98-1.17
## [7] GenomicAlignments_1.44.0 XML_3.99-0.18 digest_0.6.37
## [10] lifecycle_1.0.4 KEGGREST_1.48.0 RSQLite_2.3.9
## [13] magrittr_2.0.3 compiler_4.5.0 rlang_1.1.6
## [16] sass_0.4.10 tools_4.5.0 yaml_2.3.10
## [19] knitr_1.50 labeling_0.4.3 S4Arrays_1.8.0
## [22] htmlwidgets_1.6.4 bit_4.6.0 curl_6.2.2
## [25] DelayedArray_0.34.1 RColorBrewer_1.1-3 abind_1.4-8
## [28] BiocParallel_1.42.0 unigd_0.1.3 withr_3.0.2
## [31] R.oo_1.27.0 grid_4.5.0 scales_1.4.0
## [34] cli_3.6.5 rmarkdown_2.29 crayon_1.5.3
## [37] httr_1.4.7 rjson_0.2.23 DBI_1.2.3
## [40] cachem_1.1.0 stringr_1.5.1 restfulr_0.0.15
## [43] vctrs_0.6.5 Matrix_1.7-3 jsonlite_2.0.0
## [46] bit64_4.6.0-1 crosstalk_1.2.1 systemfonts_1.2.2
## [49] jquerylib_0.1.4 glue_1.8.0 codetools_0.2-20
## [52] stringi_1.8.7 rJava_1.0-11 gtable_0.3.6
## [55] UCSC.utils_1.4.0 tibble_3.2.1 pillar_1.10.2
## [58] GenomeInfoDbData_1.2.14 R6_2.6.1 evaluate_1.0.3
## [61] lattice_0.22-7 R.methodsS3_1.8.2 png_0.1-8
## [64] memoise_2.0.1 bslib_0.9.0 Rcpp_1.0.14
## [67] nlme_3.1-168 SparseArray_1.8.0 xfun_0.52
## [70] pkgconfig_2.0.3