Comparative analsyis of mutation load in different CRISPR edited cell lines

Groupwise SNPs

Intro

Data.table linking samples to groups

Rational:

Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.

Format:

The iSCORE-PD_design.csv file is a comma seperated text file starting with a header and with one cell line per line with the following columns:

samples: Sample ID found in VCF header of the joint genotyping output file
group: Group ID linking the sample in group
meta: Additional group relation. Not used anymore

samplesDT <- setDT(read.csv("./supporting_files/iSCORE-PD_design.csv"))
datatable(samplesDT)

Unique Snps

Find SNPs common to a set of samples

Strategy:

Read the VCF for a given chromosome
Remove the SNPs with a Qual less than 30
Subset the SNPs for the samples of a given group
Remove the SNPs where all the calls are missing or REF (GT %in% ./., 0/0, 0/., ./0)
Define the sample part of the univers. ie all samples excluding EWTs
If a sample is part of the meta colums, remove the samples of the meta from the universeSnps
Subset the SNPs for the samples of a given group
Remove the SNPs where all the calls are missing or REF (GT %in% ./., 0/0, 0/., ./0)
Returns the SNPs in the Group not present in the Universe

getSubsetFilter <- function(vcf, samples, qual) {
  subset <- as.data.frame(geno(vcf)$GT[, samples])
  filter <- !apply(subset, 1, function(x) all(x %in% c("./.", "0/0", "./0", "0/."))) & qual(vcf) >= qual
  return(filter)
}

getUniqueSnps <- function(vcf.file, samplesDT, qual = 40) {
  ## Read the VCF
  vcf <- readVcf(vcf.file)

  groups <- samplesDT[group != "parental", unique(group), ] # nolint: object_usage_linter.
  ## Roll through the groups and get their unique SNPs
  results <- lapply(groups, function(currGroup) {
    ## Get the sample names of the group under analysis
    currSamples <- samplesDT[currGroup == group, samples] # nolint: object_usage_linter.
    currUniverse <- samplesDT[currGroup != group, samples] # nolint: object_usage_linter.
    ## Remove the meta samples for the EWTs...
    if (!is.na(unique(samplesDT[group == currGroup, meta]))) { # nolint: object_usage_linter.
      metaSamples <- samplesDT[meta %in% samplesDT[group == currGroup, meta], samples] # nolint: object_usage_linter.
      currUniverse <- currUniverse[!currUniverse %in% metaSamples]
    }

    groupSnp_f <- getSubsetFilter(vcf, currSamples, qual)
    universeSnps_f <- getSubsetFilter(vcf, currUniverse, qual)

    ## Subset the vcf to the group SNPS
    uniqueSnps <- vcf[groupSnp_f & !universeSnps_f]

    return(uniqueSnps)
  })

  names(results) <- groups
  return(results)
}

Computing the unique SNPs for each group

Rational:

Now that we have the underlying algorithm, it’s time to roll over every VCF file for each choromsome (For a gain of speed, we’ll do that using parallel computing) and we will massage the data to return something consumable

Strategy:

Define the parental samples
Find the VCF files
Reading the header of one VCF file, get the group tags
In parallel, computed the unique SNPs VCF and stats
Massage the data for compsution (and save the VCF to HD)

## Only run if data files are no present
resultToSave <- file.path("data", "unique_SNPs_vs_all.Rds")
vcfToSave <- file.path("data", "unique_SNPs_vs_all.vcf")
if (file.exists(resultToSave) && file.exists(vcfToSave)) {
  groupUniqueSnps <- readRDS(resultToSave)
  allUniqueVCF <- readVcf(vcfToSave)
} else {
  vcf.files <- list.files("SNV", "\\.vcf\\.gz$", full = TRUE)
  rawResults <- mclapply(vcf.files,
    getUniqueSnps,
    samplesDT = samplesDT,
    qual = 30,
    mc.cores = detectCores(),
    mc.preschedule = FALSE
  )

  groupUniqueSnps <- mclapply(names(rawResults[[1]]), function(group) {
    res <- do.call(rbind, lapply(rawResults, "[[", group))
    return(res)
  }, mc.cores = detectCores(), mc.preschedule = FALSE)

  names(groupUniqueSnps) <- names(rawResults[[1]])
  dir.create("data", showWarnings = FALSE)
  saveRDS(groupUniqueSnps, resultToSave)

  allUniqueVCF <- unique(do.call(rbind, groupUniqueSnps))
  writeVcf(allUniqueVCF, vcfToSave)
}

All Intersections

Generating the intersections for all the samples but parental

vcf <- allUniqueVCF
currSamples <- samplesDT[group != "parental", samples]

dt <- cbind(
  allele = rownames(geno(vcf)$GT),
  data.table(geno(vcf)$GT)[, ..currSamples]
)
geno <- melt(
  dt,
  id.vars = "allele",
  measure.vars = currSamples,
  variable.name = "sample",
  value.name = "geno"
)
geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]
samples <- geno[, unique(sample)]

codes <- do.call(c, lapply(seq_len(floor(length(samples) / 26) + 1), function(i) {
  if (26 * i < length(samples)) {
    end <- 26
  } else {
    end <- length(samples) - 26 * (i - 1)
  }
  sapply(LETTERS[seq_len(end)], function(x) paste(rep(x, i), collapse = ""))
}))

void <- lapply(seq_along(codes), function(i) {
  geno[sample == samples[i], sub := codes[i]]
})

combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")
combn <- unique(combi)
combn <- combn[order(elementNROWS(strsplit(combn, "-")))]
combi <- factor(combi, levels = combn)

dt <- data.table(Combination = combi)

dt1 <- dt[!is.na(Combination), .N, by = Combination]
dt1[, nSample := elementNROWS(strsplit(as.character(Combination), "-"))]
dt1 <- dt1[order(factor(Combination, levels = combn))]
legend <- data.table(Code = codes, Sample = samples)

cat("\n")

cat("#### Number of SNPs shared among all edited cell lines\n")

Number of SNPs shared among all edited cell lines

cat("##### Id to sample legend\n")

Id to sample legend

print(tagList(datatable(legend)))

cat("##### Number of SNPs per combination for all edited cell lines\n")

Number of SNPs per combination for all edited cell lines

print(tagList(datatable(dt1, options = list(order = list(list(2, "desc"))))))

cat("\n")

Phylo Tree

Computing the phylogenic relationship of the different cell lines

Rational:

Strategy:

Define the parental samples
Find the VCF files
Reading the header of one VCF file, get the group tags
In parallel, computed the unique SNPs VCF and stats
Massage the data for compsution (and save the VCF to HD)

Phylogenetic tree from only the SNPs unique to the edited cell lines

myVcfDist <- fastreeR::vcf2dist(inputFile = vcfToSave, threads = 2)
myVcfTree <- fastreeR::dist2tree(inputDist = myVcfDist)

plot(ape::read.tree(text = myVcfTree), direction = "rightwards", cex = 0.8)
ape::add.scale.bar()
ape::axisPhylo(side = 3)

SNPs annotation

Get the count of SNPs falling on different genomic features (Transcript centric)

Rational:

What type of genomic features are affected by the SNPS in each group

Strategy:

Load the genomic location of all gene and feature from the Hg38 TxDb library
Roll over each group from the unique SNPs identified earlier
Remove the sequences that are not found in the TxDb object (lots of decoy sequences)
Get all the features overlaping a SNPs

allGroup <- mclapply(groupUniqueSnps, function(vcf) {
  seqlevels(vcf) <- seqlevels(vcf)[seqlevels(vcf) %in% seqlevels(txdb)]
  all <- locateVariants(vcf, txdb, AllVariants())
  ## UPDATE 02/28/25
  ## There's a bug with locateVariants(), intronic variants are wrong. Let's recode them
  all.noInt <- all[all$LOCATION != "intron"]

  ## Copying from the the VariantAnnotation repo, from the variantLocation() method
  subject <- intronsByTranscript(txdb)
  ## Remove items with no GRanges, that's the bug
  subject <- subject[elementNROWS(subject) > 0]
  query <- rowRanges(vcf)
  vtype <- "intron"
  ignore.strand <- FALSE
  asHits <- FALSE

  .location <-
    function(length = 0, value = NA) {
      levels <- c(
        "spliceSite", "intron", "fiveUTR", "threeUTR",
        "coding", "intergenic", "promoter"
      )
      factor(rep(value, length), levels = levels)
    }

  map <- mapToTranscripts(unname(query), subject,
    ignore.strand = ignore.strand
  )
  if (length(map) > 0) {
    xHits <- map$xHits
    txHits <- map$transcriptsHits
    tx <- names(subject)[txHits]
    if (!is.null(tx)) {
      txid <- tx
    } else {
      txid <- NA_integer_
    }
    ## FIXME: cdsid is expensive
    cdsid <- IntegerList(integer(0))

    ss <- runValue(strand(subject)[txHits])
    if (any(elementNROWS(ss) > 1L)) {
      warning(
        "'subject' has multiple strands per list element; ",
        "setting strand to '*'"
      )
      sstrand <- Rle("*", length(txHits))
    }
    sstrand <- unlist(ss, use.names = FALSE)
    res <- GRanges(
      seqnames = seqnames(query)[xHits],
      ranges = IRanges(ranges(query)[xHits]),
      strand = sstrand,
      LOCATION = .location(length(xHits), vtype),
      LOCSTART = start(map),
      LOCEND = end(map),
      QUERYID = xHits,
      TXID = txid,
      CDSID = cdsid,
      GENEID = NA_character_,
      PRECEDEID = CharacterList(character(0)),
      FOLLOWID = CharacterList(character(0))
    )
  }
  res$GENEID <- select(
    txdb, as.character(res$TXID),
    "GENEID", "TXID"
  )$GENEID

  ## Add back the intronic location and reorder the GRanges based on the location
  all <- c(all.noInt, res)
  all <- all[order(all)]
}, mc.cores = detectCores(), mc.preschedule = FALSE)

Tabulate the count of features per group

featuresPerGroup <- do.call(rbind, lapply(names(allGroup), function(group) {
  all <- allGroup[[group]]
  df <- as.data.frame(table(all$LOCATION))
  names(df) <- c("feature", "count")
  df$group <- group
  setDT(df)
}))

datatable(dcast(featuresPerGroup, group ~ feature, value.var = "count"))

Samples to code legend for the different edited cell line groups

rawResult <- lapply(names(allGroup), function(currGroup) {
  ## Get the combination of samples for each SNPs
  vcf <- groupUniqueSnps[[currGroup]]
  currSamples <- samplesDT[group == currGroup, samples]
  dt <- cbind(
    allele = rownames(geno(vcf)$GT),
    data.table(geno(vcf)$GT)[, ..currSamples]
  )
  geno <- melt(
    dt,
    id.vars = "allele",
    measure.vars = currSamples,
    variable.name = "sample",
    value.name = "geno"
  )
  geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]

  samples <- geno[, unique(sample)]
  codes <- LETTERS[seq_along(samples)]

  void <- lapply(seq_along(codes), function(i) {
    geno[sample == samples[i], sub := codes[i]]
  })

  combn <- unlist(lapply(do.call(
    c,
    lapply(seq_along(codes), combn, x = codes, simplify = FALSE)
  ), paste, collapse = "-"))

  combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")

  all <- allGroup[[currGroup]]
  dt <- do.call(rbind, lapply(levels(all$LOCATION), function(feature) {
    f <- all$LOCATION == feature

    gid <- all[f]$GENEID
    snpID <- names(all)[f] ## UPDATE 12/20/24
    snpLocation <- paste0(seqnames(all[f]), ":", start(all[f]), "-", end(all[f])) ## UPDATE 12/20/24

    f2 <- !is.na(gid)
    gid <- gid[f2]
    snpID <- snpID[f2]
    snpLocation <- snpLocation[f2] ## UPDATE 12/20/24

    dt <- data.table(select(org.Hs.eg.db, gid, c("SYMBOL", "GENENAME", "GENETYPE"), "ENTREZID"))
    dt$group <- currGroup
    dt$feature <- feature
    dt$location <- snpLocation
    dt$combi <- combi[snpID]
    unique(dt[, ENTREZID := NULL])
  }))
  legend <- data.table(
    group = currGroup,
    sample = samples,
    code = codes
  )
  list(legend = legend, dt = dt)
})
names(rawResult) <- names(allGroup)

datatable(do.call(rbind, lapply(rawResult, "[[", "legend")))

List the genes affected by the SNPs in each type of feature (excluding introns and promoter)

write.csv(do.call(rbind, lapply(rawResult, "[[", "legend")), file.path("data", "SNP_annotations_grouping_keys.csv"))
write.csv(do.call(rbind, lapply(rawResult, "[[", "dt")), file.path("data", "SNP_annotations.csv"))

for (currGroup in names(rawResult)) {
  cat("\n")
  cat("#### List of Genes with SNPs in", currGroup, "edited cell lines\n")
  print(
    tagList(
      datatable(rawResult[[currGroup]]$dt[!feature %in% c("promoter", "intron")])
    )
  )
  cat("\n")
}

List of Genes with SNPs in ATP13A2 edited cell lines

List of Genes with SNPs in DJ1 edited cell lines

List of Genes with SNPs in DNAJC6 edited cell lines

List of Genes with SNPs in SNCA-A30P edited cell lines

List of Genes with SNPs in GBA1 edited cell lines

List of Genes with SNPs in EWT-OTHERS edited cell lines

List of Genes with SNPs in FBXO7 edited cell lines

List of Genes with SNPs in LRRK2 edited cell lines

List of Genes with SNPs in PINK1 edited cell lines

List of Genes with SNPs in PRKN edited cell lines

List of Genes with SNPs in S_Group edited cell lines

List of Genes with SNPs in SNCA-A53T edited cell lines

List of Genes with SNPs in SYNJ1 edited cell lines

List of Genes with SNPs in VPS13C-A444P edited cell lines

List of Genes with SNPs in VPS13C-W395C edited cell lines

Gene Frequencies

Looking at genes frequently found in many groups

Frequencies of common genes in the groups of edited cell lines

gid <- do.call(c, lapply(allGroup, function(all) {
  gid <- unique(all[!all$LOCATION %in% c("intron", "promoter", "intergenic")]$GENEID)
  gid[!is.na(gid)]
}))

ez2id <- select(org.Hs.eg.db, gid, "SYMBOL", "ENTREZID")

tt <- table(ez2id$SYMBOL)

tt <- tt[order(tt, decreasing = TRUE)]

datatable(as.data.frame(tt))

CDS impact

Compute the effect of SNPs falling over coding sequences

results2 <- mclapply(names(groupUniqueSnps), function(currGroup) {
  ## Get the combination of samples for each SNPs
  currSamples <- samplesDT[group == currGroup, samples]
  vcf <- groupUniqueSnps[[currGroup]][, currSamples]

  dt1 <- cbind(
    allele = rownames(geno(vcf)$GT),
    data.table(geno(vcf)$GT)[, ..currSamples]
  )
  geno <- melt(
    dt1,
    id.vars = "allele",
    measure.vars = currSamples,
    variable.name = "sample",
    value.name = "geno"
  )
  geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]

  samples <- geno[, unique(sample)]
  codes <- LETTERS[seq_along(samples)]

  void <- lapply(seq_along(codes), function(i) {
    geno[sample == samples[i], sub := codes[i]]
  })

  combn <- unlist(lapply(do.call(
    c,
    lapply(seq_along(codes), combn, x = codes, simplify = FALSE)
  ), paste, collapse = "-"))

  combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")

  seqlevels(vcf) <- seqlevels(vcf)[seqlevels(vcf) %in% seqlevels(txdb)]
  cds <- locateVariants(vcf, txdb, CodingVariants())
  aa <- predictCoding(vcf[cds$QUERYID], txdb, Hsapiens)
  aa$GENEID <- select(txdb, aa$TXID, "GENEID", "TXID")$GENEID
  aa <- aa[!(duplicated(aa) & (duplicated(aa$GENEID) | is.na(aa$GENEID)))]

  if (length(aa) > 0) {
    gt <- apply(data.table(geno(vcf[names(aa)])$GT)[, ..currSamples], 1, function(x) {
      paste(unique(x[!x %in% c("0/0", "./.", "./0", "0/.")]), collapse = ",")
    })

    dt <- data.table(
      REF = as.character(aa$REF),
      ALT = sapply(aa$ALT, paste, collapse = ","),
      GT = gt,
      location = paste0(seqnames(aa), ":", start(aa), "-", end(aa)), ## UPDATE 12/20/24
      consequence = aa$CONSEQUENCE,
      combi = as.character(combi[names(aa)])
    )
    dt <- cbind(select(org.Hs.eg.db, aa$GENEID, c("SYMBOL", "GENENAME"), "ENTREZID"), dt)

    setDT(dt)

    unique(dt[, ENTREZID := NULL])
  } else {
    dt <- data.table(
      SYMBOL = character(),
      GENENAME = character(),
      REF = character(),
      ALT = character(),
      location = character(), ## UPDATE 12/20/24
      consequence = character(),
      combi = character()
    )
  }
}, mc.cores = detectCores(), mc.preschedule = FALSE)

names(results2) <- names(groupUniqueSnps)
results2 <- results2[elementNROWS(results2) > 0]

write.csv(do.call(rbind, lapply(rawResult, "[[", "legend")), file.path("data", "CDS_impact_grouping_keys.csv"))

results3 <- cbind(do.call(rbind, results2),
  group = rep(names(results2), sapply(results2, nrow))
)

write.csv(results3, file.path("data", "CDS_impact.csv"))

for (currGroup in names(results2)) {
  cat("\n")
  cat("#### Coding sequence impact in", currGroup, "edited cell lines\n")
  print(
    tagList(
      datatable(results2[[currGroup]])
    )
  )
  cat("\n")
}

Coding sequence impact in ATP13A2 edited cell lines

Coding sequence impact in DJ1 edited cell lines

Coding sequence impact in DNAJC6 edited cell lines

Coding sequence impact in SNCA-A30P edited cell lines

Coding sequence impact in GBA1 edited cell lines

Coding sequence impact in EWT-OTHERS edited cell lines

Coding sequence impact in FBXO7 edited cell lines

Coding sequence impact in LRRK2 edited cell lines

Coding sequence impact in PINK1 edited cell lines

Coding sequence impact in PRKN edited cell lines

Coding sequence impact in S_Group edited cell lines

Coding sequence impact in SNCA-A53T edited cell lines

Coding sequence impact in SYNJ1 edited cell lines

Coding sequence impact in VPS13C-A444P edited cell lines

Coding sequence impact in VPS13C-W395C edited cell lines

Sanity check to verify that all the genes in the SNP table are in the CDS impact table

sampleNames <- unique(c(names(rawResult), names(results2)))

## Make sures all snp located in coding feature from first table
## Are the list of coding impact in second table
all(sapply(sampleNames, function(currGroup) {
  res1 <- rawResult[[currGroup]][["dt"]]
  genes1 <- res1[res1$feature == "coding"]$SYMBOL
  genes2 <- results2[[currGroup]]$SYMBOL
  all(unique(c(genes1, genes2)) %in% genes1) & all(unique(c(genes1, genes2)) %in% genes2)
}))

## [1] TRUE

sessionInfo

sessionInfo()

## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8          LC_NUMERIC=C             
##  [3] LC_TIME=C.UTF-8           LC_COLLATE=C.UTF-8       
##  [5] LC_MONETARY=C.UTF-8       LC_MESSAGES=C.UTF-8      
##  [7] LC_PAPER=C.UTF-8          LC_NAME=C.UTF-8          
##  [9] LC_ADDRESS=C.UTF-8        LC_TELEPHONE=C.UTF-8     
## [11] LC_MEASUREMENT=C.UTF-8    LC_IDENTIFICATION=C.UTF-8
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3                           
##  [2] ape_5.8-1                               
##  [3] fastreeR_1.12.0                         
##  [4] ggplot2_3.5.2                           
##  [5] htmltools_0.5.8.1                       
##  [6] DT_0.33                                 
##  [7] org.Hs.eg.db_3.21.0                     
##  [8] BSgenome.Hsapiens.UCSC.hg38_1.4.5       
##  [9] BSgenome_1.76.0                         
## [10] rtracklayer_1.68.0                      
## [11] BiocIO_1.18.0                           
## [12] TxDb.Hsapiens.UCSC.hg38.knownGene_3.21.0
## [13] GenomicFeatures_1.60.0                  
## [14] AnnotationDbi_1.70.0                    
## [15] data.table_1.17.0                       
## [16] VariantAnnotation_1.54.0                
## [17] Rsamtools_2.24.0                        
## [18] Biostrings_2.76.0                       
## [19] XVector_0.48.0                          
## [20] SummarizedExperiment_1.38.0             
## [21] Biobase_2.68.0                          
## [22] GenomicRanges_1.60.0                    
## [23] GenomeInfoDb_1.44.0                     
## [24] IRanges_2.42.0                          
## [25] S4Vectors_0.46.0                        
## [26] MatrixGenerics_1.20.0                   
## [27] matrixStats_1.5.0                       
## [28] BiocGenerics_0.54.0                     
## [29] generics_0.1.3                          
## [30] httpgd_2.0.4                            
## 
## loaded via a namespace (and not attached):
##  [1] farver_2.1.2             blob_1.2.4               R.utils_2.13.0          
##  [4] bitops_1.0-9             fastmap_1.2.0            RCurl_1.98-1.17         
##  [7] GenomicAlignments_1.44.0 XML_3.99-0.18            digest_0.6.37           
## [10] lifecycle_1.0.4          KEGGREST_1.48.0          RSQLite_2.3.9           
## [13] magrittr_2.0.3           compiler_4.5.0           rlang_1.1.6             
## [16] sass_0.4.10              tools_4.5.0              yaml_2.3.10             
## [19] knitr_1.50               S4Arrays_1.8.0           htmlwidgets_1.6.4       
## [22] bit_4.6.0                curl_6.2.2               DelayedArray_0.34.1     
## [25] RColorBrewer_1.1-3       abind_1.4-8              BiocParallel_1.42.0     
## [28] unigd_0.1.3              withr_3.0.2              R.oo_1.27.0             
## [31] grid_4.5.0               scales_1.4.0             cli_3.6.5               
## [34] rmarkdown_2.29           crayon_1.5.3             httr_1.4.7              
## [37] rjson_0.2.23             DBI_1.2.3                cachem_1.1.0            
## [40] stringr_1.5.1            restfulr_0.0.15          vctrs_0.6.5             
## [43] Matrix_1.7-3             jsonlite_2.0.0           bit64_4.6.0-1           
## [46] crosstalk_1.2.1          systemfonts_1.2.2        jquerylib_0.1.4         
## [49] glue_1.8.0               codetools_0.2-20         stringi_1.8.7           
## [52] rJava_1.0-11             gtable_0.3.6             UCSC.utils_1.4.0        
## [55] tibble_3.2.1             pillar_1.10.2            GenomeInfoDbData_1.2.14 
## [58] R6_2.6.1                 evaluate_1.0.3           lattice_0.22-7          
## [61] R.methodsS3_1.8.2        png_0.1-8                memoise_2.0.1           
## [64] bslib_0.9.0              Rcpp_1.0.14              nlme_3.1-168            
## [67] SparseArray_1.8.0        xfun_0.52                pkgconfig_2.0.3

By Edit Method

Intro

Data.table linking samples to groups

Rational:

Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.

Format:

The iSCORE-PD_cells_grouped_by_editing_methods.csv file is a comma seperated text file starting with a header and with one cell line per line with the following column header:

samples: Sample ID found in VCF header of the joint genotyping output file
group: Group ID linking the sample in group
meta: Additional group relation. Not used anymore
editing_group: Method used for the editing (Cas9, TALEN, PE)

## Set a table of sample to group from one of the VCF file
vcf.files <- list.files("SNV", "\\.vcf\\.gz$", full = TRUE)

## Read Hanqin csv file for the samples
samplesDT <- setDT(read.csv("supporting_files/iSCORE-PD_cells_grouped_by_editing_methods.csv"))

samplesDT[, group := editing_group]
samplesDT[, editing_group := NULL]

## Remove the Talen edited cell line
samplesDT <- samplesDT[group != "TALEN", ]

## Print the table of samples
datatable(samplesDT)

Unique Snps

Find SNPs common to a set of samples

Strategy:

Read the VCF for a given chromosome
Remove the SNPs with a Qual less than 30
Subset the SNPs for the samples of a given group
Remove the SNPs where all the calls are missing or REF (GT %in% ./., 0/0, 0/., ./0)
Define the sample part of the univers. ie all samples excluding EWTs
If a sample is part of the meta colums, remove the samples of the meta from the universeSnps
Subset the SNPs for the samples of a given group
Remove the SNPs where all the calls are missing or REF (GT %in% ./., 0/0, 0/., ./0)
Returns the SNPs in the Group not present in the Universe

getSubsetFilter <- function(vcf, samples, qual) {
  subset <- as.data.frame(geno(vcf)$GT[, samples])
  filter <- !apply(subset, 1, function(x) all(x %in% c("./.", "0/0", "./0", "0/."))) & qual(vcf) >= qual
  return(filter)
}

getUniqueSnps <- function(vcf.file, samplesDT, qual = 40) {
  ## Read the VCF
  vcf <- readVcf(vcf.file)

  groups <- samplesDT[group != "parental", unique(group), ] # nolint: object_usage_linter.
  ## Roll through the groups and get their unique SNPs
  results <- lapply(groups, function(currGroup) {
    ## Get the sample names of the group under analysis
    currSamples <- samplesDT[group == currGroup, samples] # nolint: object_usage_linter.
    currUniverse <- samplesDT[group == "parental", samples] # nolint: object_usage_linter.

    groupSnp_f <- getSubsetFilter(vcf, currSamples, qual)
    universeSnps_f <- getSubsetFilter(vcf, currUniverse, qual)

    ## Subset the vcf to the group SNPS
    uniqueSnps <- vcf[groupSnp_f & !universeSnps_f]

    return(uniqueSnps)
  })

  names(results) <- groups
  return(results)
}

Computing the unique SNPs for each group

Rational:

Strategy:

Define the parental samples
Find the VCF files
Reading the header of one VCF file, get the group tags
In parallel, computed the unique SNPs VCF and stats
Massage the data for compsution (and save the VCF to HD)

## Only run if data files are no present
resultToSave <- file.path("data", "edit_type_unique_SNPs_vs_all.Rds")
vcfToSave <- file.path("data", "edit_type_unique_SNPs_vs_all.vcf")
if (file.exists(resultToSave) && file.exists(vcfToSave)) {
  groupUniqueSnps <- readRDS(resultToSave)
} else {
  rawResults <- mclapply(vcf.files,
    getUniqueSnps,
    samplesDT = samplesDT,
    qual = 30,
    mc.cores = detectCores(),
    mc.preschedule = FALSE
  )

  groupUniqueSnps <- mclapply(names(rawResults[[1]]), function(group) {
    res <- do.call(rbind, lapply(rawResults, "[[", group))
    return(res)
  }, mc.cores = detectCores(), mc.preschedule = FALSE)

  names(groupUniqueSnps) <- names(rawResults[[1]])

  allUniqueVCF <- unique(do.call(rbind, groupUniqueSnps))

  dir.create("data", showWarnings = FALSE)
  saveRDS(groupUniqueSnps, resultToSave)
  writeVcf(allUniqueVCF, vcfToSave)
}

Save the unique SNPs as a VCF file

Rational:

the Unique SNPs ar now in a series of ExpandedVCF object, one for each group. Let’s create a CompactedVCF with these SNPs

Total of unique SNPs per group:

Let’s reformat the table to display counts of SNPs per group per chromosome.

dt <- data.table(
  group = rep(names(groupUniqueSnps), sapply(groupUniqueSnps, length)),
  chr = do.call(c, lapply(groupUniqueSnps, function(x) as.character(seqnames(x))))
)

counts <- dt[, .N, by = .(group, chr)]

dt2 <- dcast(counts, group ~ chr, value.var = "N")
dt2[, total := sum(.SD), by = group]
dt2 <- dt2[samplesDT[group != "parental", .N, by = group], on = .(group)]
dt2[, mean := round(total / N)]

dt2

dt2 <- dt2[
  order(factor(group,
    levels = c(
      grep("EWT", unique(group), value = TRUE),
      grep("EWT", unique(group), value = TRUE, invert = TRUE)
    )
  )),
  c("group", "N", "total", "mean", sort(grep("chr.+", colnames(dt2), value = TRUE)))
]
dt2

##  [1] "group" "N"     "total" "mean"  "chr1"  "chr10" "chr11" "chr12" "chr13"
## [10] "chr14" "chr15" "chr16" "chr17" "chr18" "chr19" "chr2"  "chr20" "chr21"
## [19] "chr22" "chr3"  "chr4"  "chr5"  "chr6"  "chr7"  "chr8"  "chr9"  "chrX" 
## [28] "chrY"

Intersections

Computing the distribution of alleles per sample combination

Rational:

For each group, what is the number of SNPs shared among the samples

Strategy:

for each SNP, assign a group ID representing the combination of samples carrying that SNP

pps <- lapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  currSamples <- samplesDT[group == currGroup, samples]
  dt <- cbind(
    allele = rownames(geno(vcf)$GT),
    data.table(geno(vcf)$GT)[, ..currSamples]
  )
  geno <- melt(
    dt,
    id.vars = "allele",
    measure.vars = currSamples,
    variable.name = "sample",
    value.name = "geno"
  )
  geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]
  samples <- geno[, unique(sample)]

  if (length(samples) > 26) {
    codes <- c(
      LETTERS[seq_along(samples)[1:26]],
      sapply(LETTERS[seq_along(samples[27:length(samples)])], function(x) paste(rep(x, 2), collapse = ""))
    )
  } else {
    codes <- LETTERS[seq_along(samples)]
  }

  void <- lapply(seq_along(codes), function(i) {
    geno[sample == samples[i], sub := codes[i]]
  })

  combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")
  combn <- unique(combi)
  combn <- combn[order(elementNROWS(strsplit(combn, "-")))]
  combi <- factor(combi, levels = combn)

  df <- as.data.frame(table(combi))

  p_bar <- ggplot(df, aes(combi, Freq)) +
    geom_col() +
    labs(title = paste("Sample", currGroup)) +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.9, hjust = 1))

  legend <- data.frame(legend = paste(codes, samples, sep = " = "))

  p_tab <- tableGrob(unname(legend),
    theme = ttheme_default(base_size = 8),
    rows = NULL
  )

  grid.arrange(grobs = list(p_bar, p_tab), ncol = 2, widths = c(3, 1))
})

Saving the intersection as a table

Common SNPs within the Cas9 type of edit

pps <- mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  currSamples <- samplesDT[group == currGroup, samples]
  dt <- cbind(
    allele = rownames(geno(vcf)$GT),
    data.table(geno(vcf)$GT)[, ..currSamples]
  )
  geno <- melt(
    dt,
    id.vars = "allele",
    measure.vars = currSamples,
    variable.name = "sample",
    value.name = "geno"
  )
  geno <- geno[!geno %in% c("0/0", "./.", "./0", "0/."), ]
  samples <- geno[, unique(sample)]

  if (length(samples) > 26) {
    codes <- c(
      LETTERS[seq_along(samples)[1:26]],
      sapply(LETTERS[seq_along(samples[27:length(samples)])], function(x) paste(rep(x, 2), collapse = ""))
    )
  } else {
    codes <- LETTERS[seq_along(samples)]
  }

  void <- lapply(seq_along(codes), function(i) {
    geno[sample == samples[i], sub := codes[i]]
  })

  combi <- tapply(geno$sub, geno$allele, paste, sep = "-", collapse = "-")
  combn <- unique(combi)
  combn <- combn[order(elementNROWS(strsplit(combn, "-")))]
  combi <- factor(combi, levels = combn)

  dt <- data.table(Combination = combi)

  dt1 <- dt[!is.na(Combination), .N, by = Combination]
  dt1[, nSample := elementNROWS(strsplit(as.character(Combination), "-"))]
  dt1 <- dt1[order(factor(Combination, levels = combn))]
  legend <- data.table(Code = codes, Sample = samples)

  list(legend = legend, dt1 = dt1)
})

names(pps) <- names(groupUniqueSnps)


for (currGroup in names(pps)) {
  cat("\n")
  cat("#### Number of SNPs shared among the", currGroup, "edited cell lines\n")
  cat("##### Id to sample legend\n")
  print(tagList(datatable(pps[[currGroup]]$legend)))
  cat("##### Number of SNPs per combination for", currGroup, "\n")
  print(tagList(datatable(pps[[currGroup]]$dt1)))
  cat("\n")
}

Number of SNPs shared among the Cas9 edited cell lines

Id to sample legend

Number of SNPs per combination for Cas9

Number of SNPs shared among the PE edited cell lines

Id to sample legend

Number of SNPs per combination for PE

Phylo Tree

Computing the phylogenic relationship of the different cell lines

Rational:

Strategy:

Define the parental samples
Find the VCF files
Reading the header of one VCF file, get the group tags
In parallel, computed the unique SNPs VCF and stats
Massage the data for compsution (and save the VCF to HD)

Phylogenetic tree from only the SNPs unique to the edited cell lines

myVcfDist <- fastreeR::vcf2dist(inputFile = vcfToSave, threads = 2)
myVcfTree <- fastreeR::dist2tree(inputDist = myVcfDist)

plot(ape::read.tree(text = myVcfTree), direction = "rightwards", cex = 0.8)
ape::add.scale.bar()
ape::axisPhylo(side = 3)

SNPs proximity

Intro

Data.table linking samples to groups

Rational:

Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.

Format:

The iSCORE-PD_design.csv file is a comma seperated text file starting with a header and with one cell line per line with the following columns:

samples: Sample ID found in VCF header of the joint genotyping output file
group: Group ID linking the sample in group
meta: Additional group relation. Not used anymore

samplesDT <- setDT(read.csv("./supporting_files/iSCORE-PD_design.csv"))
datatable(samplesDT)

Loading the data

## Only run if data files are no present
resultToSave <- file.path("data", "unique_SNPs_vs_all.Rds")
if (file.exists(resultToSave)) {
  groupUniqueSnps <- readRDS(resultToSave)
} else {
  stop("Cant find file", resultToSave)
}

Inter SNPs distance

Is there an over-represtnation of SNPs closer to each other

Rational:

Frank S. suggested that perhaps, off target effect could be reveal by new polymorphism being created near to each other just because the CRISPR would tend to affect nearby location more frequenlty

Strategy:

Get a set of SNPs unique to a group of samples
Remove the SNPs overlaping the gene target by the group edit
Take a sample and
1. Remove the SNPs not called or same as reference
2. Get the location of the SNPs
3. Meseaure the distance between 2 SNPs next to each other in genomic location
Bootrap the distriubtion by
1. Counting the number of SNPs per chromosome
2. For each chromosome, randomly selecting chromosomal loction for the same number of SNPs in sample
3. Computing the distance between each 2 neibhoring SNPs
4. Repeat 1-3 1000 times
Plot the distributino of distances for the group and the bootstrap distribution

genes <- genes(txdb)

##   30 genes were dropped because they have exons located on both strands of the
##   same reference sequence or on more than one reference sequence, so cannot be
##   represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a GRangesList
##   object, or use suppressMessages() to suppress this message.

dists <- mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]
  ## If not dealing with EWT or control cell lines, find the current gene location and remove the SNPs from the group
  if (!grepl("EWT|S_Group", currGroup)) {
    gene <- unique(sub("-.+", "", grep("EWT|parental|S_Group", currGroup, value = TRUE, invert = TRUE)))
    if (gene %in% keys(org.Hs.eg.db, "SYMBOL")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "SYMBOL")$ENTREZ
    } else if (gene %in% keys(org.Hs.eg.db, "ALIAS")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "ALIAS")$ENTREZ
    }
    suppressWarnings(ol <- findOverlaps(vcf, genes[ez]))
    if (length(queryHits(ol) > 0)) vcf <- vcf[-queryHits(ol)]
  }

  df <- do.call(rbind, lapply(samplesDT[group == currGroup, samples], function(currSample) {
    x <- geno(vcf)$GT[, currSample]
    f <- !x %in% c("./.", "0/0", "./0", "0/.")
    subVcf <- vcf[f]

    dist <- unlist(tapply(start(subVcf), seqnames(subVcf), function(start) start[-1] - start[-length(start)]))
    dist <- dist[elementNROWS(dist) != 0]

    chrCounts <- table(seqnames(subVcf))
    chrCounts <- chrCounts[chrCounts != 0]

    bootstrap <- unlist(lapply(seq_len(1000), function(i) {
      unlist(lapply(names(chrCounts), function(chr) {
        start <- sort(sample(seqlengths(subVcf)[chr], chrCounts[chr]))
        dist <- start[-1] - start[-length(start)]
      }))
    }))
    df <- data.frame(
      distance = c(dist, bootstrap),
      sample = c(rep("Sample", length(dist)), rep("Bootstrap", length(bootstrap)))
    )
  }))
  df
}, mc.cores = detectCores(), mc.preschedule = FALSE)

names(dists) <- names(groupUniqueSnps)

dists <- dists[elementNROWS(dists) > 0]

void <- lapply(names(dists), function(group) {
  df <- dists[[group]]
  df <- df[!is.na(df$distance), ]
  p <- ggplot(df, aes(log10(distance), color = sample)) +
    geom_histogram(binwidth = 0.05) +
    facet_grid(rows = vars(sample), scales = "free_y") +
    labs(
      title = paste("Distance between SNPs for", group, "samples")
    )

  print(p)
})

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

KS Test

Are the sample and bootstrap distributions comming from the same distribution?

Rational:

Kolmogorov-Smirnov test can tell us if two samples are comming from the same distribution by looking at the distances between ECDFs.

dt <- data.table(do.call(rbind, lapply(dists, function(df) {
  setDT(df)
  ks <- ks.test(
    df[sample == "Sample", distance],
    df[sample == "Bootstrap", distance],
    alternative = "greater"
  )
  c("D^+" = sprintf("%.3f", ks$statistic), p.value = sprintf("%.3f", ks$p.value))
})))

datatable(cbind(group = names(dists), dt), options = list(pageLength = 25))

Dist enrichment

Is there an enrichment of SNPs closer to 100 bp?

dt <- data.table(do.call(rbind, lapply(names(dists), function(group) {
  dist <- setDT(dists[[group]])
  fracSample <- dist[distance <= 100 & sample == "Sample", .N] / dist[sample == "Sample", .N]
  fracBS <- dist[distance <= 100 & sample == "Bootstrap", .N] / dist[sample == "Bootstrap", .N]

  c(
    group = group,
    Count = dist[distance <= 100 & sample == "Sample", .N],
    "Sample prop" = sprintf("%.2e", fracSample),
    "Bootstrap prop" = sprintf("%.2e", fracBS),
    Enrichment = round(fracSample / fracBS)
  )
})))

datatable(dt, options = list(pageLength = 25))

Is there an enrichment of SNPs closer to 1000 bp?

dt <- data.table(do.call(rbind, lapply(names(dists), function(group) {
  dist <- setDT(dists[[group]])
  fracSample <- dist[distance <= 100 & sample == "Sample", .N] / dist[sample == "Sample", .N]
  fracBS <- dist[distance <= 100 & sample == "Bootstrap", .N] / dist[sample == "Bootstrap", .N]

  c(
    group = group,
    Count = dist[distance <= 1000 & sample == "Sample", .N],
    "Sample prop" = sprintf("%.2e", fracSample),
    "Bootstrap prop" = sprintf("%.2e", fracBS),
    Enrichment = round(fracSample / fracBS)
  )
})))

datatable(dt, options = list(pageLength = 25))

SNPs within 100bp

Rational:

Let’s look at the sequence around the SNPs closer to each other wihin 100 Bootrap

Strategy:

Take the unique SNPs of a given group of edits
Remove the SNPs within the group edited gene
Compute the distance between any 2 SNPs
For distances less than 100bp the 2 SNPs
Using the Hsapiens BSgenome, get the sequence 10nt before and 15nt after the SNPs
Return a table with
- Distance between the 2 SNPs
- the REF and ALT allels for the 2 SNPs
- The sequences around the 2 SNPs, blue is before, red is SNPs then after

genes <- genes(txdb)

##   30 genes were dropped because they have exons located on both strands of the
##   same reference sequence or on more than one reference sequence, so cannot be
##   represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a GRangesList
##   object, or use suppressMessages() to suppress this message.

colorize <- function(x, color) {
  if (knitr::is_latex_output()) {
    sprintf("\\textcolor{%s}{%s}", color, x)
  } else if (knitr::is_html_output()) {
    sprintf(
      "<span style='color: %s; font-family: monospace, monospace;'>%s</span>", color,
      x
    )
  } else {
    x
  }
}

results <- mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]

  if (!grepl("EWT|S_Group", currGroup)) {
    gene <- unique(sub("-.+", "", grep("EWT|parental", currGroup, value = TRUE, invert = TRUE)))
    if (gene %in% keys(org.Hs.eg.db, "SYMBOL")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "SYMBOL")$ENTREZ
    } else if (gene %in% keys(org.Hs.eg.db, "ALIAS")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "ALIAS")$ENTREZ
    }
    suppressWarnings(ol <- findOverlaps(vcf, genes[ez]))
    if (length(queryHits(ol) > 0)) vcf <- vcf[-queryHits(ol)]
  }

  chrCounts <- table(seqnames(vcf))
  chrCounts <- chrCounts[chrCounts > 0]

  df <- do.call(rbind, lapply(names(chrCounts), function(chr) {
    subVcf <- vcf[seqnames(vcf) == chr]
    starts <- start(subVcf)

    df <- do.call(rbind, lapply(which(starts[-1] - starts[-length(starts)] <= 100), function(f) {
      seqs <- getSeq(Hsapiens, GRanges(chr, IRanges(start(subVcf[c(f, f + 1)]) - 15, width = 30)))

      refs <- ref(subVcf[c(f, f + 1)])
      alts <- sapply(alt(subVcf[c(f, f + 1)]), paste, collapse = ",")

      currSamples <- samplesDT[group == currGroup, samples]
      gt1 <- geno(subVcf[f])$GT[, samplesDT[group == currGroup, samples]]
      gt2 <- geno(subVcf[f + 1])$GT[, samplesDT[group == currGroup, samples]]
      gts <- c(gt1[!gt1 %in% c("./.", "0/0", "./0", "0/.")], gt2[!gt2 %in% c("./.", "0/0", "./0", "0/.")])

      seqs <- paste0(
        colorize(as.character(subseq(seqs, 1, 15)), "blue"),
        colorize(as.character(subseq(seqs, 16, width(seqs))), "red")
      )

      data.frame(
        distance = start(subVcf[f + 1]) - start(subVcf[f]),
        CHR = chr,
        start = start(subVcf[f]),
        REF1 = refs[1],
        ALT1 = alts[1],
        REF2 = refs[2],
        ALT2 = alts[2],
        seq1 = seqs[1],
        seq2 = seqs[2],
        GT = paste(unique(gts), collapse = "|"),
        sample = paste(unique(names(gts)), collapse = "\n")
      )
    }))
  }))
  return(df)
}, mc.cores = detectCores(), mc.preschedule = FALSE)

names(results) <- names(groupUniqueSnps)

for (currGroup in names(results)) {
  cat("\n")
  cat("#### Sequences around SNPs seperated by less than 100 bp in ", currGroup, "edited cell lines\n")
  print(
    tagList(
      datatable(results[[currGroup]], escape = FALSE)
    )
  )
  cat("\n")
}

Sequences around SNPs seperated by less than 100 bp in ATP13A2 edited cell lines

Sequences around SNPs seperated by less than 100 bp in DJ1 edited cell lines

Sequences around SNPs seperated by less than 100 bp in DNAJC6 edited cell lines

Sequences around SNPs seperated by less than 100 bp in SNCA-A30P edited cell lines

Sequences around SNPs seperated by less than 100 bp in GBA1 edited cell lines

Sequences around SNPs seperated by less than 100 bp in EWT-OTHERS edited cell lines

Sequences around SNPs seperated by less than 100 bp in FBXO7 edited cell lines

Sequences around SNPs seperated by less than 100 bp in LRRK2 edited cell lines

Sequences around SNPs seperated by less than 100 bp in PINK1 edited cell lines

Sequences around SNPs seperated by less than 100 bp in PRKN edited cell lines

Sequences around SNPs seperated by less than 100 bp in S_Group edited cell lines

Sequences around SNPs seperated by less than 100 bp in SNCA-A53T edited cell lines

Sequences around SNPs seperated by less than 100 bp in SYNJ1 edited cell lines

Sequences around SNPs seperated by less than 100 bp in VPS13C-A444P edited cell lines

Sequences around SNPs seperated by less than 100 bp in VPS13C-W395C edited cell lines

Rel to Edit

What is the distance of SNPs to the edited genes

Rational:

Perhaps, looking at the distance of SNPs relative to the edited gene might reveal a stronger signal.

Strategy:

Get a set of SNPs unique to a group of samples
Remove the SNPs overlaping the gene target by the group edit
Get the location of the gene CRISPR edited in a group
Keep only SNPs on the same chromosome as the edited gene
Compute the distance between SNPs and start or end of gene (relative to where they are located to the gene)
Bootstrap the distribution by:
1. Seleting random position on the chromosme around the edited gene
2. Repeat 1000 times
Plot the distributino of distances for the group and the bootstrap distribution

genes <- genes(txdb)

##   30 genes were dropped because they have exons located on both strands of the
##   same reference sequence or on more than one reference sequence, so cannot be
##   represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a GRangesList
##   object, or use suppressMessages() to suppress this message.

dists <- mclapply(names(groupUniqueSnps), function(currGroup) {
  if (grepl("OTHERS|S_Group", currGroup)) {
    return(data.frame())
  }
  vcf <- groupUniqueSnps[[currGroup]]

  ## If not dealing with EWT, find the current gene location and remove the SNPs from the group
  currGroup2 <- sub("(EWT-|^)(.+)", "\\2", currGroup)
  gene <- sub("(.+)-.+", "\\1", currGroup2)

  if (gene %in% keys(org.Hs.eg.db, "SYMBOL")) {
    ez <- select(org.Hs.eg.db, gene, "ENTREZID", "SYMBOL")$ENTREZ
  } else if (gene %in% keys(org.Hs.eg.db, "ALIAS")) {
    ez <- select(org.Hs.eg.db, gene, "ENTREZID", "ALIAS")$ENTREZ
  }
  suppressWarnings(ol <- findOverlaps(vcf, genes[ez]))
  if (length(queryHits(ol) > 0)) vcf <- vcf[-queryHits(ol)]

  currChr <- as.character(seqnames(genes[ez]))
  subVCF <- vcf[seqnames(vcf) == currChr]
  if (length(subVCF) < 2) {
    return(data.frame())
  }
  starts <- start(subVCF)
  f1 <- starts < start(genes[ez])
  dist <- c(start(genes[ez]) - starts[f1], starts[!f1] - end(genes[ez]))

  ## bootstrap
  target <- c(1:start(genes[ez]), end(genes[ez]):seqlengths(vcf)[currChr])
  bootstrap <- unlist(lapply(seq_len(1000), function(i) {
    starts <- sort(sample(seqlengths(vcf)[currChr], length(subVCF)))
    f1 <- starts < start(genes[ez])
    dist <- c(start(genes[ez]) - starts[f1], starts[!f1] - end(genes[ez]))
  }))

  df <- data.frame(
    distance = c(dist, bootstrap),
    sample = c(rep("Sample", length(dist)), rep("Bootstrap", length(bootstrap)))
  )
}, mc.cores = detectCores(), mc.preschedule = FALSE)

names(dists) <- names(groupUniqueSnps)

void <- lapply(names(dists), function(group) {
  df <- dists[[group]]
  if (nrow(df) == 0) {
    cat(group, "have less than 2 SNPs near the edited gene\n")
  } else {
    p <- ggplot(df, aes(log10(distance), color = sample)) +
      geom_histogram(binwidth = 0.05) +
      facet_grid(rows = vars(sample), scales = "free_y") +
      labs(
        title = paste("Distance between SNPs for", group, "samples")
      )
    print(p)
  }
})

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 13 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 21 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 79 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

## EWT-OTHERS have less than 2 SNPs near the edited gene

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 227 rows containing non-finite outside the scale range
## (`stat_bin()`).

## S_Group have less than 2 SNPs near the edited gene

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 15 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 31 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in FUN(X[[i]], ...): NaNs produced

## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).

Dist per group

Distance between SNPs within a group

Rational:

Ok, keeping this here as initialy I look at the distance of SNPs within a group of samples instead of wihin a single sample. The impact is that you might get SNPs that are close to each others accross samples, which if the assumption is that they are caused by initial edits of a chromatid then subsquent CRISPR edits in the neibhoring DNA, wihin group does not make sense ***

genes <- genes

dists <- mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]

  ## If not dealing with EWT, find the current gene location and remove the SNPs from the group
  if (!grepl("EWT|S_Group", currGroup)) {
    gene <- unique(sub("-.+", "", grep("EWT|parental", currGroup, value = TRUE, invert = TRUE)))
    if (gene %in% keys(org.Hs.eg.db, "SYMBOL")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "SYMBOL")$ENTREZ
    } else if (gene %in% keys(org.Hs.eg.db, "ALIAS")) {
      ez <- select(org.Hs.eg.db, gene, "ENTREZID", "ALIAS")$ENTREZ
    }
    suppressWarnings(ol <- findOverlaps(vcf, genes[ez]))
    if (length(queryHits(ol) > 0)) vcf <- vcf[-queryHits(ol)]
  }

  dist <- unlist(tapply(start(vcf), seqnames(vcf), function(start) start[-1] - start[-length(start)]))
  dist <- dist[elementNROWS(dist) != 0]

  chrCounts <- table(seqnames(vcf))
  chrCounts <- chrCounts[chrCounts != 0]

  bootstrap <- unlist(lapply(seq_len(1000), function(i) {
    unlist(lapply(names(chrCounts), function(chr) {
      start <- sort(sample(seqlengths(vcf)[chr], chrCounts[chr]))
      dist <- start[-1] - start[-length(start)]
    }))
  }))

  df <- data.frame(
    distance = c(dist, bootstrap),
    sample = c(rep("Sample", length(dist)), rep("Bootstrap", length(bootstrap)))
  )
}, mc.cores = detectCores(), mc.preschedule = FALSE)

names(dists) <- names(groupUniqueSnps)

void <- lapply(names(dists), function(group) {
  df <- dists[[group]]
  p <- ggplot(df, aes(log10(distance), color = sample)) +
    geom_histogram(binwidth = 0.05) +
    facet_grid(rows = vars(sample), scales = "free_y") +
    labs(
      title = paste("Distance between SNPs for", group, "samples")
    )

  print(p)
})

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

sessionInfo

sessionInfo()

## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8          LC_NUMERIC=C             
##  [3] LC_TIME=C.UTF-8           LC_COLLATE=C.UTF-8       
##  [5] LC_MONETARY=C.UTF-8       LC_MESSAGES=C.UTF-8      
##  [7] LC_PAPER=C.UTF-8          LC_NAME=C.UTF-8          
##  [9] LC_ADDRESS=C.UTF-8        LC_TELEPHONE=C.UTF-8     
## [11] LC_MEASUREMENT=C.UTF-8    LC_IDENTIFICATION=C.UTF-8
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3                           
##  [2] ape_5.8-1                               
##  [3] fastreeR_1.12.0                         
##  [4] ggplot2_3.5.2                           
##  [5] htmltools_0.5.8.1                       
##  [6] DT_0.33                                 
##  [7] org.Hs.eg.db_3.21.0                     
##  [8] BSgenome.Hsapiens.UCSC.hg38_1.4.5       
##  [9] BSgenome_1.76.0                         
## [10] rtracklayer_1.68.0                      
## [11] BiocIO_1.18.0                           
## [12] TxDb.Hsapiens.UCSC.hg38.knownGene_3.21.0
## [13] GenomicFeatures_1.60.0                  
## [14] AnnotationDbi_1.70.0                    
## [15] data.table_1.17.0                       
## [16] VariantAnnotation_1.54.0                
## [17] Rsamtools_2.24.0                        
## [18] Biostrings_2.76.0                       
## [19] XVector_0.48.0                          
## [20] SummarizedExperiment_1.38.0             
## [21] Biobase_2.68.0                          
## [22] GenomicRanges_1.60.0                    
## [23] GenomeInfoDb_1.44.0                     
## [24] IRanges_2.42.0                          
## [25] S4Vectors_0.46.0                        
## [26] MatrixGenerics_1.20.0                   
## [27] matrixStats_1.5.0                       
## [28] BiocGenerics_0.54.0                     
## [29] generics_0.1.3                          
## [30] httpgd_2.0.4                            
## 
## loaded via a namespace (and not attached):
##  [1] farver_2.1.2             blob_1.2.4               R.utils_2.13.0          
##  [4] bitops_1.0-9             fastmap_1.2.0            RCurl_1.98-1.17         
##  [7] GenomicAlignments_1.44.0 XML_3.99-0.18            digest_0.6.37           
## [10] lifecycle_1.0.4          KEGGREST_1.48.0          RSQLite_2.3.9           
## [13] magrittr_2.0.3           compiler_4.5.0           rlang_1.1.6             
## [16] sass_0.4.10              tools_4.5.0              yaml_2.3.10             
## [19] knitr_1.50               labeling_0.4.3           S4Arrays_1.8.0          
## [22] htmlwidgets_1.6.4        bit_4.6.0                curl_6.2.2              
## [25] DelayedArray_0.34.1      RColorBrewer_1.1-3       abind_1.4-8             
## [28] BiocParallel_1.42.0      unigd_0.1.3              withr_3.0.2             
## [31] R.oo_1.27.0              grid_4.5.0               scales_1.4.0            
## [34] cli_3.6.5                rmarkdown_2.29           crayon_1.5.3            
## [37] httr_1.4.7               rjson_0.2.23             DBI_1.2.3               
## [40] cachem_1.1.0             stringr_1.5.1            restfulr_0.0.15         
## [43] vctrs_0.6.5              Matrix_1.7-3             jsonlite_2.0.0          
## [46] bit64_4.6.0-1            crosstalk_1.2.1          systemfonts_1.2.2       
## [49] jquerylib_0.1.4          glue_1.8.0               codetools_0.2-20        
## [52] stringi_1.8.7            rJava_1.0-11             gtable_0.3.6            
## [55] UCSC.utils_1.4.0         tibble_3.2.1             pillar_1.10.2           
## [58] GenomeInfoDbData_1.2.14  R6_2.6.1                 evaluate_1.0.3          
## [61] lattice_0.22-7           R.methodsS3_1.8.2        png_0.1-8               
## [64] memoise_2.0.1            bslib_0.9.0              Rcpp_1.0.14             
## [67] nlme_3.1-168             SparseArray_1.8.0        xfun_0.52               
## [70] pkgconfig_2.0.3

Off Targets

Intro

Data.table linking samples to groups

Rational:

Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.

Format:

The iSCORE-PD_cells_grouped_with_guides.csv file is a comma seperated text file starting with a header and with one cell line per line with the following column header:

samples: Sample ID
group: Group ID
meta: Additional group relation. Not used anymore
editing_group: Method used for the editing (Cas9, TALEN, PE)
guide1
guide2
guide3

## Set a table of sample to group from one of the VCF file
vcf.files <- list.files("SNV", "\\.vcf\\.gz$", full = TRUE)

## Read Hanqin csv file for the samples
samplesDT <- setDT(read.csv("supporting_files/iSCORE-PD_cells_grouped_with_guides.csv"))

## Print the table of samples
datatable(samplesDT)

Unique Snps

Find SNPs common to a set of samples

Strategy:

Read the VCF for a given chromosome
Remove the SNPs with a Qual less than 30
Subset the SNPs for the samples of a given group
Remove the SNPs where all the calls are missing or REF (GT %in% ./., 0/0, 0/., ./0)
Define the sample part of the univers. ie all samples excluding EWTs
If a sample is part of the meta colums, remove the samples of the meta from the universeSnps
Subset the SNPs for the samples of a given group
Remove the SNPs where all the calls are missing or REF (GT %in% ./., 0/0, 0/., ./0)
Returns the SNPs in the Group not present in the Universe

getSubsetFilter <- function(vcf, samples, qual) {
  subset <- as.data.frame(geno(vcf)$GT[, samples])
  filter <- !apply(subset, 1, function(x) all(x %in% c("./.", "0/0", "./0", "0/."))) & qual(vcf) >= qual
  return(filter)
}

getUniqueSnps <- function(vcf.file, samplesDT, qual = 40) {
  ## Read the VCF
  vcf <- readVcf(vcf.file)

  groups <- samplesDT[group != "parental", unique(group), ]
  ## Roll through the groups and get their unique SNPs
  results <- lapply(groups, function(currGroup) {
    ## Get the sample names of the group under analysis
    currSamples <- samplesDT[currGroup == group, sample]
    currUniverse <- samplesDT[-grep(paste0("(", currGroup, "|EWT)"), group), sample]
    ## Remove the meta samples for the EWTs...
    if (!is.na(unique(samplesDT[group == currGroup, meta]))) {
      metaSamples <- samplesDT[meta %in% samplesDT[group == currGroup, meta], sample]
      currUniverse <- currUniverse[!currUniverse %in% metaSamples]
    }

    groupSnp_f <- getSubsetFilter(vcf, currSamples, qual)
    universeSnps_f <- getSubsetFilter(vcf, currUniverse, qual)

    ## Subset the vcf to the group SNPS
    uniqueSnps <- vcf[groupSnp_f & !universeSnps_f]

    return(uniqueSnps)
  })

  names(results) <- groups
  return(results)
}

Computing the unique SNPs for each group

Rational:

Strategy:

Define the parental samples
Find the VCF files
Reading the header of one VCF file, get the group tags
In parallel, computed the unique SNPs VCF and stats
Massage the data for compsution (and save the VCF to HD)

## Only run if data files are no present
resultToSave <- file.path("data", "cas-offinder_unique_SNPs_vs_all.Rds")
vcfToSave <- file.path("data", "cas-offinder_unique_SNPs_vs_all.vcf")
if (file.exists(resultToSave) && file.exists(vcfToSave)) {
  groupUniqueSnps <- readRDS(resultToSave)
} else {
  rawResults <- mclapply(vcf.files,
    getUniqueSnps,
    samplesDT = samplesDT,
    qual = 30,
    mc.cores = detectCores(),
    mc.preschedule = FALSE
  )

  groupUniqueSnps <- mclapply(names(rawResults[[1]]), function(group) {
    res <- do.call(rbind, lapply(rawResults, "[[", group))
    return(res)
  }, mc.cores = detectCores(), mc.preschedule = FALSE)

  names(groupUniqueSnps) <- names(rawResults[[1]])

  allUniqueVCF <- unique(do.call(rbind, groupUniqueSnps))

  dir.create("data", showWarnings = FALSE)
  saveRDS(groupUniqueSnps, resultToSave)
  writeVcf(allUniqueVCF, vcfToSave)
}

sgRNA

Loading the table of sgRNA

sgRNA <- setDT(read.delim("./supporting_files/sgRNA.txt", header = FALSE))

datatable(sgRNA) %>%
  formatStyle(c("V1"), fontFamily = "monospace, monospace")

Run cas-offinder

This is how cas-offinder was run on the GPU based instance in us-east-1 zone

cas-offinder sgRNA.txt G cas-offinder-out.txt

Load the results of cas-offinder

Display only the first 50 results to avoid bloating the HTML report

casOff <- setDT(read.table("./supporting_files/cas-offinder-out.txt"))
setnames(casOff, c("PAM", "chr", "start", "seq", "strand", "mismatch", "id"))

casOff[strand == "-", start := start + 3]
casOff[strand == "+", start := start + nchar(seq) - 3]

casOffGR <- GRanges(
  seqnames = casOff[, chr],
  IRanges(casOff[, start], width = 1),
  strand = casOff[, strand],
  data.frame(seq = casOff[, seq], guide = casOff[, id], mismatch = casOff[, mismatch], PAM = casOff[, PAM])
)

datatable(head(casOff, 50)) %>%
  formatStyle(c("PAM", "seq"), fontFamily = "monospace, monospace")

Is cas-offinder hit the targeted gene?

genes <- genes(txdb)

targetGenes <- unique(sub("-.+", "", grep("EWT|parental", names(groupUniqueSnps), value = TRUE, invert = TRUE)))

symbol2ez <- select(org.Hs.eg.db, targetGenes, "ENTREZID", "SYMBOL")
symbol2ez[is.na(symbol2ez$ENTREZID), ]$ENTREZID <- select(
  org.Hs.eg.db,
  symbol2ez[is.na(symbol2ez$ENTREZID), ]$SYMBOL, "ENTREZID", "ALIAS"
)$ENTREZID

ol <- findOverlaps(casOffGR, genes[symbol2ez$ENTREZID], ignore.strand = TRUE)

dt <- setDT(as.data.frame(ol))
dt$guide <- casOffGR[queryHits(ol)]$guide
dt$overlappingGene <- symbol2ez$SYMBOL[subjectHits(ol)]

datatable(dt[order(dt$guide), .(guide, overlappingGene)], options = list(pageLength = 10))

Is the targeted gene by the CRISP found in the cas-offinder results

dt.m1 <- melt(samplesDT,
  measure.vars = c("guide1", "guide2", "guide3"),
  variable.name = "guide",
  value.name = "guideId"
)
dt.m1[, guideId := sub(" ", "_", guideId)]
dt.m1[grep("EWT|parental", group, invert = TRUE), gene := sub("-.+", "", group)]

gene2guide <- unique(dt.m1[, .(gene, group, guideId)])
gene2guide <- gene2guide[!is.na(gene)]

dt$targetGene <- gene2guide[match(dt$guide, guideId), gene]

datatable(dt[order(factor(guide)), .(casOffInTargetGene = any(unique(targetGene) %in% overlappingGene)), by = guide],
  options = list(pageLength = 25)
)

cas-offinder hits within targeted genes

Let’s output the location, sequences and mismatches of the cas-offinder results falling within the targeted genes.

subCasOffGR <- casOffGR[dt[overlappingGene == targetGene, queryHits]]

f <- unlist(tapply(subCasOffGR$mismatch, subCasOffGR$guide, function(x) x == min(x)))
order <- order(factor(subCasOffGR$guide))

datatable(
  data.table(
    guide = subCasOffGR$guide[order],
    chr = as.character(seqnames(subCasOffGR))[order],
    start = start(subCasOffGR)[order],
    strand = as.character(strand(subCasOffGR))[order],
    PAM = subCasOffGR$PAM[order],
    seq = subCasOffGR$seq[order],
    mismatch = subCasOffGR$mismatch[order]
  )[f],
  options = list(pageLength = 25)
) %>%
  formatStyle(c("PAM", "seq"), fontFamily = "monospace, monospace")

Dist stats

Rational:

how to compute a p-value if we find SNPs close to predicted off target site.

The chance that a SNPs fall in any 100 bp window is the same for any window, it’s a uniform distribution accross the chromosome lenght. The rate at wich a SNPs should fall within the same 100bp window as a predicted CRISPR off target site is then one over the size of the chomosome divided by 100 bp times the number of off-target sites on that chromosome (ie any one 100 bp as the same probability of getting a SNPs in any given window). Then with the predicted rate, we can use the poisson distribution to compute the probabiltiy of seeing a given number of observation.

Strategy

Roll over the unique SNPs for each group of edited cell lines
Get the guide(s) used to edited the cell lines
Roll over each chromosomes with SNPs on them
Compute the number of 100 bp window for a given chromosome
Compute the rate of having a SNPs within a given windows
Remove the SNPs falling within the targeted gene
Count the number of SNPs wihtin 100 bp to a predicted CRIPSR off target site
Compute the p-value from a Poisson distribution of event at a given rate

binSize <- 100

dt.m1 <- melt(samplesDT,
  measure.vars = c("guide1", "guide2", "guide3"),
  variable.name = "guide",
  value.name = "guideId"
)
dt.m1[, guideId := sub(" ", "_", guideId)]

genes <- genes(txdb)

targetGenes <- unique(sub("-.+", "", grep("EWT|parental", names(groupUniqueSnps), value = TRUE, invert = TRUE)))
symbol2ez <- select(org.Hs.eg.db, targetGenes, "ENTREZID", "SYMBOL")
symbol2ez[is.na(symbol2ez$ENTREZID), ]$ENTREZID <- select(
  org.Hs.eg.db,
  symbol2ez[is.na(symbol2ez$ENTREZID), ]$SYMBOL, "ENTREZID", "ALIAS"
)$ENTREZID

dd <- do.call(rbind, mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]

  targetedGene <- sub("-.+", "", subGroup <- sub("EWT-", "", currGroup))
  whithinTargetedGene <- findOverlaps(
    genes[symbol2ez$ENTREZID[symbol2ez$SYMBOL == targetedGene]],
    casOffGR,
    ignore.strand = TRUE
  )
  casOffOutsideGene <- casOffGR[-subjectHits(whithinTargetedGene)]

  guides <- dt.m1[group == currGroup, unique(guideId)[unique(guideId) != ""]]
  if (length(guides) == 0) {
    return(data.table())
  }

  ## Get the chomosome name that has SNPs
  chrs <- table(seqnames(vcf))
  chrs <- names(chrs[chrs > 0])

  dd <- do.call(rbind, lapply(chrs, function(currChr) {
    subCasOffGR <- casOffOutsideGene[seqnames(casOffOutsideGene) == currChr & casOffOutsideGene$guide %in% guides]
    chrLength <- seqlengths(vcf)[currChr]
    rate <- 1 / (chrLength / binSize) * sum(seqnames(subCasOffGR) == currChr)

    casOffStarts <- start(subCasOffGR)
    snpsStarts <- start(vcf[seqnames(vcf) == currChr])

    ## If no cas-offinder hits, return early
    if (length(casOffStarts) == 0) {
      return(data.table())
    }

    dists <- sapply(casOffStarts, function(start) min(abs(start - snpsStarts)))

    data.table(
      chr = currChr,
      close2offTarget = sum(dists <= binSize),
      predictedRate = rate
    )
  }))

  dd[, .(group = currGroup, events = sum(close2offTarget), rate = mean(predictedRate))]
}, mc.cores = detectCores(), mc.preschedule = FALSE))

dd <- dd[!is.na(rate)]
dd[events > 0, p.value.num := 1 - ppois(events, rate)]
dd[!is.na(p.value.num), p.value := format(p.value.num, digits = 3, scientific = TRUE)]

datatable(dd[, `:=`(rate = NULL, p.value.num = NULL)], options = list(pageLength = 25))

Should I trust the p-values

Ok, if this is true, doing this anlysis with a CRISPR guide RNA that is not the one used for the edited cell line should not returned significant values. Let’s try it.

Strategy

Use same code as above but reset the guides to a random sample of guides other than the original for the group
Repeat that 10 times and return the count of events per iteration for each group

binSize <- 100

dt.m1 <- melt(samplesDT,
  measure.vars = c("guide1", "guide2", "guide3"),
  variable.name = "guide",
  value.name = "guideId"
)
dt.m1[, guideId := sub(" ", "_", guideId)]

dd <- do.call(rbind, lapply(1:10, function(i) {
  dd <- do.call(rbind, mclapply(names(groupUniqueSnps), function(currGroup) {
    subGroup <- sub("EWT-", "", currGroup)

    guides <- dt.m1[group == currGroup, unique(guideId)[unique(guideId) != ""]]

    if (length(guides) == 0) {
      return(data.table())
    }
    vcf <- groupUniqueSnps[[currGroup]]

    ## randomly sample same number of guides not the original guides
    guides <- sample(dt.m1[, unique(guideId)[!unique(guideId) %in% c(guides, "")]], length(guides))

    ## Get the chomosome name that has SNPs
    chrs <- table(seqnames(vcf))
    chrs <- names(chrs[chrs > 0])

    dd <- do.call(rbind, lapply(chrs, function(currChr) {
      chrLength <- seqlengths(vcf)[currChr]

      rate <- 1 / (chrLength / binSize) * length(casOff[chr == currChr])

      starts <- start(vcf[seqnames(vcf) == currChr])
      dist <- sapply(starts, function(currStart) min(abs(casOff[chr == currChr & id %in% guides, start] - currStart)))

      data.table(
        chr = currChr,
        close2offTarget = sum(dist <= binSize),
        predictedRate = rate
      )
    }))

    dd[, .(group = currGroup, events = sum(close2offTarget), rate = mean(predictedRate))]
  }, mc.cores = detectCores(), mc.preschedule = FALSE))
  dd[, iter := paste0("iter_", i)]
}))

datatable(dcast(dd, group ~ iter, value.var = "events"), options = list(pageLength = 25))

Location of cas-offinder hits falling within 100 bp of a unique SNPs

Strategy:

Roll over the unique SNPs for each group of edited cell lines
Get the guide(s) used to edited the cell lines
Roll over each chromosomes with SNPs on them

binSize <- 100

dt.m1 <- melt(samplesDT,
  measure.vars = c("guide1", "guide2", "guide3"),
  variable.name = "guide",
  value.name = "guideId"
)
dt.m1[, guideId := sub(" ", "_", guideId)]

genes <- genes(txdb)

targetGenes <- unique(sub("-.+", "", grep("EWT|parental", names(groupUniqueSnps), value = TRUE, invert = TRUE)))
symbol2ez <- select(org.Hs.eg.db, targetGenes, "ENTREZID", "SYMBOL")
symbol2ez[is.na(symbol2ez$ENTREZID), ]$ENTREZID <- select(
  org.Hs.eg.db,
  symbol2ez[is.na(symbol2ez$ENTREZID), ]$SYMBOL, "ENTREZID", "ALIAS"
)$ENTREZID

dd <- do.call(rbind, mclapply(names(groupUniqueSnps), function(currGroup) {
  vcf <- groupUniqueSnps[[currGroup]]

  targetedGene <- sub("-.+", "", subGroup <- sub("EWT-", "", currGroup))
  whithinTargetedGene <- findOverlaps(
    genes[symbol2ez$ENTREZID[symbol2ez$SYMBOL == targetedGene]],
    casOffGR,
    ignore.strand = TRUE
  )
  casOffOutsideGene <- casOffGR[-subjectHits(whithinTargetedGene)]

  guides <- dt.m1[group == currGroup, unique(guideId)[unique(guideId) != ""]]
  if (length(guides) == 0) {
    return(data.table())
  }

  ## Get the chomosome name that has SNPs
  chrs <- table(seqnames(vcf))
  chrs <- names(chrs[chrs > 0])

  dd <- do.call(rbind, lapply(chrs, function(currChr) {
    subCasOffGR <- casOffOutsideGene[seqnames(casOffOutsideGene) == currChr & casOffOutsideGene$guide %in% guides]

    casOffStarts <- start(subCasOffGR)
    snpsStarts <- start(vcf[seqnames(vcf) == currChr])

    ## If no cas-offinder hits, return early
    if (length(casOffStarts) == 0) {
      return(data.table())
    }

    dists <- sapply(casOffStarts, function(start) min(abs(start - snpsStarts)))

    f.casOff <- dists <= binSize
    f.vcf <- sapply(snpsStarts, function(start) min(abs(start - casOffStarts)) <= binSize)
    if (sum(f.casOff) == 0) {
      return(data.table())
    }

    dt <- data.table(
      group = currGroup,
      guide = subCasOffGR$guide,
      chr = as.character(seqnames(subCasOffGR)),
      "cas-off start" = start(subCasOffGR),
      strand = as.character(strand(subCasOffGR)),
      PAM = subCasOffGR$PAM,
      seq = subCasOffGR$seq,
      mismatch = subCasOffGR$mismatch,
      dist = dists
    )[f.casOff]

    dt$"SNP start" <- start(vcf[seqnames(vcf) == currChr][f.vcf])
    dt$REF <- as.character(ref(vcf[seqnames(vcf) == currChr][f.vcf]))
    dt$ALT <- sapply(alt(vcf[seqnames(vcf) == currChr][f.vcf]), paste0, collapse = ",")

    samples <- samplesDT[group == currGroup, sample]
    colFilt <- colnames(geno(vcf[seqnames(vcf) == currChr][f.vcf])$GT) %in% samples
    geno <- data.frame(geno(vcf[seqnames(vcf) == currChr][f.vcf])$GT)[, colFilt]
    dt$GT <- apply(geno, 1, function(x) paste0(unique(x[!x %in% c("0/0", "./.", "./0", "0/.")]), collapse = "\n"))
    dt$sample <- apply(geno, 1, function(x) paste0(names(x)[!x %in% c("0/0", "./.", "./0", "0/.")], collapse = "\n"))
    return(dt)
  }))
}, mc.cores = detectCores(), mc.preschedule = FALSE))

if (nrow(dd)) {
  datatable(dd[order(group)]) %>%
    formatStyle(c("PAM", "seq"), fontFamily = "monospace, monospace")
}

dd

sessionInfo

sessionInfo()

## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8          LC_NUMERIC=C             
##  [3] LC_TIME=C.UTF-8           LC_COLLATE=C.UTF-8       
##  [5] LC_MONETARY=C.UTF-8       LC_MESSAGES=C.UTF-8      
##  [7] LC_PAPER=C.UTF-8          LC_NAME=C.UTF-8          
##  [9] LC_ADDRESS=C.UTF-8        LC_TELEPHONE=C.UTF-8     
## [11] LC_MEASUREMENT=C.UTF-8    LC_IDENTIFICATION=C.UTF-8
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3                           
##  [2] ape_5.8-1                               
##  [3] fastreeR_1.12.0                         
##  [4] ggplot2_3.5.2                           
##  [5] htmltools_0.5.8.1                       
##  [6] DT_0.33                                 
##  [7] org.Hs.eg.db_3.21.0                     
##  [8] BSgenome.Hsapiens.UCSC.hg38_1.4.5       
##  [9] BSgenome_1.76.0                         
## [10] rtracklayer_1.68.0                      
## [11] BiocIO_1.18.0                           
## [12] TxDb.Hsapiens.UCSC.hg38.knownGene_3.21.0
## [13] GenomicFeatures_1.60.0                  
## [14] AnnotationDbi_1.70.0                    
## [15] data.table_1.17.0                       
## [16] VariantAnnotation_1.54.0                
## [17] Rsamtools_2.24.0                        
## [18] Biostrings_2.76.0                       
## [19] XVector_0.48.0                          
## [20] SummarizedExperiment_1.38.0             
## [21] Biobase_2.68.0                          
## [22] GenomicRanges_1.60.0                    
## [23] GenomeInfoDb_1.44.0                     
## [24] IRanges_2.42.0                          
## [25] S4Vectors_0.46.0                        
## [26] MatrixGenerics_1.20.0                   
## [27] matrixStats_1.5.0                       
## [28] BiocGenerics_0.54.0                     
## [29] generics_0.1.3                          
## [30] httpgd_2.0.4                            
## 
## loaded via a namespace (and not attached):
##  [1] farver_2.1.2             blob_1.2.4               R.utils_2.13.0          
##  [4] bitops_1.0-9             fastmap_1.2.0            RCurl_1.98-1.17         
##  [7] GenomicAlignments_1.44.0 XML_3.99-0.18            digest_0.6.37           
## [10] lifecycle_1.0.4          KEGGREST_1.48.0          RSQLite_2.3.9           
## [13] magrittr_2.0.3           compiler_4.5.0           rlang_1.1.6             
## [16] sass_0.4.10              tools_4.5.0              yaml_2.3.10             
## [19] knitr_1.50               labeling_0.4.3           S4Arrays_1.8.0          
## [22] htmlwidgets_1.6.4        bit_4.6.0                curl_6.2.2              
## [25] DelayedArray_0.34.1      RColorBrewer_1.1-3       abind_1.4-8             
## [28] BiocParallel_1.42.0      unigd_0.1.3              withr_3.0.2             
## [31] R.oo_1.27.0              grid_4.5.0               scales_1.4.0            
## [34] cli_3.6.5                rmarkdown_2.29           crayon_1.5.3            
## [37] httr_1.4.7               rjson_0.2.23             DBI_1.2.3               
## [40] cachem_1.1.0             stringr_1.5.1            restfulr_0.0.15         
## [43] vctrs_0.6.5              Matrix_1.7-3             jsonlite_2.0.0          
## [46] bit64_4.6.0-1            crosstalk_1.2.1          systemfonts_1.2.2       
## [49] jquerylib_0.1.4          glue_1.8.0               codetools_0.2-20        
## [52] stringi_1.8.7            rJava_1.0-11             gtable_0.3.6            
## [55] UCSC.utils_1.4.0         tibble_3.2.1             pillar_1.10.2           
## [58] GenomeInfoDbData_1.2.14  R6_2.6.1                 evaluate_1.0.3          
## [61] lattice_0.22-7           R.methodsS3_1.8.2        png_0.1-8               
## [64] memoise_2.0.1            bslib_0.9.0              Rcpp_1.0.14             
## [67] nlme_3.1-168             SparseArray_1.8.0        xfun_0.52               
## [70] pkgconfig_2.0.3

GT imbalances

Intro

Data.table linking samples to groups

Rational:

Let’s create a table where the samples in the VCF are associated with a group representing a set of clones carrying the same CRISPR edit type.

Format:

The iSCORE-PD_design.csv file is a comma seperated text file starting with a header and with one cell line per line with the following columns:

samples: Sample ID found in VCF header of the joint genotyping output file
group: Group ID linking the sample in group
meta: Additional group relation. Not used anymore

samplesDT <- setDT(read.csv("./supporting_files/iSCORE-PD_design.csv"))
datatable(samplesDT)

GT Analysis

Rational

Many SNPs in the sampeles have more than one GT, more than the number of unique SNPs. What’s going on?

Fraction of samples carying the same genotypes

Ok, Let’s look at chromosome 1 only

vcf <- readVcf(vcf.files[1])

groups <- samplesDT[!group %in% c("parental", "EWT"), unique(group), ]

## Get the sample names of the group under analysis
currUniverse <- samplesDT[-grep("EWT", group), samples]

asSNPs <- apply(geno(vcf)$GT[, currUniverse], 1, function(x) !all(x %in% c("./.", "0/0", "./0", "0/.")))

vcf_with_snps <- vcf[asSNPs & qual(vcf) >= 30]

asSameGT <- apply(geno(vcf_with_snps)$GT[, currUniverse], 1, function(x) length(unique(x)) == 1)

## Fraction of SNPs that have the same GT over all the samples
sum(asSameGT) / length(asSameGT)

## [1] 0.9096058

Franction of SNPs where all genotypes have an uncalled allele

geno <- geno(vcf_with_snps[!asSameGT])$GT[, currUniverse]

lvls <- combn(0:4, 2, paste, collapse = "/")
lvls <- c(lvls, sapply(0:4, function(x) paste(rep(x, 2), collapse = "/")))

unique_geno <- apply(geno, 1, function(x) unique(x[-grep("\\.", x, )]))

## Franction of SNPs where all genotypes have an uncalled allele
sum(!elementNROWS(unique_geno) > 1) / length(unique_geno)

## [1] 0.6940722

Number of unique GT that have calls

unique_geno <- unique_geno[elementNROWS(unique_geno) > 1]
length(unique_geno)

## [1] 9429

Fraction of GT transition from Het->Homo (or vice versa)

Turns out that a large fraction of the discordant GT are transition between Het and Homo.

dt1 <- data.table(
  snp = names(unique_geno),
  gts = sapply(unique_geno, function(x) paste(factor(x, levels = lvls), collapse = "|"))
)

dt <- data.table(table(sapply(unique_geno, function(x) paste(factor(x, levels = lvls), collapse = "|"))))
setnames(dt, "V1", "gts")
dt[, gtCount := elementNROWS(strsplit(gts, "\\|"))]
dt[, alleleCount := unlist(lapply(strsplit(gts, "\\||/"), function(x) length(unique(x))))]
dt[, homo2het := "No"]
dt[!(alleleCount > 2 | gtCount > 2), homo2het := lapply(strsplit(gts, "\\|"), function(y) {
  yss <- lapply(strsplit(y, "/"), unique)
  if (any(yss[[1]] %in% yss[[2]] | yss[[2]] %in% yss[[1]])) "yes"
})]

dt[order(factor(homo2het, levels = c("yes", "no")))]

dt[homo2het == "yes", sum(N)] / dt[, sum(N)]

## [1] 0.5698377

Most likely, the source of these transitions are homologous DNA repair, where during replication, a break hapened and is being recessed then repaired by the homolgous chromatid. Any Het SNPs in within the recessed region will be converted to the reciprocal allele, converting the site from Het to Homo

Dist Analysis

Distribution of distances between any 2 SNPs

If the hypothesis is true, there’s a greather likely hood to transition SNPs from Het to Homo that are closer to each others. Let’s look at that.

homo_2_het_snps <- dt1[gts %in% dt[homo2het == "yes", gts], snp]
start1 <- start(vcf[homo_2_het_snps])
start2 <- c(start1[-1], seqlengths(vcf)[unique(seqnames(vcf))])

df <- data.frame(
  dist2next = start2 - start1,
  type = "homo2het"
)

not_homo_2_het_snps <- dt1[!gts %in% dt[homo2het == "yes", gts], snp]
start3 <- start(vcf[not_homo_2_het_snps])
start4 <- c(start3[-1], seqlengths(vcf)[unique(seqnames(vcf))])

df <- rbind(df, data.frame(
  dist2next = start4 - start3,
  type = "notHomo2het"
))

p <- ggplot(df, aes(log10(dist2next), color = type)) +
  geom_density()

print(p)

Read Depth

Computing the Read Depth distrubtion for Major vs Minor Genotypes

Let’s take the SNPs that have samples with genotyp transitions from het to homo and look at the read depth coverage from the sample correspoding to the major GT (ie samples from the group with most of one of the two GT) compare to the samples with the minor GT (ie sample from the group wiht the least of one of the two GT). Also, as a control, let’s sample the compute the read coverage from SNPs where all the genotypes are identical (well do some sampling here to reduce the size of the finale data frame).

df <- do.call(rbind, mclapply(dt[homo2het == "yes" & N > 10, gts], function(currCase) {
  gts <- strsplit(currCase, "\\|")[[1]]
  currVcf <- vcf[dt1[gts == currCase, snp], currUniverse]

  majorDP <- unlist(sapply(seq_len(length(currVcf)), function(i) {
    major <- which.max(c(sum(geno(currVcf)$GT[i, ] == gts[1]), sum(geno(currVcf)$GT[i, ] == gts[2])))
    f <- geno(currVcf)$GT[i, ] == gts[major]
    geno(currVcf)$DP[i, f]
  }))

  minorDP <- unlist(sapply(seq_len(length(currVcf)), function(i) {
    major <- which.min(c(sum(geno(currVcf)$GT[i, ] == gts[1]), sum(geno(currVcf)$GT[i, ] == gts[2])))
    f <- geno(currVcf)$GT[i, ] == gts[major]
    geno(currVcf)$DP[i, f]
  }))

  f <- apply(geno(vcf[, currUniverse])$GT, 1, function(x) length(unique(x)) == 1 && unique(x) %in% gts)
  uniqueDP <- as.vector(geno(vcf[f][sample(sum(f), length(currVcf))])$DP)

  df <- data.frame(
    DP = c(majorDP, minorDP, uniqueDP),
    sample = c(
      rep("Major GT", length(majorDP)),
      rep("Minor GT", length(minorDP)),
      rep("Unique GT", length(uniqueDP))
    )
  )
}, mc.preschedule = FALSE, mc.cores = detectCores()))

p <- ggplot(df, aes(DP, color = sample)) +
  geom_density() +
  xlim(0, 60) +
  labs(
    title = "Distributions of squencing depth (deepVariant DP) between samples from major, minor, or common GT",
    x = "Read Deapth"
  )

print(p)

## Warning: Removed 3211 rows containing non-finite outside the scale range
## (`stat_density()`).

tapply(df$DP, df$sample, summary)

## $`Major GT`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   18.00   25.00   25.24   31.00  247.00 
## 
## $`Minor GT`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    14.0    20.0    21.4    27.0   243.0 
## 
## $`Unique GT`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   25.00   30.00   30.55   35.00  275.00

Computing the Allelic Read Depth Ratio for Major vs Minor Genotypes

Looking at same SNPs as above but computing the ration of read depth for one allele compare to the other allele

df <- do.call(rbind, mclapply(dt[homo2het == "yes" & N > 10, gts], function(currCase) {
  gts <- strsplit(currCase, "\\|")[[1]]
  currVcf <- vcf[dt1[gts == currCase, snp], currUniverse]

  majorAD <- unlist(sapply(seq_len(length(currVcf)), function(i) {
    major <- which.max(c(sum(geno(currVcf)$GT[i, ] == gts[1]), sum(geno(currVcf)$GT[i, ] == gts[2])))
    f <- geno(currVcf)$GT[i, ] == gts[major]
    AD <- geno(currVcf)$AD[i, f]
    sapply(AD, function(x) x[1] / sum(x))
  }))

  minorAD <- unlist(sapply(seq_len(length(currVcf)), function(i) {
    minor <- which.min(c(sum(geno(currVcf)$GT[i, ] == gts[1]), sum(geno(currVcf)$GT[i, ] == gts[2])))
    f <- geno(currVcf)$GT[i, ] == gts[minor]
    AD <- geno(currVcf)$AD[i, f]
    sapply(AD, function(x) x[1] / sum(x))
  }))

  f <- apply(geno(vcf[, currUniverse])$GT, 1, function(x) length(unique(x)) == 1 && unique(x) %in% gts)
  sampleVcf <- vcf[f][sample(sum(f), length(currVcf))]

  uniqueAD <- unlist(sapply(seq_len(length(sampleVcf)), function(i) {
    AD <- geno(sampleVcf)$AD[i, ]
    sapply(AD, function(x) x[1] / sum(x))
  }))

  df <- data.frame(
    AD = c(majorAD, minorAD, uniqueAD),
    sample = c(
      rep("Major AD", length(majorAD)),
      rep("Minor AD", length(minorAD)),
      rep("Unique AD", length(uniqueAD))
    )
  )
}, mc.preschedule = FALSE, mc.cores = detectCores()))

ggplot(df, aes(AD)) +
  geom_histogram() +
  facet_grid(rows = vars(sample), scales = "free") +
  labs(
    title = "Distributions of fraction of Allele Depth (deepVariant AD)\n between samples from major, minor, or common GT", # nolint: line_length_linter.
    x = "Read Deapth"
  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 3839 rows containing non-finite outside the scale range
## (`stat_bin()`).

tapply(df$AD, df$sample, summary)

## $`Major AD`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.2222  0.4737  0.4538  0.6667  1.0000    3652 
## 
## $`Minor AD`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.2000  0.5385  0.5445  1.0000  1.0000     185 
## 
## $`Unique AD`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.07692 0.45455 0.37394 0.54167 1.00000       2

sessionInfo

sessionInfo()

## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8          LC_NUMERIC=C             
##  [3] LC_TIME=C.UTF-8           LC_COLLATE=C.UTF-8       
##  [5] LC_MONETARY=C.UTF-8       LC_MESSAGES=C.UTF-8      
##  [7] LC_PAPER=C.UTF-8          LC_NAME=C.UTF-8          
##  [9] LC_ADDRESS=C.UTF-8        LC_TELEPHONE=C.UTF-8     
## [11] LC_MEASUREMENT=C.UTF-8    LC_IDENTIFICATION=C.UTF-8
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3                           
##  [2] ape_5.8-1                               
##  [3] fastreeR_1.12.0                         
##  [4] ggplot2_3.5.2                           
##  [5] htmltools_0.5.8.1                       
##  [6] DT_0.33                                 
##  [7] org.Hs.eg.db_3.21.0                     
##  [8] BSgenome.Hsapiens.UCSC.hg38_1.4.5       
##  [9] BSgenome_1.76.0                         
## [10] rtracklayer_1.68.0                      
## [11] BiocIO_1.18.0                           
## [12] TxDb.Hsapiens.UCSC.hg38.knownGene_3.21.0
## [13] GenomicFeatures_1.60.0                  
## [14] AnnotationDbi_1.70.0                    
## [15] data.table_1.17.0                       
## [16] VariantAnnotation_1.54.0                
## [17] Rsamtools_2.24.0                        
## [18] Biostrings_2.76.0                       
## [19] XVector_0.48.0                          
## [20] SummarizedExperiment_1.38.0             
## [21] Biobase_2.68.0                          
## [22] GenomicRanges_1.60.0                    
## [23] GenomeInfoDb_1.44.0                     
## [24] IRanges_2.42.0                          
## [25] S4Vectors_0.46.0                        
## [26] MatrixGenerics_1.20.0                   
## [27] matrixStats_1.5.0                       
## [28] BiocGenerics_0.54.0                     
## [29] generics_0.1.3                          
## [30] httpgd_2.0.4                            
## 
## loaded via a namespace (and not attached):
##  [1] farver_2.1.2             blob_1.2.4               R.utils_2.13.0          
##  [4] bitops_1.0-9             fastmap_1.2.0            RCurl_1.98-1.17         
##  [7] GenomicAlignments_1.44.0 XML_3.99-0.18            digest_0.6.37           
## [10] lifecycle_1.0.4          KEGGREST_1.48.0          RSQLite_2.3.9           
## [13] magrittr_2.0.3           compiler_4.5.0           rlang_1.1.6             
## [16] sass_0.4.10              tools_4.5.0              yaml_2.3.10             
## [19] knitr_1.50               labeling_0.4.3           S4Arrays_1.8.0          
## [22] htmlwidgets_1.6.4        bit_4.6.0                curl_6.2.2              
## [25] DelayedArray_0.34.1      RColorBrewer_1.1-3       abind_1.4-8             
## [28] BiocParallel_1.42.0      unigd_0.1.3              withr_3.0.2             
## [31] R.oo_1.27.0              grid_4.5.0               scales_1.4.0            
## [34] cli_3.6.5                rmarkdown_2.29           crayon_1.5.3            
## [37] httr_1.4.7               rjson_0.2.23             DBI_1.2.3               
## [40] cachem_1.1.0             stringr_1.5.1            restfulr_0.0.15         
## [43] vctrs_0.6.5              Matrix_1.7-3             jsonlite_2.0.0          
## [46] bit64_4.6.0-1            crosstalk_1.2.1          systemfonts_1.2.2       
## [49] jquerylib_0.1.4          glue_1.8.0               codetools_0.2-20        
## [52] stringi_1.8.7            rJava_1.0-11             gtable_0.3.6            
## [55] UCSC.utils_1.4.0         tibble_3.2.1             pillar_1.10.2           
## [58] GenomeInfoDbData_1.2.14  R6_2.6.1                 evaluate_1.0.3          
## [61] lattice_0.22-7           R.methodsS3_1.8.2        png_0.1-8               
## [64] memoise_2.0.1            bslib_0.9.0              Rcpp_1.0.14             
## [67] nlme_3.1-168             SparseArray_1.8.0        xfun_0.52               
## [70] pkgconfig_2.0.3