Introduction
CHORD: Classifier of HOmologous Recombination Deficiency (https://github.com/UMCUGenetics/CHORD)
CHORD is a random forest model that uses the relative counts of somatic mutation contexts to predict homologous recombination deficiency (HRD). The primary contexts used by CHORD are deletions with flanking microhomology and 1-100kb structural duplications. Additionally, 1-100kb structural duplications are used to distinguish BRCA1-type HRD from BRCA2-type HRD.
Data
CHORD takes as input somatic VCFs containing SNVs, INDELs and SVs; the
former two can be in the same VCF. It can also take data.frames containing
those variants as input. Because the annotations in some of our processed
Manta VCFs are very large, we prefer to pass CHORD a pre-processed SV
data.frame instead of a VCF, which we generate with the
sigrap::chord_mantavcf2df function.
# Example somatic inputs bundled with the gpgr package: a VCF used for both
# SNVs and indels, and a Manta SV VCF.
# `mustWork = TRUE` turns a missing file into an immediate error here; the
# default would silently return "" and only surface as a less obvious
# failure further downstream.
snvindel_vcf <- system.file(
  "extdata/umccrise/snv/somatic-ensemble-PASS.vcf.gz",
  package = "gpgr",
  mustWork = TRUE
)
sv_vcf <- system.file(
  "extdata/umccrise/sv/manta.vcf.gz",
  package = "gpgr",
  mustWork = TRUE
)
Analysis
# Convert the Manta SV VCF into a data.frame before handing it to CHORD.
sv_df <- sigrap::chord_mantavcf2df(sv_vcf)

# Run CHORD on the SNV/indel VCF together with the pre-processed SV
# data.frame. Passing `vcf.sv = sv_vcf` is the alternative way to supply
# the SVs (directly from the VCF instead of via `df.sv`).
res <- sigrap::chord_run(
  sample.name = "sample_A",
  ref.genome = "hg38",
  vcf.snv = snvindel_vcf,
  df.sv = sv_df,
  sv.caller = "manta",
  verbose = TRUE
)
#>
#> #====== Loading variants from vcfs ======#
#>
#> ## SNVs
#> Warning in fun(libname, pkgname):
#> No reference genome loaded. Please install and load a BSgenome.
#> For example:
#> install.packages('BiocManager')
#> BiocManager::install('BSgenome.Hsapiens.UCSC.hg19')
#> library('BSgenome.Hsapiens.UCSC.hg19')
#>
#> Then specify the BSgenome to the ref.genome arguemnts to the relevant functions.
#> For example:
#> extractSigsSnv(..., ref.genome=BSgenome.Hsapiens.UCSC.hg19)
#> Reading in vcf file...
#> Converting chrom name style to style in ref.genome...
#> Loading required package: BiocGenerics
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:dplyr':
#>
#> combine, intersect, setdiff, union
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, aperm, append, as.data.frame, basename, cbind,
#> colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
#> get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
#> match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
#> Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
#> table, tapply, union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#>
#> Attaching package: 'S4Vectors'
#> The following objects are masked from 'package:dplyr':
#>
#> first, rename
#> The following objects are masked from 'package:base':
#>
#> expand.grid, I, unname
#>
#> Attaching package: 'IRanges'
#> The following objects are masked from 'package:dplyr':
#>
#> collapse, desc, slice
#>
#> ## Indels
#> vcf file is the same for both SNVs and indels. Skipping reading vcf for indels
#>
#> ## SVs
#>
#> #====== Counting mutation contexts ======#
#>
#> ## Single base substitutions
#> Loading variants...
#> Initializing SNV signature output vector...
#> Removing rows with multiple ALT sequences...
#> Subsetting for SNVs...
#> Getting SNV trinucleotide contexts...
#> Converting trinucleotide contexts to substitution contexts...
#> Counting substitution context occurrences...
#> Returning context counts...
#>
#> ## Indel contexts (types x lengths)
#> Loading variants...
#> Removing rows with multiple ALT sequences...
#> Determining indel type...
#> Initializing indel signature output vector...
#> Determining indel length and sequence...
#> Determining the start/end positions for the left/right flanks of each indel...
#> Retrieving flanking sequences...
#> Calculating the number of copies of the indel sequence are present in the 3' flanking sequence...
#> Calculating the (max) number of bases that are homologous to the 5'/3' flanking sequence...
#> Determining indel contexts...
#> Counting indel context occurrences...
#> Returning indel context counts...
#>
#> ## SV contexts (types x lengths)
#> Creating SV type/length lookup table...
#> Counting DEL, DUP, and INV context occurrences...
#> Counting TRA occurrences...
#> Returning SV contexts...
#>
#> #====== Exporting output =========#
#> output.path not specified. Directly returning output
#>
Results
# Compact structural overview of the CHORD result, showing at most four
# elements per list level.
utils::str(object = res, list.len = 4)
#> List of 2
#> $ contexts : num [1, 1:145] 123 82 18 91 92 98 21 94 51 55 ...
#> ..- attr(*, "dimnames")=List of 2
#> .. ..$ : chr "sample_A"
#> .. ..$ : chr [1:145] "A[C>A]A" "A[C>A]C" "A[C>A]G" "A[C>A]T" ...
#> $ prediction:'data.frame': 1 obs. of 17 variables:
#> ..$ sample : chr "sample_A"
#> ..$ p_hrd : num 0.804
#> ..$ hr_status : chr "HR_deficient"
#> ..$ hrd_type : chr "BRCA2_type"
#> .. [list output truncated]
# Flatten the single-row prediction data.frame into a two-column
# (var, value) table and render it.
pred_values <- unlist(res$prediction[1, ])
pred_matrix <- cbind(var = colnames(res$prediction), value = pred_values)
knitr::kable(
  dplyr::as_tibble(pred_matrix, .name_repair = "check_unique"),
  caption = "CHORD results."
)
var | value |
---|---|
sample | sample_A |
p_hrd | 0.804 |
hr_status | HR_deficient |
hrd_type | BRCA2_type |
p_BRCA1 | 0.156 |
p_BRCA2 | 0.648 |
remarks_hr_status | |
remarks_hrd_type | |
p_hrd.5% | 0.753 |
p_hrd.50% | 0.806 |
p_hrd.95% | 0.82 |
p_BRCA1.5% | 0.076 |
p_BRCA1.50% | 0.177 |
p_BRCA1.95% | 0.265 |
p_BRCA2.5% | 0.548 |
p_BRCA2.50% | 0.626 |
p_BRCA2.95% | 0.701 |
# Row indices to display: the first 6 and the last 11 contexts.
col_ind <- c(1:6, (ncol(res$contexts) - 10):ncol(res$contexts))
# `res$contexts` is a 1 x n matrix (sample x context). Transposing it puts
# one context per row; the former column names become the `context` column.
# `t()` works on the matrix directly, so no data.frame round trip is needed.
res$contexts |>
  t() |>
  dplyr::as_tibble(rownames = "context", .name_repair = "check_unique") |>
  # Namespaced like every other dplyr call here, so the chain does not
  # depend on dplyr being attached.
  dplyr::mutate(n = dplyr::row_number()) |>
  dplyr::select(n, dplyr::everything()) |>
  dplyr::slice(col_ind) |>
  knitr::kable(caption = "Sample of rows from contexts counts.")
n | context | sample_A |
---|---|---|
1 | A[C>A]A | 123 |
2 | A[C>A]C | 82 |
3 | A[C>A]G | 18 |
4 | A[C>A]T | 91 |
5 | C[C>A]A | 92 |
6 | C[C>A]C | 98 |
135 | DUP_1e04_1e05_bp | 4 |
136 | DUP_1e05_1e06_bp | 0 |
137 | DUP_1e06_1e07_bp | 0 |
138 | DUP_1e07_Inf_bp | 18 |
139 | INV_0e00_1e03_bp | 0 |
140 | INV_1e03_1e04_bp | 0 |
141 | INV_1e04_1e05_bp | 0 |
142 | INV_1e05_1e06_bp | 0 |
143 | INV_1e06_1e07_bp | 0 |
144 | INV_1e07_Inf_bp | 0 |
145 | TRA | 77 |
Session Info
package | version | datestamp | source |
---|---|---|---|
base | 4.2.3 | 2023-07-12 | local |
CHORD | 2.0 | 2023-07-12 | local |
gpgr | 1.4.4 | 2023-07-12 | local |
mutSigExtractor | 1.25 | 2023-07-12 | local |
sigrap | 0.1.1 | 2023-07-12 | local |
name | value |
---|---|
version | R version 4.2.3 (2023-03-15) |
os | Ubuntu 22.04.2 LTS |
system | x86_64, linux-gnu |
ui | X11 |
language | en |
collate | C.UTF-8 |
ctype | C.UTF-8 |
tz | Etc/UTC |
date | 2023-07-12 |
pandoc | 2.19.2 @ /usr/share/miniconda/envs/pkgdownenv/bin/ (via rmarkdown) |