FIREVAT 0.5.9
RunFIREVAT
function performs variant refinement using mutational signatures.
This function optimizes the filtering cutoff values by minimizing artifactual signature weights.
With 2 cores (each with 3.5GHz clock speed) this sample code below takes about 10 minutes to run.
library(FIREVAT)
output.dir <- "" # assign output directory
sample.vcf.file <- system.file("extdata",
"DCC_PCAWG_Cell_Lines_HCC1954.vcf",
package = "FIREVAT")
config.file <- system.file("config",
"PCAWG_DKFZ_Cell_Line_Filtering_Params.json",
package = "FIREVAT")
results <- RunFIREVAT(
vcf.file = sample.vcf.file,
vcf.file.genome = 'hg19',
config.file = config.file,
df.ref.mut.sigs = GetPCAWGMutSigs(),
target.mut.sigs = GetPCAWGMutSigsNames(),
df.ref.mut.sigs.groups.colors = GetPCAWGMutSigsEtiologiesColors(),
sequencing.artifact.mut.sigs = PCAWG.All.Sequencing.Artifact.Signatures,
output.dir = output.dir,
objective.fn = Default.Obj.Fn,
num.cores = 2,
ga.pop.size = 100,
ga.max.iter = 5,
ga.run = 5,
ga.pmutation = 0.1,
perform.strand.bias.analysis = TRUE,
ref.forward.strand.var = "TumorDPRefForward",
ref.reverse.strand.var = "TumorDPRefReverse",
alt.forward.strand.var = "TumorDPAltForward",
alt.reverse.strand.var = "TumorDPAltReverse",
annotate = FALSE)
You can also perform manual variant filtering.
library(FIREVAT)
output.dir <- "" # assign output directory
sample.vcf.file <- system.file("extdata",
"DCC_PCAWG_Cell_Lines_HCC1954.vcf",
package = "FIREVAT")
config.file <- system.file("config",
"PCAWG_DKFZ_Cell_Line_Filtering_Params.json",
package = "FIREVAT")
results <- RunFIREVAT(
vcf.file = sample.vcf.file,
vcf.file.genome = 'hg19',
config.file = config.file,
mode = "manual",
df.ref.mut.sigs = GetPCAWGMutSigs(),
target.mut.sigs = GetPCAWGMutSigsNames(),
sequencing.artifact.mut.sigs = PCAWG.All.Sequencing.Artifact.Signatures,
output.dir = output.dir,
num.cores = 2,
mutalisk.method = "random.sampling",
perform.strand.bias.analysis = TRUE,
ref.forward.strand.var = "TumorDPRefForward",
ref.reverse.strand.var = "TumorDPRefReverse",
alt.forward.strand.var = "TumorDPAltForward",
alt.reverse.strand.var = "TumorDPAltReverse",
annotate = FALSE)
Using FIREVAT
, you can run Mutalisk
library(FIREVAT)
sample.vcf.file <- system.file("extdata",
"DCC_PCAWG_Cell_Lines_HCC1954.vcf",
package = "FIREVAT")
vcf.obj <- ReadVCF(sample.vcf.file)
mutalisk.results <- RunMutalisk(vcf.obj = vcf.obj,
df.ref.mut.sigs = GetPCAWGMutSigs(),
target.mut.sigs = GetPCAWGMutSigsNames(),
n.sample = 20,
n.iter = 10)
Subsequently, Mutalisk
results can be plotted.
g <- PlotMutaliskResults(
mutalisk.results = mutalisk.results,
signatures = unique(mutalisk.results$identified.mut.sigs),
df.ref.sigs.groups.colors = GetPCAWGMutSigsEtiologiesColors(),
trinuc.max.y = 1,
trinuc.min.y = 0,
mut.type.max.y = 1,
title = "")
print(g$f1)
print(g$f2.1)
print(g$f2.2)
print(g$f2.3)
print(g$f3)
FIREVAT
(FInding REliable Variants without ArTifacts) uses mutational signatures to perform variant refinement and generates accurate signature analysis.
The R package FIREVAT
provides convenient methods to identify artifactual variants and generate visualizations that explain the refinement outcomes. These functions can be used to aid quality control efforts in cancer genomics studies.
The released version of FIREVAT
is available on CRAN
.
install.packages("FIREVAT")
Alternatively, you can install FIREVAT
directly from our GitHub repository.
install.packages("devtools")
library(devtools)
install_github("cgab-ncc/FIREVAT")
To perform variant refinement using RunFIREVAT
, you need to supply a VCF file and a FIREVAT
configuration file.
FIREVAT
accepts a standard Variant Call Format (VCF) file as the primary input. Here are some example VCF files included in our package.
# Sample VCFs
sample.vcf.file.1 <- system.file("extdata",
"DCC_PCAWG_Cell_Lines_HCC1954.vcf",
package = "FIREVAT")
sample.vcf.file.2 <- system.file("extdata",
"DCC_PCAWG_Cell_Lines_HCC1143.vcf",
package = "FIREVAT")
sample.vcf.obj.1 <- ReadVCF(sample.vcf.file.1)
sample.vcf.obj.2 <- ReadVCF(sample.vcf.file.2)
HCC1954
print(head(sample.vcf.obj.1$header))
#> $fileformat
#> [1] "VCFv4.1"
#>
#> $fileDate
#> [1] "20160129"
#>
#> $pancancerversion
#> [1] "1.0"
#>
#> $reference
#> [1] "<ID=hs37d5,Source=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz>;"
#>
#> $center
#> [1] "\"DKFZ\""
#>
#> $workflowName
#> [1] "DKFZ_SNV_workflow"
print(head(sample.vcf.obj.1$data))
#> CHROM POS ID REF ALT QUAL FILTER
#> 1: chr1 249041 <NA> C T NA RE;MAP
#> 2: chr1 786568 <NA> T G NA RE;SB;TAC
#> 3: chr1 816972 rs80078473_816972 T G NA RE;SB;HSDEPTH
#> 4: chr1 816994 rs77988874_816994 A G NA RE;TAC;HSDEPTH;SBAF;FRQ
#> 5: chr1 816995 rs72888865_816995 A C NA RE;TAC;HSDEPTH;SBAF;FRQ;BI
#> 6: chr1 821270 <NA> T G NA RE;TAC;SBAF;FRQ;BI
#> INFO FORMAT CONTROL
#> 1: SOMATIC;SNP;AF=0.00,0.32;MQ=45 GT:DP:DP4 0/0:63:31,32,0,0
#> 2: SOMATIC;SNP;AF=0.00,0.21;MQ=40 GT:DP:DP4 0/0:61:23,38,0,0
#> 3: SOMATIC;SNP;AF=0.03,0.15;MQ=57;DB GT:DP:DP4 0/0:35:20,14,0,1
#> 4: SOMATIC;SNP;AF=0.00,0.09;MQ=58;DB GT:DP:DP4 0/0:40:25,15,0,0
#> 5: SOMATIC;SNP;AF=0.00,0.09;MQ=58;DB GT:DP:DP4 0/0:41:26,15,0,0
#> 6: SOMATIC;SNP;AF=0.00,0.05;MQ=58 GT:DP:DP4 0/0:91:55,36,0,0
#> TUMOR
#> 1: 0/1:38:13,13,6,6
#> 2: 0/1:24:0,19,0,5
#> 3: 0/1:40:18,16,0,6
#> 4: 0/0:43:23,16,0,4
#> 5: 0/0:44:24,16,0,4
#> 6: 0/0:58:30,25,3,0
print(head(sample.vcf.obj.1$genome))
#> [1] "hg19"
HCC1143
head(sample.vcf.obj.2$header)
#> $fileformat
#> [1] "VCFv4.1"
#>
#> $fileDate
#> [1] "20160128"
#>
#> $pancancerversion
#> [1] "1.0"
#>
#> $reference
#> [1] "<ID=hs37d5,Source=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz>;"
#>
#> $center
#> [1] "\"DKFZ\""
#>
#> $workflowName
#> [1] "DKFZ_SNV_workflow"
head(sample.vcf.obj.2$data)
#> CHROM POS ID REF ALT QUAL FILTER
#> 1: chr1 10123 <NA> C A NA RE;SB;TAC;HSDEPTH;FRQ
#> 2: chr1 324486 <NA> G C NA RE;SB;MAP
#> 3: chr1 449637 <NA> G A NA RE;DP;SB;TAC;MAP
#> 4: chr1 536095 rs189259765_536095 C T NA RE;SB;TAC;MAP
#> 5: chr1 537613 rs375637635_537613 C G NA RE;DP;SB;TAC;MAP
#> 6: chr1 605108 rs373700493_605108 A G NA RE;DP;SB;TAC;MAP
#> INFO FORMAT CONTROL
#> 1: SOMATIC;SNP;AF=0.00,0.12;MQ=40 GT:DP:DP4 0/0:209:107,101,1,0
#> 2: SOMATIC;SNP;AF=0.00,0.33;MQ=40 GT:DP:DP4 0/0:15:15,0,0,0
#> 3: SOMATIC;SNP;AF=0.00,0.80;MQ=34 GT:DP:DP4 0/0:10:0,10,0,0
#> 4: SOMATIC;SNP;AF=0.00,0.33;MQ=44;DB GT:DP:DP4 0/0:17:1,16,0,0
#> 5: SOMATIC;SNP;AF=0.00,0.50;MQ=36;DB GT:DP:DP4 0/0:42:14,28,0,0
#> 6: SOMATIC;SNP;AF=0.00,0.40;MQ=38;DB GT:DP:DP4 0/0:23:20,3,0,0
#> TUMOR
#> 1: 0/1:25:22,0,3,0
#> 2: 0/1:18:12,0,6,0
#> 3: 0/1:5:0,1,0,4
#> 4: 0/1:6:0,4,2,0
#> 5: 0/1:4:0,2,2,0
#> 6: 0/1:5:3,0,0,2
head(sample.vcf.obj.2$genome)
#> [1] "hg19"
RunFIREVAT
needs a FIREVAT
configuration (JSON) file that specifies the instructions
for extracting and treating desired filter parameters. We have prepared configuration files for
widely used variant callers.
mutect2.config.file <- system.file("config",
"MuTect2_Filtering_Params.json",
package = "FIREVAT")
muse.config.file <- system.file("config",
"Muse_Filtering_Params.json",
package = "FIREVAT")
somaticsniper.config.file <- system.file("config",
"SomaticSniper_Filtering_Params.json",
package = "FIREVAT")
varscan2.config.file <- system.file("config",
"Varscan_Filtering_Params.json",
package = "FIREVAT")
The JSON/YAML file must have the following information for each filter parameter:
We developed FIREVAT to run with custom ‘config’ files to handle different kinds of VCF formats. Even if two vcfs were called with same variant caller, those files can have different formats due to differences in running options and post-calling steps.
Here is an example of writing your own custom FIREVAT configuration file from scratch.
Let’s suppose that the following two variants were called using Sequenza (obtained from the mutation text file).
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
chr1 758859 . T C . . GC=61;GOODREAD=76;F=0.132;RATIO=0.974028106944897;CNn=2;CNt=2;Mt=1;LPP=-23.5968475500134 . .
chr1 808496 . T C . . GC=52;GOODREAD=88;F=0.102;RATIO=0.974028106944897;CNn=2;CNt=2;Mt=0;LPP=-24.4626670646139 . .
To use the GC value in the format above as a parameter for FIREVAT, you should first create a JSON (or YAML) file like the following:
GC_example.json
The “name” of this desired GC quality metric is “MinGC” to imply that we wish to filter out variants that fail to meet this minimum threshold. The “direction” is “POS” because higher values indicate higher quality variants. The “field” parameter concerns how FIREVAT should extract the desired GC value from the VCF file. The GC value appears in the “INFO” column under the key “GC” in the first index (hence 1). The “default” GC value is 50 (set this as you see fit). You may wish to set the minimum and maximum “range” that FIREVAT will iterate to choose the optimized GC filtering value. In the example below, the “range” is commented out. The “use_in_filter” is an important parameter because FIREVAT will only use this as a filtering parameter if this is set to ‘true’. Where “use_in_filter” should be set to ‘false’ is when you wish to derive intermediate values before calculating the final desired filtering parameter.
[
{
"name": "MinGC", // variable name
"direction": "POS" /* filtering direction; refined > cutoff (POS),
refined < cutoff (NEG) */,
"field": {
"field_type": "INFO", // "INFO" or "FORMAT"
"key": "GC", // key for the value
/* If "field_type" is "FORMAT",
"column_header" value should be given here like:
"column_header": "SAMPLE" or
"column_header": "$10" */
"index": 1 //index for the value (comma separated)
},
"default": 50, // default values for FIREVAT manual mode
// filtering ranges can be given
// "range": [min, max],
"type": "integer", // type of value in R session
"use_in_filter": true /* boolean whether given value is
used in filtering or not */
}
]
Furthermore, you can use your own functions to add a filtering value which is not directly described in the VCF file.
[
{
"name": "RATIO",
"direction": "POS",
"field": {
"field_type": "INFO",
"key": "RATIO",
"index": 1
},
"default": 0,
"type": "float",
"use_in_filter": false
}, // same format with GC_example.json
{
"name": "MinRATIO",
"direction": "POS",
"op": {
/* By adding custom functions in "op", you can add
a new variable "MinRATIO" which is "RATIO"* 100. */
"function_string": "function(a){tryCatch({return(100*a)},error=function(e){return(0)})}",
//custom function, evaluated in R
"args": ["RATIO"] /* argument of custom function, variable
should be defined in the config file */
},
"type": "float",
"default": 0,
"use_in_filter": true
}
]
Here is another example of config file:
[
{
"name": "PrimaryDPAltReverse",
"direction": "POS",
"field": {
"field_type": "FORMAT",
"column_header": "PRIMARY",
"key": "DP4",
"index": 4
},
"type": "integer",
"default": 1,
"range": [0, 10],
"use_in_filter": false
},
{
"name": "TumorVAF",
"direction": "POS",
"op": {
"function_string": "function(a,b,c){tryCatch({return((b+c)/a)},error=function(e){return(0)})}",
"args": ["PrimaryDP", "PrimaryDPAltForward", "PrimaryDPAltReverse"]
},
"type": "float",
"default": 10,
"use_in_filter": true
}
]
Shown below is a summary table of ‘field’ values (and corresponding descriptions) that FIREVAT config files can recognize.
NAME | SUBFIELD | DESCRIPTION |
---|---|---|
name | - | variable name used in R |
direction | - | filtering direction; refined > cutoff (POS), refined < cutoff (NEG) |
field | - | information to access values in vcf |
field | field_type | “INFO” or “FORMAT” |
field | column_header | if field type is “FORMAT”, the column header where wanted value is |
field | key | key for the value (INFO: sep=“=”, FORMAT: sep=“:”) |
field | index | index for the value (comma separated) |
type | - | type of value (integer, string, double, boolean) |
range | - | filtering range of value ( [min, max]) |
use_in_filter | - | boolean whether given value is used in filtering or not |
Shown below is a summary table of ‘operation’ values (and corresponding descriptions) that FIREVAT config file can recognize.
NAME | SUBFIELD | DESCRIPTION |
---|---|---|
name | - | variable name used in R |
direction | - | filtering direction; refined > cutoff (POS), refined > cutoff (NEG) |
op | - | information to access values in vcf |
op | function_string | custom function, evaluated in R |
op | args | args for custom function, variable should be defined in config file |
type | - | type of value (integer, string, double, boolean) |
range | - | filtering range of value ( [min, max]) |
use_in_filter | - | boolean whether given value is used in filtering or not |
FIREVAT
relies on mutational signatures to identify artifactual variants.
The PCAWG signatures include artifactual signatures that accommodate this approach.
GetPCAWGMutSigs()[1:5,1:10]
#> Type SubType SomaticMutationType SBS1 SBS2 SBS3
#> 1 C>A ACA A[C>A]A 0.000371161 0.000000506 0.017372710
#> 2 C>A ACC A[C>A]C 0.001235459 0.000166799 0.017869145
#> 3 C>A ACG A[C>A]G 0.000148379 0.000091500 0.002931684
#> 4 C>A ACT A[C>A]T 0.000548660 0.000087200 0.010447932
#> 5 C>A CCA C[C>A]A 0.000171135 0.000237324 0.024580879
#> SBS4 SBS5 SBS6 SBS7a
#> 1 0.03094174 0.009810659 0.000212145 0.000050400
#> 2 0.03154572 0.009992683 0.000338441 0.000174475
#> 3 0.02296053 0.003030899 0.000052100 0.000107520
#> 4 0.02207244 0.005533286 0.000092000 0.000190755
#> 5 0.07733754 0.007957358 0.001191647 0.000448556
Here are the signatures related to sequencing artifact:
PCAWG.All.Sequencing.Artifact.Signatures
#> [1] "SBS60" "SBS27" "SBS43" "SBS45" "SBS46" "SBS47" "SBS48" "SBS49" "SBS50"
#> [10] "SBS51" "SBS52" "SBS53" "SBS54" "SBS55" "SBS56" "SBS57" "SBS58" "SBS59"
You may supply your own custom mutational signatures. Please make sure that the columns contain the following headers in the order they appear:
GetPCAWGMutSigs()[1:5,1:3]
#> Type SubType SomaticMutationType
#> 1 C>A ACA A[C>A]A
#> 2 C>A ACC A[C>A]C
#> 3 C>A ACG A[C>A]G
#> 4 C>A ACT A[C>A]T
#> 5 C>A CCA C[C>A]A
Your first custom signature should begin at column 4 (in 1-based index in R).
Once you have the right data.frame, you can supply it into df.ref.mut.sigs
input parameter of RunFIREVAT
.
FIREVAT
optimization employs an evaluation approach that incorporates cosine similarity scores and sequencing artifact signature weights (see below) to determine if a given set of filter parameters leads to a more refined set of mutations. Here we first define the terms used for the evaluation approach, namely the objective function.
Cosine similarity score computed from signature analysis using refined mutations: \[ cos\Theta_{r} \]
Sum of sequencing artifact signature weights computed from signature analysis using refined mutations: \[ \Sigma w_{A,r}^i \]
Cosine similarity score computed from signature analysis using artifactual mutations: \[ cos\Theta_{a} \]
Sum of sequencing artifact signature weights computed from signature analysis using artifactual mutations: \[ \Sigma w_{A,a}^i \]
Default.Obj.Fn
(default objective function) is then the product of the four terms above:
\[ cos\Theta_{r} \cdot (1-\Sigma w_{A,r}^i) \cdot cos\Theta_{a} \cdot \Sigma w_{A,a}^i \]
(Experimental) there are different ways of working with the terms to formulate an evaluation method. Such possibilities include logarithmically or exponentially weighting each term. There are other experimental objective functions included in FIREVAT
. Please refer to FIREVAT Experimenta Objective Functions for these. Please note that based on our validation studies, the Default.Obj.Fn
function performed the most reliably and accurately.
You may choose to supply a different subset of target signatures depending on the cancer type by using the input parameter target.mut.sigs
in RunFIREVAT
. For example, we may choose to only identify SBS1, SBS2, SBS5, and SBS13 for the sample VCF file:
output.dir <- "" # assign output directory
target.mut.sigs <- c("SBS1", "SBS2", "SBS5", "SBS13")
sample.vcf.file <- system.file("extdata",
"DCC_PCAWG_Cell_Lines_HCC1954.vcf",
package = "FIREVAT")
config.file <- system.file("config",
"PCAWG_DKFZ_Cell_Line_Filtering_Params.json",
package = "FIREVAT")
results <- RunFIREVAT(
vcf.file = sample.vcf.file,
vcf.file.genome = 'hg19',
config.file = config.file,
df.ref.mut.sigs = GetPCAWGMutSigs(),
target.mut.sigs = target.mut.sigs,
sequencing.artifact.mut.sigs = PCAWG.All.Sequencing.Artifact.Signatures,
output.dir = output.dir,
objective.fn = Default.Obj.Fn,
num.cores = 2,
ga.pop.size = 100,
ga.max.iter = 5,
ga.run = 5,
ga.pmutation = 0.1,
perform.strand.bias.analysis = TRUE,
ref.forward.strand.var = "TumorDPRefForward",
ref.reverse.strand.var = "TumorDPRefReverse",
alt.forward.strand.var = "TumorDPAltForward",
alt.reverse.strand.var = "TumorDPAltReverse",
annotate = FALSE)
To annotate variants, supply a data.frame
with the the following parameters:
as well as any information that you wish to annotate.
Here is an example of including annotation in RunFIREVAT
.
First, download a clinical variant database such as ClinVar GRCh37 or ClinVar GRCh38 (depending on the genome assemlby of your VCF file). In the case of ClinVar, download the clinvar_<date>.vcf.gz
file and unzip this. The ClinVar VCF can be preapred by FIREVAT
for RunFIREVAT
by using the function PrepareAnnotationDB
.
output.dir <- "" # assign output directory
sample.vcf.file <- system.file("extdata",
"DCC_PCAWG_Cell_Lines_HCC1954.vcf",
package = "FIREVAT")
config.file <- system.file("config",
"PCAWG_DKFZ_Cell_Line_Filtering_Params.json",
package = "FIREVAT")
# Unzipped ClinVar VCF file (e.g. "clinvar_hg19_20190212.vcf")
clinvar.vcf.file <- ""
clinvar.vcf.obj <- ReadVCF(vcf.file = clinvar.vcf.file,
genome = "hg19", split.info = TRUE)
df.annotation.db <- PrepareAnnotationDB(annotation.vcf.obj = clinvar.vcf.obj)
# Annotation parameters
cols.to.display = c("GENEINFO", "CLNSIG")
filter.key.value.pairs <- list("CLNSIG" = c("Pathogenic",
"Pathogenic/Likely_pathogenic",
"Likely_pathogenic"))
# Run FIREVAT
results <- RunFIREVAT(
vcf.file = sample.vcf.file,
vcf.file.genome = 'hg19',
config.file = config.file,
df.ref.mut.sigs = GetPCAWGMutSigs(),
target.mut.sigs = GetPCAWGMutSigsNames(),
sequencing.artifact.mut.sigs = PCAWG.All.Sequencing.Artifact.Signatures,
output.dir = output.dir,
objective.fn = Default.Obj.Fn,
num.cores = 6,
ga.pop.size = 100,
ga.max.iter = 5,
ga.run = 5,
ga.pmutation = 0.1,
perform.strand.bias.analysis = TRUE,
ref.forward.strand.var = "TumorDPRefForward",
ref.reverse.strand.var = "TumorDPRefReverse",
alt.forward.strand.var = "TumorDPAltForward",
alt.reverse.strand.var = "TumorDPAltReverse",
annotate = TRUE,
df.annotation.db = df.annotation.db,
annotated.columns.to.display = cols.to.display,
annotation.filter.key.value.pairs = filter.key.value.pairs,
annotation.filter.condition = "AND")
FIREVAT
refinement generates a HTML
report file. Here we explain each element of the report by using the report produced on the HCC1954 cell line.
At the top of the report, you’ll find the basic information about the VCF file that you input as well as the execution time and the GA parameters.
Refinement Optimization section presents the refinement processes and results. Below are the optimized filtering parameters. The resulting refined VCF has been filtered based on these values.
Here is a summary of the variant optimization outcome and process. The values of the four terms constituting the objective function are plotted in each of the 100 iterations. In the subsequent plot the weight of each artifactual signature identified as a result of each filter applied in each iteration is shown.
Optimized Mutational Signature Identificadtion section presents the Mutalisk results by applying the optimized filters. The mutational signature analysis results of the original VCF, refined VCF, and artifactual VCF are shown here. The sequencing artifact signatures apparent in the original VCF are now predominanly present in the artifactual VCF, while refined VCF shows relatively smaller proportion of these signatures.
Mutational signature was performed using Mutalisk by applying the refined filter parameters. The refinement led to filtering out 43,042 (69.23%) of variants in the original VCF, leaving 19,134 (30.77%) as refined point mutations. The cosine similarity score and the residual sum of square from each of the three analyses are also indicated.
Graphical representation of the observed spectrum in each of the original, refined, and artifactual VCFs.
Graphical representation of the reconstructed spectrum in each of the original, refined, and artifactual VCFs.
Graphical representation of the residual spectrum in each of the original, refined, and artifactual VCFs.
Graphical representation of nucleotide substitution types in each of the original, refined, and artifactual VCFs.
Optimized VCF Statistics section presents distribution of each filter parameter in the original, refined, and artifactual VCFs. In each row, the barplot on the right shows the statistical significance (Mann Whitney; two-tailed) between each pair of VCFs.
Variants with Strand Bias section presents strand bias found in each refined and artifactual VCFs. Here we asked FIREVAT
to filter out variants with strand bias in the refined VCF so we do not see any of those variants under the Refined VCF sub-heading. We also performed corrected for false discovery rate and variants were filtered out based on q-values (< 0.05).
VCF Annotation section presents variants with clinical significance. Here we see that while mutations in PIK3CA and TP53 were included in the refined VCF NPHS1 variant was filtered out and included in the artifactual VCF. This result shows that further investigation (e.g. manual inspection with the BAM file) is necessary to assess the veracity of this variant.
Additionally, FIREVAT
generates a set of relevant files useful for downstream analysis of the refinement results:
FIREVAT
FIREVAT
execution that can be readily loaded (TSV file)sessionInfo()
#> R version 3.6.2 (2019-12-12)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] parallel stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] BiocStyle_2.12.0 FIREVAT_0.5.9 Biobase_2.44.0
#> [4] BiocGenerics_0.30.0
#>
#> loaded via a namespace (and not attached):
#> [1] backports_1.1.5 NMF_0.21.0
#> [3] plyr_1.8.5 lazyeval_0.2.2
#> [5] BiocParallel_1.18.1 SnowballC_0.6.0
#> [7] usethis_1.5.1 GenomeInfoDb_1.20.0
#> [9] ggplot2_3.2.1 gridBase_0.4-7
#> [11] digest_0.6.23 foreach_1.4.7
#> [13] htmltools_0.4.0 fansi_0.4.0
#> [15] magrittr_1.5 memoise_1.1.0
#> [17] BSgenome_1.52.0 cluster_2.1.0
#> [19] doParallel_1.0.15 remotes_2.1.0
#> [21] Biostrings_2.52.0 extrafont_0.17
#> [23] matrixStats_0.55.0 bedr_1.0.7
#> [25] BSgenome.Hsapiens.UCSC.hg38_1.4.1 R.utils_2.9.2
#> [27] extrafontdb_1.0 prettyunits_1.0.2
#> [29] colorspace_1.4-1 blob_1.2.0
#> [31] ggrepel_0.8.1 xfun_0.11
#> [33] dplyr_0.8.3 callr_3.4.0
#> [35] crayon_1.3.4 RCurl_1.95-4.12
#> [37] MutationalPatterns_1.10.0 jsonlite_1.6
#> [39] roxygen2_7.0.2 deconstructSigs_1.8.0
#> [41] zeallot_0.1.0 VariantAnnotation_1.30.1
#> [43] iterators_1.0.12 glue_1.3.1
#> [45] registry_0.5-1 gtable_0.3.0
#> [47] zlibbioc_1.30.0 XVector_0.24.0
#> [49] DelayedArray_0.10.0 pkgbuild_1.0.6
#> [51] Rttf2pt1_1.3.7 scales_1.1.0
#> [53] futile.options_1.0.1 DBI_1.1.0
#> [55] rngtools_1.4 bibtex_0.4.2.1
#> [57] Rcpp_1.0.3 progress_1.2.2
#> [59] xtable_1.8-4 bit_1.1-14
#> [61] stats4_3.6.2 httr_1.4.1
#> [63] RColorBrewer_1.1-2 ellipsis_0.3.0
#> [65] farver_2.0.1 pkgconfig_2.0.3
#> [67] XML_3.98-1.20 R.methodsS3_1.7.1
#> [69] labeling_0.3 tidyselect_0.2.5
#> [71] rlang_0.4.2 reshape2_1.4.3
#> [73] AnnotationDbi_1.46.1 munsell_0.5.0
#> [75] tools_3.6.2 cli_2.0.0
#> [77] RSQLite_2.1.5 devtools_2.2.1
#> [79] ggdendro_0.1-20 evaluate_0.14
#> [81] stringr_1.4.0 yaml_2.2.0
#> [83] processx_3.4.1 knitr_1.26
#> [85] bit64_0.9-7 fs_1.3.1
#> [87] caTools_1.17.1.3 purrr_0.3.3
#> [89] doRNG_1.7.1 formatR_1.7
#> [91] GA_3.2 R.oo_1.23.0
#> [93] pracma_2.2.9 xml2_1.2.2
#> [95] biomaRt_2.40.5 compiler_3.6.2
#> [97] rstudioapi_0.10 testthat_2.3.1
#> [99] ggsignif_0.6.0 tibble_2.1.3
#> [101] stringi_1.4.3 ps_1.3.0
#> [103] futile.logger_1.4.3 GenomicFeatures_1.36.4
#> [105] desc_1.2.0 lattice_0.20-38
#> [107] Matrix_1.2-18 vctrs_0.2.1
#> [109] pillar_1.4.3 lifecycle_0.1.0
#> [111] BiocManager_1.30.10 cowplot_1.0.0
#> [113] data.table_1.12.8 bitops_1.0-6
#> [115] rtracklayer_1.44.4 GenomicRanges_1.36.1
#> [117] R6_2.4.1 bookdown_0.16
#> [119] gridExtra_2.3 lsa_0.73.1
#> [121] IRanges_2.18.3 sessioninfo_1.1.1
#> [123] codetools_0.2-16 lambda.r_1.2.4
#> [125] MASS_7.3-51.5 assertthat_0.2.1
#> [127] pkgload_1.0.2 SummarizedExperiment_1.14.1
#> [129] pkgmaker_0.27 rprojroot_1.3-2
#> [131] withr_2.1.2 GenomicAlignments_1.20.1
#> [133] Rsamtools_2.0.3 S4Vectors_0.22.1
#> [135] GenomeInfoDbData_1.2.1 hms_0.5.2
#> [137] VennDiagram_1.6.20 grid_3.6.2
#> [139] rmarkdown_2.0 BSgenome.Hsapiens.UCSC.hg19_1.4.0
#> [141] ggpubr_0.2.4.999