Contents

1 Basic examples

1.1 Variant refinement

RunFIREVAT function performs variant refinement using mutational signatures. This function optimizes the filtering cutoff values by minimizing artifactual signature weights. With 2 cores (each with 3.5GHz clock speed) this sample code below takes about 10 minutes to run.

library(FIREVAT)

output.dir <- "" # assign output directory

sample.vcf.file <- system.file("extdata", 
                               "DCC_PCAWG_Cell_Lines_HCC1954.vcf", 
                               package = "FIREVAT")
config.file <- system.file("config", 
                           "PCAWG_DKFZ_Cell_Line_Filtering_Params.json",
                           package = "FIREVAT")

results <- RunFIREVAT(
    vcf.file = sample.vcf.file,
    vcf.file.genome = 'hg19',
    config.file = config.file,
    df.ref.mut.sigs = GetPCAWGMutSigs(),
    target.mut.sigs = GetPCAWGMutSigsNames(),
    df.ref.mut.sigs.groups.colors = GetPCAWGMutSigsEtiologiesColors(),
    sequencing.artifact.mut.sigs = PCAWG.All.Sequencing.Artifact.Signatures,
    output.dir = output.dir,
    objective.fn = Default.Obj.Fn,
    num.cores = 2,
    ga.pop.size = 100,
    ga.max.iter = 5,
    ga.run = 5,
    ga.pmutation = 0.1,
    perform.strand.bias.analysis = TRUE,
    ref.forward.strand.var = "TumorDPRefForward",
    ref.reverse.strand.var = "TumorDPRefReverse",
    alt.forward.strand.var = "TumorDPAltForward",
    alt.reverse.strand.var = "TumorDPAltReverse",
    annotate = FALSE)

1.2 Manual filtering

You can also perform manual variant filtering.

library(FIREVAT)

output.dir <- "" # assign output directory

sample.vcf.file <- system.file("extdata", 
                               "DCC_PCAWG_Cell_Lines_HCC1954.vcf", 
                               package = "FIREVAT")
config.file <- system.file("config",
                           "PCAWG_DKFZ_Cell_Line_Filtering_Params.json", 
                           package = "FIREVAT")

results <- RunFIREVAT(
    vcf.file = sample.vcf.file,
    vcf.file.genome = 'hg19',
    config.file = config.file,
    mode = "manual",
    df.ref.mut.sigs = GetPCAWGMutSigs(),
    target.mut.sigs = GetPCAWGMutSigsNames(),
    sequencing.artifact.mut.sigs = PCAWG.All.Sequencing.Artifact.Signatures,
    output.dir = output.dir,
    num.cores = 2,
    mutalisk.method = "random.sampling",
    perform.strand.bias.analysis = TRUE,
    ref.forward.strand.var = "TumorDPRefForward",
    ref.reverse.strand.var = "TumorDPRefReverse",
    alt.forward.strand.var = "TumorDPAltForward",
    alt.reverse.strand.var = "TumorDPAltReverse",
    annotate = FALSE)

1.3 Mutational signature analysis

Using FIREVAT, you can run Mutalisk

library(FIREVAT)

sample.vcf.file <- system.file("extdata", 
                               "DCC_PCAWG_Cell_Lines_HCC1954.vcf", 
                               package = "FIREVAT")
vcf.obj <- ReadVCF(sample.vcf.file)

mutalisk.results <- RunMutalisk(vcf.obj = vcf.obj,
                                df.ref.mut.sigs = GetPCAWGMutSigs(),
                                target.mut.sigs = GetPCAWGMutSigsNames(),
                                n.sample = 20,
                                n.iter = 10)

Subsequently, Mutalisk results can be plotted.

g <- PlotMutaliskResults(
    mutalisk.results = mutalisk.results,
    signatures = unique(mutalisk.results$identified.mut.sigs),
    df.ref.sigs.groups.colors = GetPCAWGMutSigsEtiologiesColors(),
    trinuc.max.y = 1,
    trinuc.min.y = 0,
    mut.type.max.y = 1,
    title = "")

1.3.1 Signature contribution probabilities

print(g$f1)

1.3.2 Observed spectrum

print(g$f2.1)

1.3.3 Reconstructed spectrum

print(g$f2.2)

1.3.4 Residual spectrum

print(g$f2.3)

1.3.5 Trinucleotide spectrum

print(g$f3)

2 Introduction

FIREVAT (FInding REliable Variants without ArTifacts) uses mutational signatures to perform variant refinement and generates accurate signature analysis.

The R package FIREVAT provides convenient methods to identify artifactual variants and generate visualizations that explain the refinement outcomes. These functions can be used to aid quality control efforts in cancer genomics studies.

2.1 Installing package

The released version of FIREVAT is available on CRAN.

install.packages("FIREVAT")

Alternatively, you can install FIREVAT directly from our GitHub repository.

install.packages("devtools")
library(devtools)
install_github("cgab-ncc/FIREVAT")

2.2 Inputs

To perform variant refinement using RunFIREVAT, you need to supply a VCF file and a FIREVAT configuration file.

2.2.1 VCF file

FIREVAT accepts a standard Variant Call Format (VCF) file as the primary input. Here are some example VCF files included in our package.

# Sample VCFs
sample.vcf.file.1 <- system.file("extdata",
                                 "DCC_PCAWG_Cell_Lines_HCC1954.vcf",
                                 package = "FIREVAT")
sample.vcf.file.2 <- system.file("extdata",
                                 "DCC_PCAWG_Cell_Lines_HCC1143.vcf",
                                 package = "FIREVAT")

sample.vcf.obj.1 <- ReadVCF(sample.vcf.file.1)
sample.vcf.obj.2 <- ReadVCF(sample.vcf.file.2)

HCC1954

print(head(sample.vcf.obj.1$header))
#> $fileformat
#> [1] "VCFv4.1"
#> 
#> $fileDate
#> [1] "20160129"
#> 
#> $pancancerversion
#> [1] "1.0"
#> 
#> $reference
#> [1] "<ID=hs37d5,Source=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz>;"
#> 
#> $center
#> [1] "\"DKFZ\""
#> 
#> $workflowName
#> [1] "DKFZ_SNV_workflow"
print(head(sample.vcf.obj.1$data))
#>    CHROM    POS                ID REF ALT QUAL                     FILTER
#> 1:  chr1 249041              <NA>   C   T   NA                     RE;MAP
#> 2:  chr1 786568              <NA>   T   G   NA                  RE;SB;TAC
#> 3:  chr1 816972 rs80078473_816972   T   G   NA              RE;SB;HSDEPTH
#> 4:  chr1 816994 rs77988874_816994   A   G   NA    RE;TAC;HSDEPTH;SBAF;FRQ
#> 5:  chr1 816995 rs72888865_816995   A   C   NA RE;TAC;HSDEPTH;SBAF;FRQ;BI
#> 6:  chr1 821270              <NA>   T   G   NA         RE;TAC;SBAF;FRQ;BI
#>                                 INFO    FORMAT          CONTROL
#> 1:    SOMATIC;SNP;AF=0.00,0.32;MQ=45 GT:DP:DP4 0/0:63:31,32,0,0
#> 2:    SOMATIC;SNP;AF=0.00,0.21;MQ=40 GT:DP:DP4 0/0:61:23,38,0,0
#> 3: SOMATIC;SNP;AF=0.03,0.15;MQ=57;DB GT:DP:DP4 0/0:35:20,14,0,1
#> 4: SOMATIC;SNP;AF=0.00,0.09;MQ=58;DB GT:DP:DP4 0/0:40:25,15,0,0
#> 5: SOMATIC;SNP;AF=0.00,0.09;MQ=58;DB GT:DP:DP4 0/0:41:26,15,0,0
#> 6:    SOMATIC;SNP;AF=0.00,0.05;MQ=58 GT:DP:DP4 0/0:91:55,36,0,0
#>               TUMOR
#> 1: 0/1:38:13,13,6,6
#> 2:  0/1:24:0,19,0,5
#> 3: 0/1:40:18,16,0,6
#> 4: 0/0:43:23,16,0,4
#> 5: 0/0:44:24,16,0,4
#> 6: 0/0:58:30,25,3,0
print(head(sample.vcf.obj.1$genome))
#> [1] "hg19"

HCC1143

head(sample.vcf.obj.2$header)
#> $fileformat
#> [1] "VCFv4.1"
#> 
#> $fileDate
#> [1] "20160128"
#> 
#> $pancancerversion
#> [1] "1.0"
#> 
#> $reference
#> [1] "<ID=hs37d5,Source=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz>;"
#> 
#> $center
#> [1] "\"DKFZ\""
#> 
#> $workflowName
#> [1] "DKFZ_SNV_workflow"
head(sample.vcf.obj.2$data)
#>    CHROM    POS                 ID REF ALT QUAL                FILTER
#> 1:  chr1  10123               <NA>   C   A   NA RE;SB;TAC;HSDEPTH;FRQ
#> 2:  chr1 324486               <NA>   G   C   NA             RE;SB;MAP
#> 3:  chr1 449637               <NA>   G   A   NA      RE;DP;SB;TAC;MAP
#> 4:  chr1 536095 rs189259765_536095   C   T   NA         RE;SB;TAC;MAP
#> 5:  chr1 537613 rs375637635_537613   C   G   NA      RE;DP;SB;TAC;MAP
#> 6:  chr1 605108 rs373700493_605108   A   G   NA      RE;DP;SB;TAC;MAP
#>                                 INFO    FORMAT             CONTROL
#> 1:    SOMATIC;SNP;AF=0.00,0.12;MQ=40 GT:DP:DP4 0/0:209:107,101,1,0
#> 2:    SOMATIC;SNP;AF=0.00,0.33;MQ=40 GT:DP:DP4     0/0:15:15,0,0,0
#> 3:    SOMATIC;SNP;AF=0.00,0.80;MQ=34 GT:DP:DP4     0/0:10:0,10,0,0
#> 4: SOMATIC;SNP;AF=0.00,0.33;MQ=44;DB GT:DP:DP4     0/0:17:1,16,0,0
#> 5: SOMATIC;SNP;AF=0.00,0.50;MQ=36;DB GT:DP:DP4    0/0:42:14,28,0,0
#> 6: SOMATIC;SNP;AF=0.00,0.40;MQ=38;DB GT:DP:DP4     0/0:23:20,3,0,0
#>              TUMOR
#> 1: 0/1:25:22,0,3,0
#> 2: 0/1:18:12,0,6,0
#> 3:   0/1:5:0,1,0,4
#> 4:   0/1:6:0,4,2,0
#> 5:   0/1:4:0,2,2,0
#> 6:   0/1:5:3,0,0,2
head(sample.vcf.obj.2$genome)
#> [1] "hg19"

2.2.2 Configuration file

RunFIREVAT needs a FIREVAT configuration (JSON) file that specifies the instructions for extracting and treating desired filter parameters. We have prepared configuration files for widely used variant callers.

mutect2.config.file <- system.file("config",
                                   "MuTect2_Filtering_Params.json",
                                   package = "FIREVAT")
muse.config.file <- system.file("config",
                                "Muse_Filtering_Params.json",
                                package = "FIREVAT")
somaticsniper.config.file <- system.file("config",
                                         "SomaticSniper_Filtering_Params.json",
                                         package = "FIREVAT")
varscan2.config.file <- system.file("config",
                                    "Varscan_Filtering_Params.json",
                                    package = "FIREVAT")

2.2.3 Preparing your own configuration file

The JSON/YAML file must have the following information for each filter parameter:

  • name
  • direction [POS or NEG]
  • field
    • field_type [string; e.g. ‘FORMAT’]
    • column_header [string; e.g. ‘TUMOR’]
    • key [string; e.g. ‘AD’]
    • index [integer; e.g. 2]
  • type [integer or float]
  • default [integer or float]
  • range [min,max; integer or float; e.g. 10,20]
  • use_in_filter [false or true]
  • op
    • function_string [R code that gets evaluated]
    • args [string vector]

We developed FIREVAT to run with custom ‘config’ files to handle different kinds of VCF formats. Even if two vcfs were called with same variant caller, those files can have different formats due to differences in running options and post-calling steps.

Here is an example of writing your own custom FIREVAT configuration file from scratch.

Let’s suppose that the following two variants were called using Sequenza (obtained from the mutation text file).

#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  SAMPLE
chr1    758859  .   T   C   .   .   GC=61;GOODREAD=76;F=0.132;RATIO=0.974028106944897;CNn=2;CNt=2;Mt=1;LPP=-23.5968475500134    .   .
chr1    808496  .   T   C   .   .   GC=52;GOODREAD=88;F=0.102;RATIO=0.974028106944897;CNn=2;CNt=2;Mt=0;LPP=-24.4626670646139    .   .

To use the GC value in the format above as a parameter for FIREVAT, you should first create a JSON (or YAML) file like the following:

GC_example.json

The “name” of this desired GC quality metric is “MinGC” to imply that we wish to filter out variants that fail to meet this minimum threshold. The “direction” is “POS” because higher values indicate higher quality variants. The “field” parameter concerns how FIREVAT should extract the desired GC value from the VCF file. The GC value appears in the “INFO” column under the key “GC” in the first index (hence 1). The “default” GC value is 50 (set this as you see fit). You may wish to set the minimum and maximum “range” that FIREVAT will iterate to choose the optimized GC filtering value. In the example below, the “range” is commented out. The “use_in_filter” is an important parameter because FIREVAT will only use this as a filtering parameter if this is set to ‘true’. Where “use_in_filter” should be set to ‘false’ is when you wish to derive intermediate values before calculating the final desired filtering parameter.

[
  {
    "name": "MinGC", // variable name
    "direction": "POS" /* filtering direction; refined > cutoff (POS),
                           refined < cutoff (NEG) */,
    "field": {
      "field_type": "INFO", // "INFO" or "FORMAT"
      "key": "GC", // key for the value
      /* If "field_type" is "FORMAT", 
           "column_header" value should be given here like:
               "column_header": "SAMPLE" or 
               "column_header": "$10" */
      "index": 1 //index for the value (comma separated)
    },
    "default": 50, // default values for FIREVAT manual mode
    // filtering ranges can be given
    // "range": [min, max],
    "type": "integer", // type of value in R session
    "use_in_filter": true /* boolean whether given value is 
                             used in filtering or not */
  }
]

Furthermore, you can use your own functions to add a filtering value which is not directly described in the VCF file.

[
  {
    "name": "RATIO",
    "direction": "POS",
    "field": {
      "field_type": "INFO",
      "key": "RATIO",
      "index": 1
    },
    "default": 0,
    "type": "float",
    "use_in_filter": false
  }, // same format with GC_example.json
  {
    "name": "MinRATIO",
    "direction": "POS",
    "op": {
      /* By adding custom functions in "op", you can add 
                a new variable "MinRATIO" which is "RATIO"* 100. */
      "function_string": "function(a){tryCatch({return(100*a)},error=function(e){return(0)})}",
      //custom function, evaluated in R
      "args": ["RATIO"] /* argument of custom function, variable 
                                should be defined in the config file */
    },
    "type": "float",
    "default": 0,
    "use_in_filter": true
  }
]

Here is another example of config file:

[
  {
    "name": "PrimaryDPAltReverse",
    "direction": "POS",
    "field": {
      "field_type": "FORMAT",
      "column_header": "PRIMARY",
      "key": "DP4",
      "index": 4
    },
    "type": "integer",
    "default": 1,
    "range": [0, 10],
    "use_in_filter": false
  },
  {
    "name": "TumorVAF",
    "direction": "POS",
    "op": {
      "function_string": "function(a,b,c){tryCatch({return((b+c)/a)},error=function(e){return(0)})}",
      "args": ["PrimaryDP", "PrimaryDPAltForward", "PrimaryDPAltReverse"]
    },
    "type": "float",
    "default": 10,
    "use_in_filter": true
  }
]

2.2.3.1 Configuration file ‘Field’

Shown below is a summary table of ‘field’ values (and corresponding descriptions) that FIREVAT config files can recognize.

NAME SUBFIELD DESCRIPTION
name - variable name used in R
direction - filtering direction; refined > cutoff (POS), refined < cutoff (NEG)
field - information to access values in vcf
field field_type “INFO” or “FORMAT”
field column_header if field type is “FORMAT”, the column header where wanted value is
field key key for the value (INFO: sep=“=”, FORMAT: sep=“:”)
field index index for the value (comma separated)
type - type of value (integer, string, double, boolean)
range - filtering range of value ( [min, max])
use_in_filter - boolean whether given value is used in filtering or not

2.2.3.2 Configuration file ‘Operation’

Shown below is a summary table of ‘operation’ values (and corresponding descriptions) that FIREVAT config file can recognize.

NAME SUBFIELD DESCRIPTION
name - variable name used in R
direction - filtering direction; refined > cutoff (POS), refined > cutoff (NEG)
op - information to access values in vcf
op function_string custom function, evaluated in R
op args args for custom function, variable should be defined in config file
type - type of value (integer, string, double, boolean)
range - filtering range of value ( [min, max])
use_in_filter - boolean whether given value is used in filtering or not

2.2.4 Reference mutational signature matrix

FIREVAT relies on mutational signatures to identify artifactual variants. The PCAWG signatures include artifactual signatures that accommodate this approach.

GetPCAWGMutSigs()[1:5,1:10]
#>   Type SubType SomaticMutationType        SBS1        SBS2        SBS3
#> 1  C>A     ACA             A[C>A]A 0.000371161 0.000000506 0.017372710
#> 2  C>A     ACC             A[C>A]C 0.001235459 0.000166799 0.017869145
#> 3  C>A     ACG             A[C>A]G 0.000148379 0.000091500 0.002931684
#> 4  C>A     ACT             A[C>A]T 0.000548660 0.000087200 0.010447932
#> 5  C>A     CCA             C[C>A]A 0.000171135 0.000237324 0.024580879
#>         SBS4        SBS5        SBS6       SBS7a
#> 1 0.03094174 0.009810659 0.000212145 0.000050400
#> 2 0.03154572 0.009992683 0.000338441 0.000174475
#> 3 0.02296053 0.003030899 0.000052100 0.000107520
#> 4 0.02207244 0.005533286 0.000092000 0.000190755
#> 5 0.07733754 0.007957358 0.001191647 0.000448556

Here are the signatures related to sequencing artifact:

PCAWG.All.Sequencing.Artifact.Signatures
#>  [1] "SBS60" "SBS27" "SBS43" "SBS45" "SBS46" "SBS47" "SBS48" "SBS49" "SBS50"
#> [10] "SBS51" "SBS52" "SBS53" "SBS54" "SBS55" "SBS56" "SBS57" "SBS58" "SBS59"

2.2.5 Preparing your own mutational signature reference matrix

You may supply your own custom mutational signatures. Please make sure that the columns contain the following headers in the order they appear:

GetPCAWGMutSigs()[1:5,1:3]
#>   Type SubType SomaticMutationType
#> 1  C>A     ACA             A[C>A]A
#> 2  C>A     ACC             A[C>A]C
#> 3  C>A     ACG             A[C>A]G
#> 4  C>A     ACT             A[C>A]T
#> 5  C>A     CCA             C[C>A]A

Your first custom signature should begin at column 4 (in 1-based index in R). Once you have the right data.frame, you can supply it into df.ref.mut.sigs input parameter of RunFIREVAT.

2.2.6 Genetic Algorithm (GA) parameters

2.2.7 Objective functions

FIREVAT optimization employs an evaluation approach that incorporates cosine similarity scores and sequencing artifact signature weights (see below) to determine if a given set of filter parameters leads to a more refined set of mutations. Here we first define the terms used for the evaluation approach, namely the objective function.

Cosine similarity score computed from signature analysis using refined mutations: \[ cos\Theta_{r} \]

Sum of sequencing artifact signature weights computed from signature analysis using refined mutations: \[ \Sigma w_{A,r}^i \]

Cosine similarity score computed from signature analysis using artifactual mutations: \[ cos\Theta_{a} \]

Sum of sequencing artifact signature weights computed from signature analysis using artifactual mutations: \[ \Sigma w_{A,a}^i \]

Default.Obj.Fn (default objective function) is then the product of the four terms above: \[ cos\Theta_{r} \cdot (1-\Sigma w_{A,r}^i) \cdot cos\Theta_{a} \cdot \Sigma w_{A,a}^i \]


(Experimental) there are different ways of working with the terms to formulate an evaluation method. Such possibilities include logarithmically or exponentially weighting each term. There are other experimental objective functions included in FIREVAT. Please refer to FIREVAT Experimenta Objective Functions for these. Please note that based on our validation studies, the Default.Obj.Fn function performed the most reliably and accurately.

2.2.8 Selecting target mutational signatures by cancer type

You may choose to supply a different subset of target signatures depending on the cancer type by using the input parameter target.mut.sigs in RunFIREVAT. For example, we may choose to only identify SBS1, SBS2, SBS5, and SBS13 for the sample VCF file:

output.dir <- "" # assign output directory

target.mut.sigs <- c("SBS1", "SBS2", "SBS5", "SBS13")

sample.vcf.file <- system.file("extdata", 
                               "DCC_PCAWG_Cell_Lines_HCC1954.vcf", 
                               package = "FIREVAT")
config.file <- system.file("config", 
                           "PCAWG_DKFZ_Cell_Line_Filtering_Params.json",
                           package = "FIREVAT")
results <- RunFIREVAT(
    vcf.file = sample.vcf.file,
    vcf.file.genome = 'hg19',
    config.file = config.file,
    df.ref.mut.sigs = GetPCAWGMutSigs(),
    target.mut.sigs = target.mut.sigs,
    sequencing.artifact.mut.sigs = PCAWG.All.Sequencing.Artifact.Signatures,
    output.dir = output.dir,
    objective.fn = Default.Obj.Fn,
    num.cores = 2,
    ga.pop.size = 100,
    ga.max.iter = 5,
    ga.run = 5,
    ga.pmutation = 0.1,
    perform.strand.bias.analysis = TRUE,
    ref.forward.strand.var = "TumorDPRefForward",
    ref.reverse.strand.var = "TumorDPRefReverse",
    alt.forward.strand.var = "TumorDPAltForward",
    alt.reverse.strand.var = "TumorDPAltReverse",
    annotate = FALSE)

2.2.9 Mutalisk parameters

2.2.10 Variant annotation parameters

To annotate variants, supply a data.frame with the the following parameters:

  • CHROM
  • POS
  • REF
  • ALT

as well as any information that you wish to annotate.

Here is an example of including annotation in RunFIREVAT.
First, download a clinical variant database such as ClinVar GRCh37 or ClinVar GRCh38 (depending on the genome assemlby of your VCF file). In the case of ClinVar, download the clinvar_<date>.vcf.gz file and unzip this. The ClinVar VCF can be preapred by FIREVAT for RunFIREVAT by using the function PrepareAnnotationDB.

output.dir <- "" # assign output directory

sample.vcf.file <- system.file("extdata", 
                               "DCC_PCAWG_Cell_Lines_HCC1954.vcf", 
                               package = "FIREVAT")
config.file <- system.file("config", 
                           "PCAWG_DKFZ_Cell_Line_Filtering_Params.json", 
                           package = "FIREVAT")

# Unzipped ClinVar VCF file (e.g. "clinvar_hg19_20190212.vcf")
clinvar.vcf.file <- "" 
clinvar.vcf.obj <- ReadVCF(vcf.file = clinvar.vcf.file, 
                           genome = "hg19", split.info = TRUE)
df.annotation.db <- PrepareAnnotationDB(annotation.vcf.obj = clinvar.vcf.obj)

# Annotation parameters
cols.to.display = c("GENEINFO", "CLNSIG")
filter.key.value.pairs <- list("CLNSIG" = c("Pathogenic", 
                                            "Pathogenic/Likely_pathogenic", 
                                            "Likely_pathogenic"))

# Run FIREVAT
results <- RunFIREVAT(
    vcf.file = sample.vcf.file,
    vcf.file.genome = 'hg19',
    config.file = config.file,
    df.ref.mut.sigs = GetPCAWGMutSigs(),
    target.mut.sigs = GetPCAWGMutSigsNames(),
    sequencing.artifact.mut.sigs = PCAWG.All.Sequencing.Artifact.Signatures,
    output.dir = output.dir,
    objective.fn = Default.Obj.Fn,
    num.cores = 6,
    ga.pop.size = 100,
    ga.max.iter = 5,
    ga.run = 5,
    ga.pmutation = 0.1,
    perform.strand.bias.analysis = TRUE,
    ref.forward.strand.var = "TumorDPRefForward",
    ref.reverse.strand.var = "TumorDPRefReverse",
    alt.forward.strand.var = "TumorDPAltForward",
    alt.reverse.strand.var = "TumorDPAltReverse",
    annotate = TRUE,
    df.annotation.db = df.annotation.db,
    annotated.columns.to.display = cols.to.display,
    annotation.filter.key.value.pairs = filter.key.value.pairs,
    annotation.filter.condition = "AND")

3 Outputs

3.1 Report

FIREVAT refinement generates a HTML report file. Here we explain each element of the report by using the report produced on the HCC1954 cell line.

At the top of the report, you’ll find the basic information about the VCF file that you input as well as the execution time and the GA parameters.

Refinement Optimization section presents the refinement processes and results. Below are the optimized filtering parameters. The resulting refined VCF has been filtered based on these values.

Here is a summary of the variant optimization outcome and process. The values of the four terms constituting the objective function are plotted in each of the 100 iterations. In the subsequent plot the weight of each artifactual signature identified as a result of each filter applied in each iteration is shown.

Optimized Mutational Signature Identificadtion section presents the Mutalisk results by applying the optimized filters. The mutational signature analysis results of the original VCF, refined VCF, and artifactual VCF are shown here. The sequencing artifact signatures apparent in the original VCF are now predominanly present in the artifactual VCF, while refined VCF shows relatively smaller proportion of these signatures.

Mutational signature was performed using Mutalisk by applying the refined filter parameters. The refinement led to filtering out 43,042 (69.23%) of variants in the original VCF, leaving 19,134 (30.77%) as refined point mutations. The cosine similarity score and the residual sum of square from each of the three analyses are also indicated.

Graphical representation of the observed spectrum in each of the original, refined, and artifactual VCFs.

Graphical representation of the reconstructed spectrum in each of the original, refined, and artifactual VCFs.

Graphical representation of the residual spectrum in each of the original, refined, and artifactual VCFs.

Graphical representation of nucleotide substitution types in each of the original, refined, and artifactual VCFs.

Optimized VCF Statistics section presents distribution of each filter parameter in the original, refined, and artifactual VCFs. In each row, the barplot on the right shows the statistical significance (Mann Whitney; two-tailed) between each pair of VCFs.

Variants with Strand Bias section presents strand bias found in each refined and artifactual VCFs. Here we asked FIREVAT to filter out variants with strand bias in the refined VCF so we do not see any of those variants under the Refined VCF sub-heading. We also performed corrected for false discovery rate and variants were filtered out based on q-values (< 0.05).

VCF Annotation section presents variants with clinical significance. Here we see that while mutations in PIK3CA and TP53 were included in the refined VCF NPHS1 variant was filtered out and included in the artifactual VCF. This result shows that further investigation (e.g. manual inspection with the BAM file) is necessary to assess the veracity of this variant.

3.2 Files

Additionally, FIREVAT generates a set of relevant files useful for downstream analysis of the refinement results:

  • Optimization log (TSV file). This includes all of the optimization logs plotted in section 1 (Refined Optimization)
  • FIREVAT results (RData file). This contains all of the relevant data specified and processed by FIREVAT
  • Refined mutations (VCF file)
  • Artifactual mutations (VCF file)
  • Summary data on FIREVAT execution that can be readily loaded (TSV file)

4 Session information

sessionInfo()
#> R version 3.6.2 (2019-12-12)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] parallel  stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#> [1] BiocStyle_2.12.0    FIREVAT_0.5.9       Biobase_2.44.0     
#> [4] BiocGenerics_0.30.0
#> 
#> loaded via a namespace (and not attached):
#>   [1] backports_1.1.5                   NMF_0.21.0                       
#>   [3] plyr_1.8.5                        lazyeval_0.2.2                   
#>   [5] BiocParallel_1.18.1               SnowballC_0.6.0                  
#>   [7] usethis_1.5.1                     GenomeInfoDb_1.20.0              
#>   [9] ggplot2_3.2.1                     gridBase_0.4-7                   
#>  [11] digest_0.6.23                     foreach_1.4.7                    
#>  [13] htmltools_0.4.0                   fansi_0.4.0                      
#>  [15] magrittr_1.5                      memoise_1.1.0                    
#>  [17] BSgenome_1.52.0                   cluster_2.1.0                    
#>  [19] doParallel_1.0.15                 remotes_2.1.0                    
#>  [21] Biostrings_2.52.0                 extrafont_0.17                   
#>  [23] matrixStats_0.55.0                bedr_1.0.7                       
#>  [25] BSgenome.Hsapiens.UCSC.hg38_1.4.1 R.utils_2.9.2                    
#>  [27] extrafontdb_1.0                   prettyunits_1.0.2                
#>  [29] colorspace_1.4-1                  blob_1.2.0                       
#>  [31] ggrepel_0.8.1                     xfun_0.11                        
#>  [33] dplyr_0.8.3                       callr_3.4.0                      
#>  [35] crayon_1.3.4                      RCurl_1.95-4.12                  
#>  [37] MutationalPatterns_1.10.0         jsonlite_1.6                     
#>  [39] roxygen2_7.0.2                    deconstructSigs_1.8.0            
#>  [41] zeallot_0.1.0                     VariantAnnotation_1.30.1         
#>  [43] iterators_1.0.12                  glue_1.3.1                       
#>  [45] registry_0.5-1                    gtable_0.3.0                     
#>  [47] zlibbioc_1.30.0                   XVector_0.24.0                   
#>  [49] DelayedArray_0.10.0               pkgbuild_1.0.6                   
#>  [51] Rttf2pt1_1.3.7                    scales_1.1.0                     
#>  [53] futile.options_1.0.1              DBI_1.1.0                        
#>  [55] rngtools_1.4                      bibtex_0.4.2.1                   
#>  [57] Rcpp_1.0.3                        progress_1.2.2                   
#>  [59] xtable_1.8-4                      bit_1.1-14                       
#>  [61] stats4_3.6.2                      httr_1.4.1                       
#>  [63] RColorBrewer_1.1-2                ellipsis_0.3.0                   
#>  [65] farver_2.0.1                      pkgconfig_2.0.3                  
#>  [67] XML_3.98-1.20                     R.methodsS3_1.7.1                
#>  [69] labeling_0.3                      tidyselect_0.2.5                 
#>  [71] rlang_0.4.2                       reshape2_1.4.3                   
#>  [73] AnnotationDbi_1.46.1              munsell_0.5.0                    
#>  [75] tools_3.6.2                       cli_2.0.0                        
#>  [77] RSQLite_2.1.5                     devtools_2.2.1                   
#>  [79] ggdendro_0.1-20                   evaluate_0.14                    
#>  [81] stringr_1.4.0                     yaml_2.2.0                       
#>  [83] processx_3.4.1                    knitr_1.26                       
#>  [85] bit64_0.9-7                       fs_1.3.1                         
#>  [87] caTools_1.17.1.3                  purrr_0.3.3                      
#>  [89] doRNG_1.7.1                       formatR_1.7                      
#>  [91] GA_3.2                            R.oo_1.23.0                      
#>  [93] pracma_2.2.9                      xml2_1.2.2                       
#>  [95] biomaRt_2.40.5                    compiler_3.6.2                   
#>  [97] rstudioapi_0.10                   testthat_2.3.1                   
#>  [99] ggsignif_0.6.0                    tibble_2.1.3                     
#> [101] stringi_1.4.3                     ps_1.3.0                         
#> [103] futile.logger_1.4.3               GenomicFeatures_1.36.4           
#> [105] desc_1.2.0                        lattice_0.20-38                  
#> [107] Matrix_1.2-18                     vctrs_0.2.1                      
#> [109] pillar_1.4.3                      lifecycle_0.1.0                  
#> [111] BiocManager_1.30.10               cowplot_1.0.0                    
#> [113] data.table_1.12.8                 bitops_1.0-6                     
#> [115] rtracklayer_1.44.4                GenomicRanges_1.36.1             
#> [117] R6_2.4.1                          bookdown_0.16                    
#> [119] gridExtra_2.3                     lsa_0.73.1                       
#> [121] IRanges_2.18.3                    sessioninfo_1.1.1                
#> [123] codetools_0.2-16                  lambda.r_1.2.4                   
#> [125] MASS_7.3-51.5                     assertthat_0.2.1                 
#> [127] pkgload_1.0.2                     SummarizedExperiment_1.14.1      
#> [129] pkgmaker_0.27                     rprojroot_1.3-2                  
#> [131] withr_2.1.2                       GenomicAlignments_1.20.1         
#> [133] Rsamtools_2.0.3                   S4Vectors_0.22.1                 
#> [135] GenomeInfoDbData_1.2.1            hms_0.5.2                        
#> [137] VennDiagram_1.6.20                grid_3.6.2                       
#> [139] rmarkdown_2.0                     BSgenome.Hsapiens.UCSC.hg19_1.4.0
#> [141] ggpubr_0.2.4.999