1 logistics

remember to change “root” to your home directory!

# load packages
pkgs = c("ape", "phangorn", "seqinr", "phytools", "apTreeshape", "DMwR",
         "phyloTop", "outliers", "e1071", "ROCR",
         "igraph", "plotly", "dplyr", "tidyr", "Matrix",
         "geosphere", "leaflet")
pkgs_ui = setdiff(pkgs, rownames(installed.packages()))
if (length(pkgs_ui) > 0) install.packages(pkgs_ui, verbose=F)
sapply(pkgs, require, character.only=T)

# set paths
root = "~/projects/ai4all-sfu_bio"

seq_dir = paste(root, "/data/FASTA.fa", sep="") #original genetic sequence
meta_dir = paste(root, "/data/meta.csv", sep="")
align_dir = paste(root, "/data/alignment.fa", sep="")

aux_dir = paste(root, "/data/Aux_Data.csv", sep="")
df_dir = paste(root, "/data/df.csv", sep="")
tree_rax_dir = paste(root, "/data/FinalH3N2.tree", sep="")


result_dir_ = paste(root, "/result", sep=""); dir.create(result_dir_, showWarnings=F)

# use a previous result; set to "" if starting anew
result_time_ = "2018-07-09_20-09-04"

# options
nseq = 200 #number of sequences to sample; 2*nseq < total # of sequences

##         ape    phangorn      seqinr    phytools apTreeshape        DMwR 
##        TRUE        TRUE        TRUE        TRUE       FALSE        TRUE 
##    phyloTop    outliers       e1071        ROCR      igraph      plotly 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
##       dplyr       tidyr      Matrix   geosphere     leaflet 
##        TRUE        TRUE        TRUE        TRUE        TRUE

1.1 align influenza virus genetic sequences

# load sequence alignments; to align sequences, install mafft @ (https://mafft.cbrc.jp/alignment/software/)
if (!file.exists(align_dir)) system(paste("mafft ", seq_dir, " > ", align_dir, sep=""))
align = read.dna(align_dir, format="fasta")
strains_id = rownames(align)
strains = sapply(strsplit(strains_id,"[-]"), function(x) x[1])
dates = as.Date(sapply(strsplit(strains_id,"[-]"), function(x) x[2]))

1.2 load/subset results

#sample for more recent strains
set.seed(17)
result_time_ = ifelse(result_time_!="" & !dir.exists(paste(result_dir_, "/", result_time_, sep="")), "", result_time_)
result_time = ifelse(result_time_=="", gsub(" ", "_", gsub("[:]", "-", Sys.time())), result_time_)
result_dir = paste(result_dir_, "/", result_time, sep=""); dir.create(result_dir, showWarnings=F)
ind_dir = paste(result_dir, "/ind.csv", sep="")

if (result_time_ == "") {
  sam_ind = rownames(align)[sample(c(1:min(2*nseq,nrow(align))), nseq, replace=F, prob=NULL)]
  write.csv(sam_ind, file=ind_dir, row.names=F)
}
sam_ind = read.csv(ind_dir, stringsAsFactors=F)[,1]
# sam_ind

2 infer phylogeny (ancestral tree)

2.1 UPGMA

calculate distance between sequences

align_n = align[sam_ind,]
align_n = as.phyDat(align_n)
dm = dist.ml(align_n, model="JC69")

infer the tree using upgma

tree_UPGMA = upgma(dm)
plot(tree_UPGMA, show.tip.label=F, main="UPGMA")

2.2 neighbour joining

infer the tree using neighbour joining

tree_NJ = NJ(dm)
plot(tree_NJ, main="Phylogeny: Neighbour Joining", show.tip.label=F)
tree = ladderize(tree_NJ)
plot(tree, main="Phylogeny: Neighbour Joining", show.tip.label=F)
is.rooted(tree_NJ) #is tree rooted
is.binary(tree_NJ) #are tree branches binary

# save to use in a later section
tree_dir = paste(result_dir, "/tree_NJ.Rdata", sep="")
if (!file.exists(tree_dir)) save(tree_NJ, file=tree_dir)

## [1] FALSE
## [1] TRUE

but which of these trees is a better fit for your data? Using the parsimony() function, you can compare their respective parsimony scores.

parsimony(tree_UPGMA, align_n)
parsimony(tree_NJ, align_n)

## [1] 598
## [1] 561

optim.parsimony tries to find the maximum parsimony tree using either Nearest Neighbor Interchange (NNI) rearrangements or sub tree pruning and regrafting (SPR).

tree_optim = optim.parsimony(tree_NJ,align_n)
tree_pratchet = pratchet(align_n)
plot(tree_optim, main="Phylogeny: Maximum Parsimony NNI", show.tip.label=F)
plot(tree_pratchet, main="Phylogeny: Maximum Parsimony SPR", show.tip.label=F)

## Final p-score 559 after  2 nni operations 
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"

2.3 maximum likelihood

maximum likelihood methods allow you to include the full data from your sequence alignment in a statistical framework that estimates model parameters.

the first thing we need to do is create a special type of object of class “pml”. This object includes mostly our tree and data, but also parameters of an evolutionary model, their values (usually set to some default), and the likelihood of the model.

fit = pml(tree_NJ,align_n)

## negative edges length changed to 0!

print(fit)
plot(fit, main="Phylogeny: Neighbour Joining (PML)", show.tip.label=F)

## 
##  loglikelihood: -7200.111 
## 
## unconstrained loglikelihood: -5500.084 
## 
## Rate matrix:
##   a c g t
## a 0 1 1 1
## c 1 0 1 1
## g 1 1 0 1
## t 1 1 1 0
## 
## Base frequencies:  
## 0.25 0.25 0.25 0.25

function optim.pml() will optimize tree topology and branch length for your selected model of nucleotide evolution.

fitJC = optim.pml(fit, model="JC", rearrangement="stochastic")
logLik(fitJC)
plot(fitJC, main="Phylogeny: Neighbour Joining Optimized (PML)", show.tip.label=F)

## optimize edge weights:  -7200.111 --> -7175.895 
## optimize edge weights:  -7175.895 --> -7175.895 
## optimize topology:  -7175.895 --> -7165.78 
## optimize topology:  -7165.78 --> -7165.78 
## optimize topology:  -7165.78 --> -7164.921 
## 4 
## optimize edge weights:  -7164.921 --> -7164.921 
## optimize topology:  -7164.921 --> -7164.921 
## 0 
## [1] "Ratchet iteration  1 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  2 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  3 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  4 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  5 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  6 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  7 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  8 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  9 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  10 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  11 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  12 , best pscore so far: -7164.92050394874"
## [1] "Ratchet iteration  13 , best pscore so far: -7164.92050394874"
## [1] "Ratchet iteration  14 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration  15 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration  16 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration  17 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  18 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  19 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  20 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  21 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  22 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  23 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  24 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  25 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  26 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  27 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  28 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  29 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  30 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  31 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  32 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  33 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  34 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration  35 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration  36 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration  37 , best pscore so far: -7164.92049494019"
## [1] "Ratchet iteration  38 , best pscore so far: -7164.92049494019"
## [1] "Ratchet iteration  39 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  40 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  41 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  42 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  43 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  44 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  45 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  46 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  47 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  48 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  49 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  50 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  51 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  52 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  53 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  54 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  55 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  56 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  57 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  58 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  59 , best pscore so far: -7164.92049435348"
## optimize edge weights:  -7164.92 --> -7164.92 
## optimize topology:  -7164.92 --> -7164.92 
## 0 
## optimize edge weights:  -7164.92 --> -7164.92 
## 'log Lik.' -7164.92 (df=397)

note that the object returned by optim.pml is not a phylogeny, but an optimized object of class “pml” (which contains an optimized phylogeny with edge lengths). Let’s plot this phylogeny:

tree_JC = fitJC$tree
plot(tree_JC, type="phylogram",  lab4ut="horizontal", edge.width = 2, show.tip.label=F)

3 infer ancestral sequences

3.1 reconstruct ancestrial sequences

# define what tree we want to do ancestral sequence reconstruction for; since we saved this tree_NJ before, let's use this one
tree_type = "NJ"
tree = tree_NJ
if (file.exists(tree_dir)) tree = get(load(tree_dir))

# convert alignment format
align_npd = phyDat(align_n) 

# parsimony reconstructions: based on the fitch algorithm for bifurcating trees (note: there will be often no unique solution)
anc = ancestral.pars(tree, align_npd, "ACCTRAN")
plotAnc(tree, anc, cex.pie=.5, show.tip.label=F)
title(paste0(tree_type, ": ACCTRAN"))

3.2 match ancestral sequences with known strains

align known influenza virus with ancestral sequences; known strains must have a date preceding their child strains; we assume there are at least 2 layers in a tree

# get sequences ready (combine ancestral with known strains)
seqa_dir = paste(result_dir, "/FASTA_anc.fa", sep="")
seq_all_dir = paste(result_dir, "/FASTA_all.fa", sep="")

# merge ancestral and unknown sequences
if (!file.exists(seq_all_dir)) {
  sequ = read.dna(seq_dir, format="fasta")
  sequ_anc = as.DNAbin(lapply(anc, function(x) colnames(x)[apply(x,1,function(y) which.max(y))] ))
  write.dna(sequ_anc, file=seqa_dir, format="fasta")
  
  sequ_anc = read.dna(seqa_dir, format="fasta", as.matrix=F)
  
  sequ_all = append(sequ_anc, sequ)
  sequ_all = sequ_all[!(duplicated(names(sequ_all)))] # leaf sequences not included in distance calulcation
  write.dna(sequ_all, file=seq_all_dir, format="fasta")
}

# align sequences, install mafft @ (https://mafft.cbrc.jp/alignment/software/)
align_anc_dir = paste(result_dir, "/alignment_anc.fa", sep="")
if (!file.exists(align_anc_dir)) system(paste("mafft ", seq_all_dir, " > ", align_anc_dir, sep=""))
align_anc = read.dna(align_anc_dir, format="fasta")

3.2.1 calculate distance from alignments

# get distances ready
dm_anc_dir = paste(result_dir, "/dm_anc.Rdata", sep="")
if (!file.exists(dm_anc_dir)) {
  dm_anc_ = dist.ml(phyDat(align_anc),model="JC69")
  dm_anc = as.matrix(dm_anc_, sparse=T)
  tree_inds = !rownames(dm_anc)%in%strains_id | rownames(dm_anc)%in%sam_ind
  dm_anc = dm_anc[tree_inds, !tree_inds]
  save(dm_anc, file=dm_anc_dir)
}
dm_anc = get(load(dm_anc_dir))
strains_kn = sapply(strsplit(rownames(dm_anc),"[-]"), function(x) x[1])
strains_unkn = sapply(strsplit(colnames(dm_anc),"[-]"), function(x) x[1])
dates_unkn = as.Date(sapply(strsplit(colnames(dm_anc),"[-]"), function(x) x[2]))
names(dates_unkn) = strains_unkn

3.2.2 label tree nodes

evo_month = 444 # max number of months needed for a parent strain to evolve into a child strain (max = 444)

# load metadata
meta = read.csv(meta_dir, stringsAsFactors=F)[,-1]
rownames(meta) = meta[,"strain"]

# label nodes starting from root using depth first search
edges = tree$edge[order(tree$edge[,2]),] #first nseq nodes are leaves
edges[1:nseq,2] = tree$tip.label
edges[,2] = sapply(strsplit(edges[,2], "[-]"), function(x) x[1])

nodes = unique(as.vector(edges))
node = unique(edges[!edges[,1]%in%edges[,2],1]) #root as the starting node

try it yourself :)

given the starting node, all nodes, and edges, try labelling all the non-leaf nodes with virus strains (i.e. the numbers inside ‘nodes’ and ‘edges’ should should be replaced by strain names (e.g. KY583561)) following the constrains below: - parent nodes should be a strain had a sampling date earlier than child nodes - parent nodes should be a strain that are most similar to the inferred parent strain (labelled as numbers on the ‘rownames(dm_anc)’) i.e. have the shortest distance in dm_anc matrix (ancestral sequences x known sequences) - tip: you can use ‘as.Date’ to convert character dates into date variables so that you can use the ‘<’, ‘>’ to compare earlier or later dates!

check your answers:

edges_t = get(load(paste(result_dir,"/edges.Rdata", sep=""))) # this is what 'edges' should look like
nodes_t = get(load(paste(result_dir,"/nodes.Rdata", sep=""))) # this is what 'nodes' should look like

3.3 plot infection pathway

prepare nodelist virus & edgelist vphy

edges = edges_t
nodes = nodes_t

# viral strains
virus = meta[nodes, c("strain", "subtype", "date", "lng", "lat")]

# viral phylogeny
vphy = data.frame(id=seq_len(nrow(edges)),
                  start_lng=meta[edges[,1], "lng"], start_lat=meta[edges[,1], "lat"],
                  end_lng=meta[edges[,2], "lng"], end_lat=meta[edges[,2], "lat"])

plot map

edges are coloured based on the child strain; red -> yellow (ancestor -> new strains)

# heat colours: red (early) to yellow (recent)
colours = sapply(heat.colors(length(unique(virus$date))), function(x) gsub("FF$","",x))
names(colours) = sort(unique(virus$date))

# phylogeny edges
geo_lines = gcIntermediate(vphy[,c("start_lng", "start_lat")], 
                           vphy[,c("end_lng", "end_lat")], 
                           n=200, addStartEnd=T, sp=T, breakAtDateLine=T)

# map
leaflet() %>% 
  addMiniMap(tiles = providers$Esri.OceanBasemap, width = 120, height=80) %>%
  addProviderTiles(providers$Esri.OceanBasemap) %>% 
  addCircleMarkers(data=virus, lng=~lng, lat=~lat,
                   color = as.vector(colours[virus[,"date"]]), weight=2, radius=2,
                   label=paste(virus[,"subtype"], " (", 
                               virus[,"strain"], ") ", 
                               virus[,"date"], sep="")) %>%
  addPolylines(data=geo_lines, color = as.vector(colours[meta[edges[,2],"date"]]),
               weight = 1, opacity = 0.5, fill = FALSE, fillOpacity = 0.5, label = NULL)

keep in mind that our data set contains monstly strains found in he USA, therefore most of the strains on our phylogeny will be in the USA; this could be because USA has the most flu strains, or a more practical reason can be simply because USA collects the most data!

let’s see where the root strain comes from

# display root and leaf nodes
meta[unique(edges[!edges[,1]%in%edges[,2],1]),] # root strain
# meta[unique(edges[!edges[,2]%in%edges[,1],2]),] # leaf strains

4 predict future influenza strains

we assume that only the strongest influenza virus strain will sruvive to cause the influenza in the coming season; therefore, we try to predict which strains, out of all the strains we have collected, will survive

4.1 load and clean input data

# load the data
Auxdata = read.csv(aux_dir)[,2:3]
df = read.csv(df_dir,sep=",",stringsAsFactors=F)[-1,-1]
nrow(df)
names(df) = c("Clade","number_of_tips","number_of_tips_prune","sackin",
            "colless","Variance","I2","B1","avgLadder","ILnumber","pitchforks",
            "maxHeight","MaxWidth","DelW","Stairs1","Stairs2","Cherries","BS","descinm","getstattest","skewness","kurtosis","tips_pairwise_distance","tips_pairwise_distance_max",
            "diameter", "WienerIndex", "betweenness", "closeness", "eigenvector","Labels")
tree = read.tree(tree_rax_dir)

#prepare the dataset
df$number_of_tips = as.numeric(df$number_of_tips)
df$number_of_tips_prune = as.numeric(df$number_of_tips_prune)
df$sackin = as.numeric(df$sackin)
df$colless = as.numeric(df$colless)
df$Variance = as.numeric(df$Variance)
df$I2 = as.numeric(df$I2)
df$B1 = as.numeric(df$B1)
df$avgLadder = as.numeric(df$avgLadder)
df$ILnumber  = as.numeric(df$ILnumber)
df$pitchforks = as.numeric(df$pitchforks)
df$maxHeight = as.numeric(df$maxHeight)
df$MaxWidth  = as.numeric(df$MaxWidth)
df$DelW = as.numeric(df$DelW)
df$Stairs1 = as.numeric(df$Stairs1)
df$Stairs2 = as.numeric(df$Stairs2)
df$Cherries = as.numeric(df$Cherries)
df$BS = as.numeric(df$BS )
df$descinm = as.numeric(df$descinm )
df$getstattest =  as.numeric(df$getstattest)
df$skewness = as.numeric(df$skewness)
df$kurtosis =  as.numeric(df$kurtosis)
df$tips_pairwise_distance =  as.numeric(df$tips_pairwise_distance)
df$tips_pairwise_distance_max =  as.numeric(df$tips_pairwise_distance_max)
df$tips_pairwise_distance_max =  as.numeric(df$tips_pairwise_distance_max)
df$diameter =  as.numeric(df$diameter)
df$WienerIndex =  as.numeric(df$WienerIndex)
df$betweenness =  as.numeric(df$betweenness)
df$closeness =  as.numeric(df$closeness)
df$eigenvector  = as.numeric(df$eigenvector)
df$Labels = as.factor(df$Labels)

scaled.df = scale(df[,3:(ncol(df)-1)])
df1 = cbind(as.data.frame(df[,1:2]),scaled.df)
df = cbind(df1,df$Labels)

names(df) = c("Clade","NumberofTipsMain","NumberofTips","sackin",
            "colless","Variance","I2","B1","avgLadder","ILnumber","pitchforks",
            "maxHeight","MaxWidth","DelW","Stairs1","Stairs2","Cherries","BS","descinm","getstattest","skewness","kurtosis","MeanPairwiseDist","MaxPairwiseDist",
            "diameter", "WienerIndex", "betweenness", "closeness", "eigenvector","Labels")

#remove NA's
rm = which(is.na(df), TRUE)
df = df[-rm,]

#remove the outliers
df_outlier = df[,3:(ncol(df)-1)]
outlier.scores = lofactor(df_outlier, k=5)
# plot(density(outlier.scores))

# pick top 5 as outliers
outliers = order(outlier.scores, decreasing=T)[1:5]

# who are outliers? remove them
#print(outliers)
df = df[-outliers,]

## [1] 191

4.2 build prediction model

we train the prediction model on the strains before year 2015 and test on the latest strains (after 2015) clades rooted at 10563 to 11927 (are used for testing)

clades = df[,1]
test_ind = c()
for(i in 1:length(clades)){
  t = extract.clade(tree,as.numeric(clades[i]))
  labels = t$tip.label
  ind = match(labels,Auxdata[,1])
  dates = Auxdata[ind,2]
  dates = as.Date(dates)
  start = min(dates)
  if (start >= "2015-01-01") test_ind=c(test_ind,i)
}
train =  df[-test_ind, ]
test = df[test_ind, ]
dim(train)
dim(test)

train = train[,3:ncol(train)]
test = test[,3:ncol(test)]

## [1] 140  30
## [1] 42 30

try it yourself :)

tune the parameters to choose the best parameters

tune.out = tune(svm, Labels ~ .,
              data = train,kernel ="radial",ranges=list(gamma = 2^c(-5:5),cost=2^c(-5:5)),
              coef0 =0,degree =3,nu = 0.5,class.weigth=c("0"=0.45,"1"=0.55))
tune.out

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##  0.125    1
## 
## - best performance: 0.2428571

build the model (SVM)

svm.fit = svm(data = train, Labels ~.,
              kernel ="radial", degree = 3, gamma = 0.03125 , 
              coef0 = 3, cost =2, nu = 0.5,class.weigth=c("0"=0.45,"1"=0.55))

svm.prob = predict(svm.fit, newdata = test)
summary(svm.prob)

##  0  1 
## 30 12

4.3 evaluate the model

finally, we calculate metrics to evaluate how well prediction model did

table(test$Labels,svm.prob)
agreement = svm.prob == test$Labels
table(agreement)
prop.table(table(agreement))

#computing the true possitive rate
p = length(which(test$Labels==1))
TP = which(test$Labels==1)
TP = length(which(svm.prob[TP]==1))
TPR = TP/p
TPR

#computing the true negative rate
N = length(which(test$Labels==0))
TN = which(test$Labels==0)
TN = length(which(svm.prob[TN]==0))
TNR = TN/N
TNR
#computing the false positive rate
P = length(which(test$Labels==0))
FP = which(test$Labels==0)
FP = length(which(svm.prob[FP]==1))
FPR = FP/P
FPR

#computing the false negative rate
P = length(which(test$Labels==1))
FN = which(test$Labels==1)
FN = length(which(svm.prob[FN]==0))
FNR = FN/P
FNR

#compute the AUC
svmmodel.predict = predict(svm.fit, newdata = test,decision.values=TRUE)
svmmodel.probs = attr(svmmodel.predict,"decision.values")
svmmodel.class = predict(svm.fit,test,type="class")
svmmodel.labels = test$Labels

#roc analysis for test data
svmmodel.prediction = prediction(svmmodel.probs,svmmodel.labels)
svmmodel.performance = performance(svmmodel.prediction,"tpr","fpr")
svmmodel.auc = performance(svmmodel.prediction,"auc")@y.values[[1]]

##    svm.prob
##      0  1
##   0 26  2
##   1  4 10
## agreement
## FALSE  TRUE 
##     6    36 
## agreement
##     FALSE      TRUE 
## 0.1428571 0.8571429 
## [1] 0.7142857
## [1] 0.9285714
## [1] 0.07142857
## [1] 0.2857143

plot the AUC (area under curve)

plot(svmmodel.performance,type="l", col="blue")
round(svmmodel.auc,2)
legend("bottomright", legend=c("AUC=0.90"), fill=c("blue"), cex=0.7)

## [1] 0.89

inferring the infection pathway of influenza

alice & maryam

2018-07-09