Comparison of nanopore and miseq

require("phyloseq")
## Loading required package: phyloseq
require("data.table")
## Loading required package: data.table
require("vegan")
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
source("https://raw.githubusercontent.com/mahendra-mariadassou/phyloseq-extended/master/load-extra-functions.R")
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
## Loading required package: scales
## Loading required package: parallel

Import data

Lets import the data files

otu.table <- read.csv("otu_table.csv", header = T, row.names = 1,sep = ";")
taxon.table <- read.table("taxon_table.csv", header = T, row.names = 1,sep = ";",quote = "", comment.char="#")
taxon.table.nano <- read.table("taxon_table_nano.csv", header = T, row.names = 1,sep = ";",quote = "", comment.char="#")
otu.table.nano <- read.table("OTU_table_nano.csv", header = T, row.names = 1,sep = ";",quote = "", comment.char="#")
sample.table <- read.table("sample_data.csv", header = T, row.names = 1,sep = ";")
sample.table$fertilization <- ifelse(sample.table$treatment%like%"Fertilizer", "yes", "no")
sample.table$pest <- ifelse(sample.table$treatment%like%"Pesticide", "yes", "no")
sample.table$biochar <- ifelse(sample.table$treatment%like%"Biochar", "yes", "no")
sample.table.nano <- sample.table[which(rownames(sample.table)%in%colnames(otu.table.nano)),]
otu.table.miseq <- otu.table[,which(colnames(otu.table)%in%rownames(sample.table.nano))]

OTU.miseq=otu_table(otu.table.miseq, taxa_are_rows = T)
OTU.nano=otu_table(otu.table.nano, taxa_are_rows = T)
TAX.miseq = tax_table(as.matrix(taxon.table))
TAX.nano = tax_table(as.matrix(taxon.table.nano))
SAM= sample_data(sample.table.nano)
data.miseq <- phyloseq(OTU.miseq, TAX.miseq, SAM)
data.nano <- phyloseq(OTU.nano, TAX.nano, SAM)

In order to compare the output from sequencing with nanopore and miseq we can first look at the number of reads that we got for the different samples

par(mfrow=c(1,2))
bar.miseq<- barplot(sort(colSums(otu_table(data.miseq))), las=2, cex.names = 0.5)
bar.nano <- barplot(sort(colSums(otu_table(data.nano))), las=2, cex.names = 0.5)

We see that there is some variation in the number of reads that are produced from each sample. This may affect the calculations of diversity estimates etc.

One way that thisv can be accounted for is to rarify the samples, meaning that the samples are resampled with the smallest number of samples. We can also remove the reads that has not been assigned to any domain such as bacteria and archea and wee remove the otus that is only found in very low frequency.

unique(tax_table(data.miseq)[,c("Domain")])
## Taxonomy Table:     [3 taxa by 1 taxonomic ranks]:
##       Domain    
## otu_1 ""        
## otu_2 "Archaea" 
## otu_9 "Bacteria"
unique(tax_table(data.nano)[,c("Domain")])
## Taxonomy Table:     [2 taxa by 1 taxonomic ranks]:
##        Domain      
## 2      "Bacteria"  
## 186813 "Unassigned"
data.nano.sub <- subset_taxa(data.nano, 
                    Domain != "Unassigned")
data.miseq.sub <- subset_taxa(data.miseq, 
                    Domain != "")

data.nano.sub <-  filter_taxa(data.nano.sub, 
                     function(x) sum(x >= 10) > (1), 
                     prune =  TRUE) 
data.miseq.sub <-  filter_taxa(data.miseq.sub, 
                     function(x) sum(x >= 10) > (1), 
                     prune =  TRUE) 


data.nano.rare <- rarefy_even_depth(data.nano.sub, 
                               sample.size = min(colSums(otu_table(data.miseq.sub))))
## You set `rngseed` to FALSE. Make sure you've set & recorded
##  the random seed of your session for reproducibility.
## See `?set.seed`
## ...
data.miseq.rare <- rarefy_even_depth(data.miseq.sub, 
                               sample.size = min(colSums(otu_table(data.miseq.sub))))
## You set `rngseed` to FALSE. Make sure you've set & recorded
##  the random seed of your session for reproducibility.
## See `?set.seed`
## 
## ...

Now we have rarefied the samples and we can now start to compare the results of the different technologies. We can start with comparing different estimates of the alpha diversity, i.e. the diversity within samples.

Lets calculate the alpha diversity in the different samples. There are several different estimates of alpha diversity. We calculate the species richness, ACE and the shannon index

p.nano <- plot_richness(data.nano.rare, 
                   x="sample", 
                   color="treatment", 
                   measures=c("Observed","Shannon","ACE"), 
                   nrow = 1)
print(p.nano)

And for Miseq

p.miseq <- plot_richness(data.miseq.rare, 
                   x="sample", 
                   color="treatment", 
                   measures=c("Observed","Shannon","ACE"), 
                   nrow = 1)
print(p.miseq)

Lets check if the alpha diversity values from nanopore and miseq are correlated. First we plot them against each other:

rich.nano <- dcast(p.nano$data,  samples ~ variable)
rich.miseq <- dcast(p.miseq$data, samples ~variable)

colnames(rich.miseq)[-1]<- paste(colnames(rich.miseq)[-1], "_miseq", sep = "")

rich.both<- merge(rich.miseq, rich.nano)

par(mfrow=c(3,1))
plot(rich.both$Observed_miseq, rich.both$Observed, pch=19, xlab = "Observed species richness MiSeq", ylab = "Observed species richness ONT")
plot(rich.both$ACE_miseq, rich.both$ACE, pch=19, xlab = "ACE MiSeq", ylab = "ACE ONT")
plot(rich.both$Shannon_miseq, rich.both$Shannon, pch=19, xlab = "Shannon MiSeq", ylab = "Shannon ONT")

We can also test if the correlation is significant and we can calculate the correlation coefficient

cor.test(rich.both$Observed_miseq, rich.both$Observed)
## 
##  Pearson's product-moment correlation
## 
## data:  rich.both$Observed_miseq and rich.both$Observed
## t = 1.4247, df = 20, p-value = 0.1696
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1353840  0.6428833
## sample estimates:
##       cor 
## 0.3035509
cor.test(rich.both$ACE_miseq, rich.both$ACE)
## 
##  Pearson's product-moment correlation
## 
## data:  rich.both$ACE_miseq and rich.both$ACE
## t = 1.4245, df = 20, p-value = 0.1697
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1354336  0.6428537
## sample estimates:
##      cor 
## 0.303505
cor.test(rich.both$Shannon_miseq, rich.both$Shannon)
## 
##  Pearson's product-moment correlation
## 
## data:  rich.both$Shannon_miseq and rich.both$Shannon
## t = 1.8946, df = 20, p-value = 0.0727
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03773842  0.69704863
## sample estimates:
##       cor 
## 0.3900765

The correlation is not significant. However, from the plot it is evident that some of the plots are outliers and it is interesting to see how the correlations are when these are removed. The two samples are MFP3_0909 and
MFP2_0909. There may be different reasons why these are particularly low in the miseq compared with nanopore. Lets remove these two sampels and calculate the correlation

rich.both_removed <- rich.both[which(rich.both$samples!="MFP3_0909"&rich.both$samples!="MFP2_0909"),]
cor.test(rich.both_removed$Observed_miseq, rich.both_removed$Observed)
## 
##  Pearson's product-moment correlation
## 
## data:  rich.both_removed$Observed_miseq and rich.both_removed$Observed
## t = 3.5363, df = 18, p-value = 0.002359
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2759176 0.8437299
## sample estimates:
##       cor 
## 0.6402628
cor.test(rich.both_removed$ACE_miseq, rich.both_removed$ACE)
## 
##  Pearson's product-moment correlation
## 
## data:  rich.both_removed$ACE_miseq and rich.both_removed$ACE
## t = 2.9733, df = 18, p-value = 0.008144
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1761230 0.8105742
## sample estimates:
##      cor 
## 0.573914
cor.test(rich.both_removed$Shannon_miseq, rich.both_removed$Shannon)
## 
##  Pearson's product-moment correlation
## 
## data:  rich.both_removed$Shannon_miseq and rich.both_removed$Shannon
## t = 2.7933, df = 18, p-value = 0.01201
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1419198 0.7981907
## sample estimates:
##       cor 
## 0.5499052

The correlation is now significant and the correlation coefficient is much higher.

Beta diversity

Let us look at the beta diversity. This is the diversity between the samples, meaning how divergent the samples are from each other. First we plot it for the nanopore data:

a <-transform_sample_counts(otu_table(data.nano.rare), function(x) x/sum(x))
dist.nano<- vegdist(as.data.frame(sqrt(t(a))), binary=F, method = "bray")
ord.nano <- ordinate(data.nano.rare,"PCoA",dist.nano)

plot_ordination(data.nano.rare, 
                ord.nano,
                color = "treatment", 
                title = "PCoA sqrt Bray curtis - ONT", 
                label= "SampleID" ) + 
  geom_point(aes(size=rich.nano$Observed)) +
  theme_bw()
## Warning in plot_ordination(data.nano.rare, ord.nano, color = "treatment", :
## Label variable was not found in the available data you provided.No label mapped.

And then we plot it for the MiSeq data

a <-transform_sample_counts(otu_table(data.miseq.rare), function(x) x/sum(x))
dist.miseq<- vegdist(as.data.frame(sqrt(t(a))), binary=F, method = "bray")
ord.miseq <- ordinate(data.miseq.rare,"PCoA",dist.miseq)

plot_ordination(data.miseq.rare, 
                ord.miseq,
                color = "treatment", 
                title = "PCoA sqrt Bray curtis - MiSeq", 
                label= "SampleID" ) + 
  geom_point(aes(size=rich.miseq$Observed)) +
  theme_bw()
## Warning in plot_ordination(data.miseq.rare, ord.miseq, color = "treatment", :
## Label variable was not found in the available data you provided.No label mapped.

It is clear that the nanopore analysis distinguish better the different treatments.

Dominating groups of bacteria

domain_nano <- plot_composition(data.nano.rare, "Domain", "Bacteria", "Phylum", fill="Phylum")+ facet_grid(~treatment, scales="free_x", space="free_x")

domain_miseq <- plot_composition(data.miseq.rare, "Domain", "Bacteria", "Phylum", fill="Phylum")+ facet_grid(~treatment, scales="free_x", space="free_x")

#We can plot the two plots side by side by the grid.arrange function
grid.arrange(domain_nano, domain_miseq, nrow = 1)

It is clear here that it is a large difference in the abundance of the different phyla with the two different technologies. This could be investigated further. Lets look at the phylum Actinobacteriota specifically

acti_nano <- plot_composition(data.nano.rare, "Phylum", "Actinobacteria", "Family", fill="Family")+ facet_grid(~treatment, scales="free_x", space="free_x")

acti_miseq <- plot_composition(data.miseq.rare, "Phylum", "Actinobacteriota", "Family", fill="Family")+ facet_grid(~treatment, scales="free_x", space="free_x")

grid.arrange(acti_nano, acti_miseq, nrow = 1)

We can see that most of the Actinobacteria is unnassigned. This means that it may be that the main reason for the difference in Actinobacteria is a difference in the reference data. Lets look at the proteobacteria that dominates in the nanopore data

proteo_nano <- plot_composition(data.nano.rare, "Phylum", "Proteobacteria", "Family", fill="Family")+ facet_grid(~treatment, scales="free_x", space="free_x")

proteo_miseq <- plot_composition(data.miseq.rare, "Phylum", "Proteobacteria", "Family", fill="Family")+ facet_grid(~treatment, scales="free_x", space="free_x")

grid.arrange(proteo_nano, proteo_miseq, nrow = 1)

Also here we can see that there is quite a large amount of unassigned reads here also for the nanopore data compared with the miseq.

Let us look at the lower level. We see that Xanthomonadaceae varies between the different treatments and let us see how species can be determined here.

Xanth_nano <- plot_composition(data.nano.rare, "Family", "Xanthomonadaceae", "Species", fill="Species")+ facet_grid(~treatment, scales="free_x", space="free_x")

xanth_miseq <- plot_composition(data.miseq.rare, "Family", "Xanthomonadaceae", "Species", fill="Species")+ facet_grid(~treatment, scales="free_x", space="free_x")

grid.arrange(Xanth_nano, xanth_miseq, nrow = 1)

Another examples can be Bacillaceae

bacil_nano <- plot_composition(data.nano.rare, "Family", "Bacillaceae", "Species", fill="Species")+ facet_grid(~treatment, scales="free_x", space="free_x")

bacil_miseq <- plot_composition(data.miseq.rare, "Family", "Bacillaceae", "Species", fill="Species")+ facet_grid(~treatment, scales="free_x", space="free_x")

grid.arrange(bacil_nano, bacil_miseq, nrow = 1)