R-ODAF DEG Analysis

#R-ODAF Differential Expression Analysis

The source code for R-ODAF’s DEG analysis can be found on GitHub (https://github.com/R-ODAF/Main/blob/main/scripts/R_ODAF_DEGs.R).

The General pipeline can be seen here in Figure 1. I conducted a differential expression analysis using the R-ODAF pipeline, then used the differentially expressed genes as input for the BMD analysis. For the BMD analysis, I did not perform William’s trend test to identify differentially expressed genes, because I use the R-ODAF DESeq2::DESeq() function to identify DEGs instead… the purpose of doing so was to facilitate a discussion of the differentially expressed genes from the R-ODAF pipeline and enriched gene sets from the R-ODAF pipeline. Since the R-ODAF DEG pipeline is less sensitive, it is less likely that DEGs are false-positive results. Thus, I can have more confidence in discussing enriched gene sets and differentially expressed genes identified from this analysis without worrying about false-positives. I reiterate that, the purpose of this section is to facilitate a discussion and not to identify points of departure, although it is possible to calculate tPODs using this method (which I have included below). One of the issues with calculating a tPOD from the R-ODAF DEG method is that it is less sensitive and thus I excluded this tPOD derivation method from the main body of my thesis and this section will end up in the supplementarty information.

Again to reiterate, I followed the R-ODAF pipeline all the way down to the Differential Expresion analysis part of the pipeline. I then used the table of differentially expressed genes in the benchmark dose analysis. I did not filter the DEGs with William’s trend test of a fold-change filter since DEGs have already been identified using the DESeq2::DESeq() function. I did however, keep the remaining filtering criteria downstream of the William’s trend test and fold-change filter.

Please see the following code chunk for the source code used for this specific data set

library(DESeq2)
library(edgeR)

norm_counts <- list()
norm_CPM_counts <- list()
all_DECounts_real <- list()

minCoverage <- 5000000 #reads
MinCount <- 1
pAdjValue <- 0.01

for(i in names(filter_list)) {
  Samp4compare <- metadata %>%
    filter(chemical == i) %>%
    filter(!dose == 0) %>%
    select(dose) %>%
    unique() %>%
    as.vector()
  names(Samp4compare) <- NULL
  Cont4compare <- metadata %>%
    filter(chemical == i) %>%
    filter(dose == 0) %>%
    select(dose) %>%
    unique() %>%
    as.vector()
  names(Cont4compare) <- NULL
  DESIGN <- "dose"
  
  sampleData <- filter_list[[i]]
  DESeqDesign <- metadata_list[[i]]
  #MAke sure these are in the exact same order!!!!!
  #all(rownames(DESeqDesign) %in% colnames(sampleData)) # Are all the row and coulm names matching?
  #all(rownames(DESeqDesign) == colnames(sampleData)) #Are they in the exact same order?
  if (all(rownames(DESeqDesign) == colnames(sampleData)) == FALSE) {
    print(
      "Metadata check: Rownames of DESeqDesign DO NOT MATCH the column names of sampleData!!!"
    )
    break
  } else {
    print("Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!")
  }
  
  sampleData[is.na(sampleData)] <- 0 #Replace NA with zero
  sampleData <-
    sampleData[, (colSums(sampleData) > minCoverage)] #Exclude samples with less than 5 million reads
  
  condition1 <- Cont4compare[[1]][1]
  condition2 <- Samp4compare[[1]]
  
  DE_Design <- matrix(data = NA, ncol = 2)
  DE_Design <- DESeqDesign %>%
    filter(dose %in% c(condition1, condition2))
  DE_Design$dose <-
    factor(DE_Design$dose,
           levels = c(sort(unique(DE_Design$dose))),
           ordered = FALSE)
  samples <- sampleData[, rownames(DE_Design)]
  
  ###########
  print(paste(condition2, " vs ", condition1))
  
  colnames(samples) <- NULL
  dds <-
    DESeqDataSetFromMatrix(
      countData = round(samples),
      colData = as.data.frame(DE_Design),
      design = as.formula(paste0("~", DESIGN[1]))
    )
  
  #Re-level the data
  dds$dose <- relevel(dds$dose, ref = "0")
  
  print("Wait... (dds step executing)")
  dds <- DESeq(dds, quiet = TRUE)
  
  print(paste0(
    "Filtering genes: 75% of at least 1 group need to be above ",
    MinCount,
    " CPM"
  ))
  
  #Filter low readcounts (genes not meeting the condition : at least one condition with 75% of the samples above 1 CPM)
  
  SampPerGroup <- table(DE_Design[, DESIGN])
  Counts <- counts(dds, normalized = TRUE)
  CPMdds <- cpm(counts(dds, normalized = TRUE))
  
  Filter <- matrix(data = NA,
                   ncol = 3,
                   nrow = nrow(Counts))
  rownames(Filter) <- rownames(Counts)
  colnames(Filter) <- c("Low", "quantile", "spike")
  
  # Apply the "Relevance" condition
  
  for (gene in 1:nrow(dds)) {
    CountsPass <- NULL
    for (group in 1:length(SampPerGroup)) {
      sampleCols <-
        grep(paste0("^", dimnames(SampPerGroup)[[1]][group], "$"), DE_Design[, DESIGN]) #Once again R-ODAF has a bug where the grep is not explicit enough... for example if the group is "0" i greps all dimnames that contain a zero... annoying
      Check <-
        sum(CPMdds[gene, sampleCols] >= MinCount) >= 0.75 * SampPerGroup[group]
      CountsPass <- c(CountsPass, Check)
    }
    
    if (sum(CountsPass) > 0) {
      Filter[gene, 1] <- 1
    }   else {
      Filter[gene, 1] <- 0
    }
    
  }
  
  
  compte <- Counts[Filter[, 1] == 1,]
  Filter <- Filter[rownames(Filter) %in% rownames(compte),]
  
  print(
    paste(
      "Relevance filtering removed ",
      nrow(dds) - nrow(Filter),
      " genes from the ",
      nrow(dds),
      " assessed. ",
      nrow(Filter),
      " genes remaining",
      sep = ""
    )
  )
  
  print("Obtaining the DESeq2 results")
  
  # compute the DEGs on the genes passing the Relevance condition
  
  res_list <- NULL
  FileName_list <- NULL
  DEsamples_list <- NULL
  DECounts_list <- NULL
  Filter_DEG_list <- NULL
  for (k in condition2) {
    res <-
      results(
        dds[rownames(compte),],
        alpha = pAdjValue,
        cooksCutoff = F,
        independentFiltering = F,
        contrast = c(DESIGN[1], k, condition1),
        pAdjustMethod = 'fdr'
      )
    res_list[[paste0(i, "_", k)]] <- res
    
    FileName <- paste(i, k, "vs", condition1, "FDR", pAdjValue, sep = "_")
    FileName_list[[paste0(i, "_", k)]] <- FileName
    
    DEsamples <- subset(res, res$padj < pAdjValue)
    DEsamples_list[[paste0(i, "_", k)]] <- DEsamples
    
    DECounts <-
      compte[rownames(compte) %in% rownames(DEsamples), , drop = FALSE] #R-ODAF has a bug where the rowname is dropped if only row is returned... Fixing that with a simple argument here
    DECounts_list[[paste0(i, "_", k)]] <- DECounts
    
    Filter_DEG <-
      Filter[rownames(Filter) %in% rownames(DECounts), , drop = FALSE]
    Filter_DEG_list[[paste0(i, "_", k)]] <- Filter_DEG
  }
  
  Filter_DEGs <- do.call(rbind, Filter_DEG_list)
  rows_to_keep <- unique(rownames(Filter_DEGs))
  Filter_DEGs <- Filter_DEGs[rows_to_keep,]
  
  all_DECounts <- do.call(rbind, DECounts_list)
  rows_to_keep <- unique(rownames(all_DECounts))
  all_DECounts <- all_DECounts[rows_to_keep,]
  
  all_DEsamples <- do.call(rbind, DEsamples_list)
  all_DEsamples$comparison <- rep(names(DEsamples_list), sapply(DEsamples_list, nrow))
  
  norm_data <- counts(dds[rownames(compte)], normalized = TRUE) #Normalized counts data filtered for only relevant genes!!
  
    print("Check median against third quantile")
    print("AND")
    print("Check the presence of a spike")
    for (j in 1:length(condition2)+1) {
      for (gene in 1:nrow(all_DECounts)) {
        # Check the median against third quantile
        quantilePass <- NULL
        sampleColsg1 <-
          grep(paste0("^", dimnames(SampPerGroup)[[1]][1], "$"), DE_Design[, DESIGN])
        sampleColsg2 <-
          grep(paste0("^", dimnames(SampPerGroup)[[1]][j], "$"), DE_Design[, DESIGN])
        
        Check <- median(all_DECounts[gene,sampleColsg1]) > quantile(all_DECounts[gene,sampleColsg2], 0.75)[[1]]
        quantilePass <-c(quantilePass, Check)
        Check <- median(all_DECounts[gene,sampleColsg2]) > quantile(all_DECounts[gene,sampleColsg1], 0.75)[[1]]
        quantilePass <-c(quantilePass, Check)
        
        if ( sum(quantilePass) > 0 ) {Filter_DEGs[gene,2] <- 1 }    else { Filter_DEGs[gene,2] <- 0 }
        
        # Check for spike 
        spikePass <- NULL
        for (group in 1:length(SampPerGroup)) { 
          sampleCols<-grep(paste0("^", dimnames(SampPerGroup)[[1]][group], "$"),DE_Design[,DESIGN])
          if (max(all_DECounts[gene,sampleCols]) ==0) {Check <- FALSE} else {
            Check <- (max(all_DECounts[gene,sampleCols])/sum(all_DECounts[gene,sampleCols])) >= 1.4*(SampPerGroup[group])^(-0.66)
            spikePass<-c(spikePass, Check)
          }
        }
        if ( sum(spikePass) > 1 ) {Filter_DEGs[gene,3] <- 0 }   else { Filter_DEGs[gene,3] <- 1 }
      }
    }
    DECounts_real <- all_DEsamples[rowSums(Filter_DEGs) == 3 ,]
    DECounts_no_quant <- all_DEsamples[Filter_DEGs[,2] == 0 ,]
    DECounts_spike <- all_DEsamples[Filter_DEGs[,3] == 0 ,]
    
    print(paste("A total of ",nrow(DECounts_real), " DEGs were selected, after ",nrow(DECounts_no_quant)," genes(s) removed by the quantile rule and ", nrow(DECounts_spike)," gene(s) with a spike",sep=""))
    
    write.table(norm_data,file=paste0(here::here(), "/RNAseqData/output/", i, "_R-ODAF_DESeq2_Norm_Data.txt"), sep="\t", quote=FALSE)
    write.table(DECounts_real,file=paste0(here::here(), "/RNAseqData/output/", i, "_R-ODAF_DESeq2_DEG_table.txt"), sep="\t", quote=FALSE)
}

ANILINE:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.001  vs  0" "0.01  vs  0"  "0.1  vs  0"   "1  vs  0"     "10  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 3158 genes from the 21989 assessed. 18831 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 6 DEGs were selected, after 0 genes(s) removed by the quantile rule and 0 gene(s) with a spike"

DECANOL:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.0001  vs  0" "0.001  vs  0"  "0.01  vs  0"   "0.1  vs  0"    "1  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 2983 genes from the 23274 assessed. 20291 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 401 DEGs were selected, after 2 genes(s) removed by the quantile rule and 0 gene(s) with a spike"

EE2:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.0000001  vs  0" "0.000001  vs  0"  "0.00001  vs  0"   "0.0001  vs  0"    "0.001  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 3390 genes from the 22235 assessed. 18845 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 174 DEGs were selected, after 28 genes(s) removed by the quantile rule and 1 gene(s) with a spike"

FADROZOLE:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.0001  vs  0" "0.001  vs  0"  "0.01  vs  0"   "0.1  vs  0"    "1  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 3077 genes from the 22090 assessed. 19013 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 49 DEGs were selected, after 6 genes(s) removed by the quantile rule and 2 gene(s) with a spike"

FENITROTHION:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.0001  vs  0" "0.001  vs  0"  "0.01  vs  0"   "0.1  vs  0"    "1  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 2835 genes from the 23280 assessed. 20445 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 1303 DEGs were selected, after 128 genes(s) removed by the quantile rule and 1 gene(s) with a spike"

GLYPHOSATE:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.0001  vs  0" "0.001  vs  0"  "0.01  vs  0"   "0.1  vs  0"    "1  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 3268 genes from the 22157 assessed. 18889 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 14 DEGs were selected, after 0 genes(s) removed by the quantile rule and 0 gene(s) with a spike"

MALATHION:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.0001  vs  0" "0.001  vs  0"  "0.01  vs  0"   "0.1  vs  0"    "1  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 2896 genes from the 23315 assessed. 20419 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 163 DEGs were selected, after 4 genes(s) removed by the quantile rule and 0 gene(s) with a spike"

PROCHLORAZ:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.0001  vs  0" "0.001  vs  0"  "0.01  vs  0"   "0.1  vs  0"    "1  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 2906 genes from the 23213 assessed. 20307 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 259 DEGs were selected, after 20 genes(s) removed by the quantile rule and 0 gene(s) with a spike"

TRENBOLONE:
[1] "Metadata check: Rownames of DESeqDesign MATCH column names of sampleData!"
[1] "0.00001  vs  0" "0.0001  vs  0"  "0.001  vs  0"   "0.01  vs  0"    "0.1  vs  0"
converting counts to integer mode
[1] "Wait... (dds step executing)"
[1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
[1] "Relevance filtering removed 2960 genes from the 23265 assessed. 20305 genes remaining"
[1] "Obtaining the DESeq2 results"
[1] "Check median against third quantile"
[1] "AND"
[1] "Check the presence of a spike"
[1] "A total of 72 DEGs were selected, after 0 genes(s) removed by the quantile rule and 1 gene(s) with a spike"

Following normalization and the ‘relevance’ filter, the DESeq2 R package was used to identify differentially expressed genes via the DESeq2::DESeq() function which applies Negative Binomial Generalized Linear Model (GLM) fitting and the Wald test for the GLM coefficients (Love et al., 2014). The model fit p-values were adjusted for multiple testing via the FDR adjustment (ɑ = 0.01). Genes with adjusted p-values greater than 0.01 were removed. Furthermore, a spurious spike in expression filter was applied.

Spurious Spike Threshold =1.4 (# of biological replicates)-0.66

If any samples exceeded the spurious spike threshold for a gene, the gene was removed. This function excludes genes with replicates representing more than 66% of the total read counts (with triplicate biological replicates). Spurious spikes in expression are defined as a single sample representing the majority of reads of a dose for a gene (Verheijen et al., 2022). Finally, a 3rd quartile rule filter was applied. It is common for DEG analyses to apply a fold-change filter. However, a fold-change filter can lead to the exclusion of genes that are differentially expressed, but have low expression levels. To replace the fold-change filter, the 3rd quartile rule can be used to remove genes from the gene list for which the median of a dose group is not higher than the 3rd quartile of the control group (Verheijen et al., 2022). The list of unique genes from all dose group comparisons (5 comparisons: three controls vs three dose 5, three control vs three dose 1) remaining after the before mentioned filters were considered DEGs responsive to treatment. Figure 2 shows the number of DEGs per chemical, grouped by dose, in a bar plot. The bar plots showing the number of DEGs in each dose group were made o to ensure that DEGs were dose-responseive and monotonic (Figure 2).

#Plot
multiplot(plotlist = p_list,
          layout = matrix(
            c(1:length(p_list), NA, NA),
            nrow = 3,
            ncol = 3,
            byrow = TRUE
          ))

A list of all the DEGs identified from the R-ODAF DEG analysis can be found in the following table (Table 1)… It can be exported to a csv file if you are interested in downloading the data.

#Pull the DEG tables for supplementary Information
all_DEGs <- plyr::ldply(DEGs_list) %>%
  mutate(gene = str_split(gene, pattern = "[.]", simplify = TRUE)[, 1]) %>%
  mutate(comparison = str_split(comparison, pattern = "_", simplify = TRUE)[, 2])
datatable(
  all_DEGs,
  rownames = FALSE,
  extensions = 'Buttons',
  options = list(
    dom = 'Blfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
    lengthMenu = list(c(10, 25, 50, -1),
                      c(10, 25, 50, "All"))
  )
)

#NTP Dose-response Analysis - Using the DEGs from the R-ODAF Differential Expresion Analysis pipeline

knitr::include_graphics("Plots/R-ODAF_DEGs/R-ODAF_BMD_Histogram_small.svg")

A list of the filtered DEGs for the benchmark dose analysis can be found in the following table (Table 2)

#Pull the filtered DEG tables for supplementary Information
filtered_DEGs <- read_csv("All_Chemicals_BMD_Data_for_Histogram_R-ODAF_DEGs.csv")

## Rows: 572 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): .id, Probe.Id
## dbl (5): BMD, BMDL, BMDU, fitPValue, logBMD
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

datatable(
  filtered_DEGs,
  rownames = FALSE,
  extensions = 'Buttons',
  options = list(
    dom = 'Blfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
    lengthMenu = list(c(10, 25, 50, -1),
                      c(10, 25, 50, "All"))
  )
)

knitr::include_graphics("Plots/R-ODAF_DEGs/R-ODAF_GO_Plot_small.svg")

A table of the enriched GO terms can be found in the following table (Table 3)…

#Pull the enriched GO terms for supplementary Information
GO_terms <- read_csv("All_Chemicals_GO_Data_R-ODAF_DEGs.csv")

## Rows: 366 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): .id, GO.Pathway.Gene.Set.Gene.ID, GO.Pathway.Gene.Set.Gene.Name, BM...
## dbl (9): GO.Level, All.Genes..Expression.Data., Genes.That.Passed.All.Filter...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

datatable(
  GO_terms,
  rownames = FALSE,
  extensions = 'Buttons',
  options = list(
    dom = 'Blfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
    lengthMenu = list(c(10, 25, 50, -1),
                      c(10, 25, 50, "All"))
  )
)

knitr::include_graphics("Plots/R-ODAF_DEGs/R-ODAF_REACTOME_Plot_small.svg")

A table of all the enriched REACTOME pathways can be found in the following table (Table 4)…

#Pull the enriched REACTOME pathways for supplementary Information
REACTOME_paths <- read_csv("All_Chemicals_Reactome_Data_R-ODAF_DEGs.csv")

## Rows: 33 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): .id, GO.Pathway.Gene.Set.Gene.ID, GO.Pathway.Gene.Set.Gene.Name, B...
## dbl (10): All.Genes..Expression.Data., Genes.That.Passed.All.Filters, Fisher...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

datatable(
  REACTOME_paths,
  rownames = FALSE,
  extensions = 'Buttons',
  options = list(
    dom = 'Blfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
    lengthMenu = list(c(10, 25, 50, -1),
                      c(10, 25, 50, "All"))
  )
)

R-ODAF DEG Analysis

Jory Curry

24 July 2023