Compare time-course GCN across tissues (Mosquito)

Last updated on 2025-10-30.

Show code

library(dplyr)
library(dbplyr)
library(ggplot2)
for (i in list.files(here::here("R"), full.names = TRUE)) {
  source(i)
}

# SAMPLE NAME
## specify sample name
sample.names <- c(
  # # dmel
  # "dmel-head",
  # "dmel-body",
  # "dmel-heart"
  # # mmus
  # "mmus-brain_stem",
  # "mmus-cerebellum",
  # "mmus-hypothalamus",
  # "mmus-heart"
  # # panu
  # "panu-brain_stem",
  # "panu-cerebellum",
  # "panu-hypothalamus",
  # "panu-heart",
  # "panu-scn"
  # agam
  "agam-head",
  "agam-body"
)
# sample.cycles <- c("LD", "DD")

## SPECIFY THE DATASET TO BUILD GCN WITH
ref.sample <- "agam-head"

writeLines(
  glue::glue("Reference tissue: {ref.sample}")
)

Reference tissue: agam-head

Prep data

Expected format: one row per gene, one col per sample

Structure of reference tissue data:

ONE-SHOT (make_modules)

Make modules in one call

Show code

mods <- timecourseRnaseq::make_modules(
  data = tidydata.db[[ref.sample]],
  log2 = TRUE,
  id_column = "gene_name",
  min_expression = NULL,          # automatically estimated
  min_timepoints = NULL,          # automatically estimated
  method = "wgcna",
  qc = TRUE,
  sim_method = "kendall",
  soft_power = NULL,              # automatically estimated
  min_module_size = 50,
  max_modules = 16,
  merge_cutoff_similarity = 0.9,
  plot_network = FALSE,
  # plot_network_min_edge = 0.5,  # used when `plot_network == TRUE`
  tidy_modules = TRUE
  )

---------------------------------------------------
1. Log2-transform and subset 
---------------------------------------------------
Applying log2-transformation...Done.
Estimated min_expression = 3 
Estimated min_timepoints = 9 
Subsetting data...Done.
[ NOTE ]: After subsetting, 3819 of 11269 rows remain.

 Flagging genes and samples with too many missing values...
  ..step 1
All okay!Visualizing the log-transformed data
---------------------------------------------------
2. Calculate similarity of expression 
---------------------------------------------------
Running gene-gene similarity...Done.


---------------------------------------------------
3. Create adjacency matrix 
---------------------------------------------------
Performing network topology analysis to pick
  soft-thresholding power...
   Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
1      1   0.5890  2.350          0.954 1320.00   1350.00 1840.0
2      2   0.1500  0.441          0.867  654.00    659.00 1150.0
3      3   0.0784 -0.220          0.815  382.00    371.00  807.0
4      4   0.4330 -0.559          0.843  246.00    227.00  607.0
5      5   0.6480 -0.752          0.906  169.00    146.00  476.0
6      6   0.7340 -0.883          0.920  122.00     98.70  385.0
7      7   0.8060 -0.970          0.940   91.20     68.80  318.0
8      8   0.8520 -1.020          0.957   70.10     49.00  267.0
9      9   0.8760 -1.060          0.965   55.10     35.80  228.0
10    10   0.8990 -1.090          0.977   44.20     26.70  198.0
11    12   0.9030 -1.170          0.973   29.70     15.60  154.0
12    15   0.9170 -1.240          0.982   17.90      7.56  113.0
13    18   0.9100 -1.330          0.974   11.60      4.03   86.9
14    21   0.9150 -1.360          0.973    7.92      2.35   68.7

Plotting resutls from the network topology analysis...

Done.
[ NOTE, FIGURE ]: Red horizontal line indicates a signed R^2 of 0.9

Setting soft-thresholding power to: 8.
Power-transforming the gene-gene similarity matrix...Done.

---------------------------------------------------
4. Convert into topological overlap matrix (dissTOM) 
---------------------------------------------------
Creating dissTOM...Done.
Performing hierarchical clustering on dissTOM...Done.

---------------------------------------------------
5. Identify modules (clusters) 
---------------------------------------------------
Merging modules that have a correlation ≥ 0.9 ...Done.
[ NOTE, FIGURE ] Plotting identified clusters before and after merging.

Module (cluster) size:
mergedColors
       black         blue        brown         cyan        green  greenyellow 
         299          389          354           79          327          111 
   lightcyan      magenta midnightblue         pink       purple          red 
          57          244           77          246          148          305 
      salmon          tan    turquoise       yellow 
          80          101          653          349 

Cutoff used: 0.9 
Number of modules identified: 16 

Calculating module-module similarity based
  on module-eigengene-expression...Done.
Tidying module names...Done.
Plotting adjacency matrix for module-module similarity...

---------------------------------------------------
6. Tidy modules (clusters) 
---------------------------------------------------

Show code

# saveRDS(
#   mods,
#   here::here(
#     glue::glue(
#       "result/tissue-comparison/ref_{ref.sample}timecourse-gcn.rds"
#     )
#   )
# )

Modify network plot

Internal function; use ::: to call

Show code

timecourseRnaseq:::plot_adj_as_network(
  matrix = mods[["adj_matrix_ME"]][["ME"]],
  # layout = igraph::layout.sugiyama,         # default view
  layout = igraph::layout_in_circle,          # changed
  min_edge = 0.6,
  node_label_size = 1.2,
  node_size = 30,
  edge_size = 3,
  node_frame_col = "grey20",
  node_fill_col = "grey80",
  vertex.frame.width = 3
)

Visualizing a simplified representation of the circadian GCN

Annotate the network

Identify rhythmic modules

Show code

# Obtain a list of genes in each module
l_module_genes <- mods[["module_genes"]] |> 
  arrange(module_identity) |> 
  group_split(module_identity) |> 
  purrr::map(
    ~ .x |> pull(gene_name)
  ) |> 
  setNames(unique(mods[["module_genes"]]$module_identity))

# Load results of rhythmicity analyses
db_rhy <- purrr::map(
  sample.names,
  ~ load_rhy_genes(
      sample = .x
    )
) |> 
  setNames(sample.names)

# Set your p-value of choice
###-###-###-###-###-###-###-###-
# col_pval = "BH.Q"
col_pval = "default.pvalue"
# col_pval = "raw.pvalue"
###-###-###-###-###-###-###-###-

# Obtain a list of rhythmic genes in each tissue
l_rhy_genes <- purrr::map(
  sample.names,
  ~ db_rhy[[.x]] |> 
    purrr::map(
      ~ .x |> 
        filter(
          if_all(
            all_of(col_pval),
            ~ .x < 0.05
          )
        ) |> 
        filter(
          ID %in% unlist(l_module_genes)
        ) |> 
        pull(1) |> 
        unique()
    ) |> 
    purrr::compact()
) |> 
  setNames(sample.names)

Modules vs. Rhythmic genes

Note

Show code

# LIST ONE - WGCNA modules
list1 <- l_module_genes
sapply(list1, length) |> print()

 C1  C2  C3  C4  C5  C6  C7  C8  C9 C10 C11 C12 C13 C14 C15 C16 
 79 246 299  57 354 349  77 148 653 111  80 244 305 327 101 389

Show code

trash <- purrr::map(
  sample.names,
  function(x) {
    cat("Tissue:", x, "\n")
    
    ## LIST TWO - rhythmic genes
    list2 <- l_rhy_genes[[x]]
    sapply(list2, length) |> print()
    
    ## CHECK FOR OVERLAP
    # define size of genome
    size = length(unique(c(unlist(list1), unlist(list2))))
    # make a GOM object
    gom.1v2 <- GeneOverlap::newGOM(
      list2, 
      list1,
      genome.size = size
    )
    # GeneOverlap::drawHeatmap(
    #   gom.1v2,
    #   # adj.p = TRUE,
    #   # cutoff = 0.05,
    #   # what = "odds.ratio",
    #   # # what="Jaccard",
    #   # log.scale = T,
    #   # note.col = "black",
    #   grid.col = "Oranges"
    # )
    
    gom.1v2
  }
) |> 
  setNames(sample.names)

Tissue: agam-head 
      ARS    empJTK GeneCycle       JTK        LS    meta2d      RAIN 
     2074      1978      1665      1719      1093      1636      2265 
Tissue: agam-body 
      ARS    empJTK GeneCycle       JTK        LS    meta2d      RAIN 
     1174      1606      1420      1333       607      1210      2151

Summary figure

Show code

odds_ratio <- purrr::map_dfr(
  sample.names,
  ~ trash[[.x]] |> 
    GeneOverlap::getMatrix(
      "odds.ratio"
    ) |> 
    as.data.frame() |> 
    tibble::rownames_to_column("test") |> 
    tidyr::pivot_longer(
      cols = !test,
      names_to = "modules",
      values_to = "odds.ratio"
    ) |> 
    mutate(
      tissue = .x,
      .before = 1
    )
)

pvalues <- purrr::map_dfr(
  sample.names,
  ~ trash[[.x]] |> 
    GeneOverlap::getMatrix(
      "pval"
    ) |> 
    as.data.frame() |> 
    tibble::rownames_to_column("test") |> 
    tidyr::pivot_longer(
      cols = !test,
      names_to = "modules",
      values_to = "pval"
    ) |> 
    mutate(
      tissue = .x,
      .before = 1
    )
)

rhy_genes <- purrr::map_dfr(
  sample.names,
  function (x) {
    algo <- names(l_rhy_genes[[x]])
    tibble(
      tissue = x,
      test = algo,
      n_rhy_genes = sapply(l_rhy_genes[[x]], length)
    )
  }
)

left_join(
  odds_ratio,
  pvalues,
  join_by(test, modules, tissue)
) |> 
  left_join(
    rhy_genes,
    join_by(tissue, test)
  ) |> 
  # clean data
  filter(
    n_rhy_genes > 40
  ) |> 
  # check significance
  mutate(
    sig = if_else(
      pval < 0.05,
      TRUE,
      FALSE
    )
  ) |> 
  # STEP ONE
  group_by(tissue, modules) |> 
  reframe(
    prop_sig = sum(sig) / n(),
    tot_algo = n()
  ) |> 
  # STEP TWO
  filter(
    tot_algo > 3 &
      prop_sig > 3/4
  ) |> 
  # TRANSFORM / VISUALIZE
  select(- tot_algo) |> 
  tidyr::pivot_wider(
    names_from = modules,
    values_from = prop_sig
  ) |> 
  mutate(
    across(
      !tissue,
      ~ if_else(
        is.na(.x), 
        "-", 
        paste0(
          round(.x, 2)*100,
          "%"
        )
      )
    )
  ) |> 
  select(
    tissue,
    any_of(
      paste0("C", 1:20)
    )
  ) |> 
  kableExtra::kable()

tissue	C2	C5	C6	C9	C10	C11	C13	C16
agam-body	-	100%	-	86%	86%	-	-	-
agam-head	100%	100%	100%	-	100%	100%	100%	100%

Module preservation

Prep data

Show code

# tidydata.db |> 
#   purrr::map_int(
#     ~ nrow(.x)
#   )

# We work with two sets:
nSets = 2;

# Find the common set of genes across all samples
common_genes <- tidydata.db |> 
  purrr::map(
    ~ .x |> 
      pull(gene_name) |> 
      unique()
  ) |> 
  purrr::reduce(
    intersect
  ) |> 
  intersect(
    mods[["module_genes"]]$gene_name
  ) 

# filter to keep only genes used in creating REF network
samples.list <- tidydata.db |> 
  purrr::map(
    ~ .x |> 
      filter(
        gene_name %in% common_genes
      )
  )

test_samples <- setdiff(sample.names, ref.sample)

# OPTION 1: Read previous run from memory
l_mp <- readRDS(
  here::here(
    glue::glue(
      "result/tissue-comparison/ref_{ref.sample}_module-preservation.rds"
    )
  )
)

# # OPTION 2: Run module preservation for each tissue pair (Takes time)
# l_mp <- purrr::map(
#   test_samples,
#   function (x) {
#     multiExpr <- prep_data_module_preservation(
#       data = samples.list,
#       ref = ref.sample,
#       test = x
#     )
# 
#     # CHECKS
#     #---#---#---#---#---#---#---#---#---#---#---#---
#     # Check that the data has the correct format for many functions
#     # operating on multiple sets:
#     exprSize = WGCNA::checkSets(multiExpr)
#     # Check that all genes and samples have sufficiently low numbers of
#     # missing values.
#     gsg = WGCNA::goodSamplesGenesMS(multiExpr, verbose = 1);
#     cat("All samples okay?\n", gsg$allOK, "\n")
#     #---#---#---#---#---#---#---#---#---#---#---#---
# 
#     # prep data
#     #---#---#---#---#---#---#---#---#---#---#---#---
#     # Expression data
#     multiExpr_1 = list(
#       ref = list(data = multiExpr[[1]]$data),
#       test = list(data = multiExpr[[2]]$data)
#     )
#     ## filter to keep only the genes that we are working with
#     mod.identity <- mods[["module_genes"]] |>
#       select(
#         gene_name,
#         old_labels
#       ) |>
#       filter(
#         gene_name %in% common_genes
#       ) |>
#       # !!this step is necessary!! #
#       arrange(gene_name)
#     ## specify the module identity of the genes
#     moduleColors <- mod.identity |>  pull(old_labels)
#     multiColor = list(ref = moduleColors)
# 
#     # Run module preservation
#     #---#---#---#---#---#---#---#---#---#---#---#---
#     mp = WGCNA::modulePreservation(
#       multiExpr_1,
#       multiColor,
#       referenceNetworks = 1,
#       nPermutations = 100,
#       calculateQvalue = FALSE,
#       quickCor = 0,
#       verbose = 1
#     )
# 
#     mp
#   }
# )
# 
# saveRDS(
#   l_mp,
#   here::here(
#     glue::glue(
#       "result/tissue-comparison/ref_{ref.sample}_module-preservation.rds"
#     )
#   )
# )

Plot results

Show code

l_zsummary <- purrr::map2(
  l_mp,
  test_samples,
  function (mp, y) {
    ref = 1
    test = 2
    statsObs = cbind(
      mp$quality$observed[[ref]][[test]], 
      mp$preservation$observed[[ref]][[test]]
    )
    statsZ = cbind(
      mp$quality$Z[[ref]][[test]][,-1], 
      mp$preservation$Z[[ref]][[test]][,-1]
    )
    
    # Compare preservation to quality:
    z.stats <- cbind(
      statsObs[, c("moduleSize", "medianRank.pres", "medianRank.qual")],
      signif(statsZ[, c("Zsummary.pres", "Zsummary.qual")], 2)
    )
    
    mains = c(
      "Preservation Median rank", 
      "Preservation Zsummary"
    )
    pd <- position_dodge(0.5)
    
    # Plot
    out <- z.stats |> 
      tibble::rownames_to_column("old_labels") |> 
      left_join(
        distinct(mods[["module_genes"]][-1]), 
        by="old_labels"
      ) |> 
      select(
        module_name = module_identity,
        module_size = moduleSize,
        everything()
      ) |> 
      filter(
        !is.na(module_name)
      ) |>
      mutate(
        module_name = factor(
          module_name, 
          levels = unique(mods[["module_genes"]]$module_identity)
        )
      )
    
    ## PLOT PRESERVATION Z-SUMMARY
    p <- out |> 
      ggplot(aes(x=log10(module_size), y=Zsummary.pres)) +
      geom_hline(yintercept = c(2,10), col="darkred", alpha=0.5) +
      geom_point(
        alpha=0.5, 
        size=20, 
        col="lightgrey",
        position = pd
      ) +
      geom_text(
        aes(label=module_name),
        check_overlap = TRUE,
        size = 5
      ) +
      theme_bw(20) +
      scale_x_continuous(limits = c(0,max(log10(1000))+0.5),
                         breaks = c(0,1,2,3),
                         labels = c("0","10","100","1000")) +
      xlab("module size (genes)") +
      ylab(mains[2]) +
      ggtitle(paste0("Test: ", y))
    
    print(p)

    out
  }
) |> 
  setNames(
    test_samples
  )

Summary

Show code

z_summ <- purrr::map_dfr(
  test_samples,
  ~ l_zsummary[[.x]] |> 
    select(
      modules = module_name,
      zsumm = Zsummary.pres
    ) |> 
    mutate(
      # ref = which.sample,
      tissue = .x,
      .before = 1
    )
) |> 
  as_tibble()

z_summ |> 
  filter(
    zsumm > 2
  ) |> 
  arrange(
    desc(zsumm)
  ) |> 
  head(6) |> 
  tidyr::pivot_wider(
    names_from = modules,
    values_from = zsumm
  ) |> 
  mutate(
    across(
      !tissue,
      ~ if_else(
        is.na(.x), 
        "-", 
        round(.x, 2) |> as.character()
      )
    )
  ) |> 
  select(
    tissue,
    any_of(
      paste0("C", 1:20)
    )
  ) |> 
  kableExtra::kable()

tissue	C1	C2	C7	C8	C9	C12
agam-body	5.1	8.8	7	4.2	5	3.9

Network statistics

From WGCNA-tutorial

Intramodular connectivity

“We begin by calculating the intramodular connectivity for each gene. (In network literature, connectivity is often referred to as”degree”.) The function intramodularConnectivity computes:

the whole network connectivity kTotal,
the within (intra)module connectivity kWithin,
the extra-modular connectivity kOut=kTotal-kWithin, and
the difference of the intra- and extra-modular connectivities kDiff = kIn - kOut = 2*kIN-kTotal

Show code

# From what I can tell, colorh1 in the tutorial refers to moduleColors
colorh1 <- mods[["modules"]]$colors
adj_matrix <- mods[["adj_matrix"]]

# Calculate the connectivities of the genes
Alldegrees1 = WGCNA::intramodularConnectivity(
  adjMat = adj_matrix,
  colors = colorh1
) |>
  mutate(
    gene_name = rownames(adj_matrix),
    across(
      matches("^k"),
      ~ round(.x, 2)
    )
  )

Plotting the mean (± 95% CI) connectivity of genes in different modules

Show code

pd <- position_dodge(0.1)

# which_var <- "kTotal"
which_var <- c("kTotal", "kWithin", "kOut", "kDiff")

Alldegrees1 |>
  # rownames_to_column("gene_name") %>%
  left_join(
    mods[["module_genes"]],
    join_by(gene_name)
  ) |>
  # PLOT FROM RAW DATA
  mutate(
    module_identity = factor(
      module_identity,
      levels = paste0(
        "C",
        sort(
          unique(mods[["module_genes"]]$module_identity) |>
            stringr::str_replace("C", "") |>
            as.integer()
        )
      ) |> rev()
    )
  ) |>
  summarySE(
    measurevar = which_var,
    groupvars = "module_identity"
  ) |>
  mutate(
    type = factor(
      type,
      levels = which_var
    )
  ) |>
  # Plot
  ggplot(
    aes(
      y = module_identity,
      x = mean,
      group = interaction(module_identity, type)
    )
  ) +
  geom_vline(xintercept = 0, col = "maroon", alpha = 0.7, lwd =1.2) +
  labs(
    y = "",
    x = glue::glue(
      "Connectivity"
    ),
    title = ""
  ) +
  ## Add error bar here
  geom_errorbar(
    aes(xmin = mean-ci, xmax = mean+ci),
    width = 0.3,
    position=pd,
    lwd = 1.3,
    col="black",
    alpha = 1
  ) +
  # Add the points on top of the error bars
  geom_point(
    position = pd,
    size = 3,
    col = "black",
    fill = "orange",
    show.legend = F,
    pch = 21,
    alpha = 0.9
  ) +
  facet_wrap(
    ~ type,
    nrow = 2,
    scales = "free_x"
  ) +
  scale_x_continuous(
    n.breaks = 4
  ) +
  theme_bw(25) +
  # scale_color_manual(values=c("#F20505","#F5D736","#0FBF67")) +
  theme(
    legend.position = "none"
  )