Build gene co-expression network (GCN) from time-course gene expression data

Last updated on 2025-10-30.

Show code

library(dplyr)
library(dbplyr)
library(ggplot2)
for (i in list.files(here::here("R"), full.names = TRUE)) {
  source(i)
}

# SAMPLE NAME
## specify sample name
sample.names <- c(
  # dmel
  "dmel-head",
  # mmus
  "mmus-brain_stem", 
  # panu
  "panu-hypothalamus"
)
# sample.cycles <- c("LD", "DD")

## SPECIFY THE DATASET TO BUILD GCN WITH
which.sample <- sample.names[1]

writeLines(
  glue::glue("Sample: {which.sample}")
)

Sample: dmel-head

(INT) Create databases

Show code

data.db <- load_data(
  sample_names = sample.names
)

cat("Structure of input data:")

Structure of input data:

Show code

data.db[[which.sample]] |> 
  head(5) |> 
  mutate_if(
    is.numeric,
    ~ round(.x, 1)
  ) |> 
  DT::datatable(
    caption = "Column 1: Name of feature (gene), Columns 2-N: Timepoints, Value: Expression / Activity",
    rownames = FALSE
  )

Show code

# cat("Here are the tables, in each database.")
# purrr::map(
#   data.db,
#   ~ src_dbi(.x)
# )

ONE-SHOT (make_modules)

Input data

|gene_name   |  ZT00|  ZT03|  ZT06|  ZT09|  ZT12|  ZT15|  ZT18|  ZT21|
|:-----------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
|FBgn0000008 | 4.852| 5.215| 4.281| 9.320| 7.119| 2.778| 7.372| 5.757|
|FBgn0000008 | 4.663| 5.024| 4.115| 8.958| 6.904| 2.769| 7.086| 5.533|
|FBgn0000014 | 0.039| 0.072| 3.509| 0.000| 0.056| 0.000| 0.016| 0.358|
|FBgn0000015 | 0.000| 0.015| 4.928| 0.000| 0.000| 0.135| 0.000| 1.598|

Make and plot modules

Show code

mods <- timecourseRnaseq::make_modules(
  # data = data.db[[which.sample]],
  data = data.db[[3]],
  log2 = TRUE,
  id_column = "gene_name",
  min_expression = NULL,  # automatically estimated
  min_timepoints = NULL,  # automatically estimated
  method = "wgcna",
  qc = FALSE,
  sim_method = "kendall",
  soft_power = NULL,      # automatically estimated
  min_module_size = 50,
  max_modules = 15,
  merge_cutoff_similarity = 0.9,
  plot_network_min_edge = 0.5,
  plot_network = TRUE,
  tidy_modules = FALSE
  )

mods |> str(max.level = 1)

Modify network plot

Show code

plot_adj_as_network(
  matrix = mods[["adj_matrix_ME"]][["ME"]],
  # layout = igraph::layout.sugiyama,
  layout = igraph::layout_in_circle, # changed layout
  min_edge = 0.6,
  node_label_size = 1.2,
  node_size = 20,
  edge_size = 3,
  node_frame_col = "grey20",
  node_fill_col = "grey80",
  vertex.frame.width = 3
)

STEP-BY-STEP

(PREP) Log-transform data

Show code

# Define the function
dat <- log2_transform_data(
  data = data.db[[which.sample]],
  id_column = "gene_name"
)

Applying log2-transformation...Done.

(SUBSET) Filter data

Show code

# Default: min_expression and min_timepoints
min_expression <- dat |> 
  select(-gene_name) |> 
  summarize(across(everything(), ~ mean(.x, na.rm = TRUE))) |> 
  rowMeans() |> 
  ceiling()

min_timepoints <- ceiling(ncol(dat)*(2/3))

Show code

# Which genes are expressed throughout the day in forager heads?
  # count the number of time points that has ≥ 1 FPKM
  # subset the data and only keep the filtered genes
  
dat_sub <- dat |> 
  subset_data(
    min_expression = min_expression,
    min_timepoints = min_timepoints,
    id_column = "gene_name"
  )

Subsetting data...Done.
[ NOTE ]: After subsetting, 5568 of 12451 rows remain.

Show code

writeLines("Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]")

Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]

Show code

dim(dat[-1])

[1] 12451     8

(SUMMARIZE) One series per gene

Show code

dat_sub_unique <- dat_sub |>
  group_by(gene_name) |> 
  reframe(
    across(
      everything(),
      ~ mean(.x, na.rm = TRUE)
    )
  )

This is our cleaned, input data file for building the circadian GCN.

(WGCNA) Format data

transpose the dataset: rows = timepoints, cols = genes

Show code

datExpr <- dat_sub_unique %>% 
  transpose_data()

QC

Show code

datExpr %>%
  check_sample_quality()

 Flagging genes and samples with too many missing values...
  ..step 1
All okay!

Show code

datExpr %>% 
  plot_sample_expression()

Visualizing the log-transformed data

Calculate gene-gene similarity

Show code

# Calculate Kendall's tau-b correlation for each gene-gene pair
sim_matrix <- calculate_gene_gene_sim(
  data = datExpr,
  name = which.sample,
  cache = FALSE
)

Running gene-gene similarity...Done.

QC

Show code

cat("Before power transformation:")

Before power transformation:

Show code

plot_sim_matrix(
  matrix = sim_matrix
)

Plotting a chunk of the gene-gene similarity matrix with 200 genes.

Create adjacency matrix

USER INPUT REQUIRED —-

To create the adjacency matrix, we need to first identify the soft-thresholding power (see WGCNA for more info).

Show code

sft <- analyze_network_topology(
  data = datExpr
)

Performing network topology analysis to pick 
  soft-thresholding power...
   Power SFT.R.sq    slope truncated.R.sq mean.k. median.k. max.k.
1      1 0.968000  2.93000          0.959    1650      1730   2060
2      2 0.951000  1.24000          0.937    1120      1180   1620
3      3 0.918000  0.65200          0.896     851       879   1360
4      4 0.878000  0.33300          0.849     683       691   1190
5      5 0.650000  0.13500          0.578     569       572   1060
6      6 0.000434 -0.00168         -0.237     486       483    966
7      7 0.638000 -0.10600          0.549     422       416    887
8      8 0.775000 -0.19400          0.710     373       364    822
9      9 0.851000 -0.26000          0.808     333       322    766
10    10 0.879000 -0.30900          0.846     300       287    718
11    12 0.882000 -0.39800          0.852     248       230    638
12    15 0.886000 -0.49000          0.870     195       171    550
13    18 0.886000 -0.56000          0.888     159       134    483
14    21 0.883000 -0.62100          0.892     133       106    431

Plotting resutls from the network topology analysis...

Done.
[ NOTE, FIGURE ]: Red horizontal line indicates a signed R^2 of 0.9

NOTE: The scale-free topology fit index reaches close to 1 (red horizontal line) at a soft-thresholding-power of 1.

Show code

my_estimate <- sft$fitIndices |> 
  filter(
    SFT.R.sq > 0.8 &
      SFT.R.sq <= 0.9,
    Power > 6 &
      Power < 14
  ) |> 
  arrange(
    Power
  ) |> 
  head(1) |> 
  pull(1)

Now, we can go ahead and create our adjacency matrix by power-transforming the similarity matrix (see WGCNA for more info).

Show code

## Specify the soft-thresholding-power
soft.power = if_else(
  sft$powerEstimate < 16,
  max(sft$powerEstimate, my_estimate),
  my_estimate
)

cat("Selected soft-power:", soft.power)

Selected soft-power: 9

Show code

# Construct adjacency matrix
# TO DO: make function ----
adj_matrix <- WGCNA::adjacency.fromSimilarity(
  sim_matrix,
  power = soft.power,
  type = "signed"
) |> 
  as.matrix()

cat("After power transformation:")

After power transformation:

Show code

plot_sim_matrix(
  matrix = adj_matrix
)

Plotting a chunk of the gene-gene similarity matrix with 200 genes.

Identify gene clusters

The following steps are performed as per guidelines from the WGCNA package and several tutorials made available online.

2.1 Create topological overalp matrix

Show code

# Turn adjacency into topological overlap
dissTOM = 1 - WGCNA::TOMsimilarity(adj_matrix)

..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.

Show code

# Call the hierarchical clustering function
geneTree = perform_hclust(
  data = dissTOM
)

2.2 Identify clusters

User defined parameters:

minimum size (number of genes) of modules

We like large modules, so we set the minimum module size relatively high (50).

Show code

minModuleSize = 50
# Module identification using dynamic tree cut:
modules <- create_modules(
  tree = geneTree,
  dissTOM = dissTOM,
  data = datExpr,
  min_module_size = minModuleSize
)

Merging modules that have a correlation ≥ 0.9 ...Done.
[ NOTE, FIGURE ] Plotting identified clusters before and after merging.

Module (cluster) size:
mergedColors
        black          blue         brown     darkgreen darkturquoise 
          150           210           185           260           130 
        green        grey60     lightcyan    lightgreen   lightyellow 
          155           990           238           314           213 
 midnightblue     royalblue 
          118           198

2.3 Merge similar modules

User defined parameters:

minimum correlation between two modules above which they are merged into one | var-name: MEDissThres

Show code

cutoff <- 0.8

merge = create_modules(
  tree = geneTree,
  dissTOM = dissTOM,
  min_module_size = minModuleSize,
  merge_cutoff_similarity = 0.8, # 70 % similarity
  data = datExpr
)

Merging modules that have a correlation ≥ 0.8 ...Done.
[ NOTE, FIGURE ] Plotting identified clusters before and after merging.

Module (cluster) size:
mergedColors
       black       grey60   lightgreen  lightyellow midnightblue 
         503         1250          654          636          118

Show code

n_modules <- ncol(merge$modules$newMEs)

while (n_modules > 15) {
  cutoff <- cutoff - 0.05
  merge <- create_modules(
    tree = geneTree,
    dissTOM = dissTOM,
    min_module_size = minModuleSize,
    merge_cutoff_similarity = cutoff, # 70 % similarity
    data = datExpr
  )
  n_modules <- ncol(merge$modules$newMEs)
}

cat("Cutoff:", cutoff)

Cutoff: 0.8

Show code

cat("Number of modules:", n_modules)

Number of modules: 5

2.4 Calculate module-module similarity

Show code

adj_matrix_ME <- calculate_module_module_sim(
  merged_modules = merge[["modules"]]
)

Calculating module-module similarity based 
  on module-eigengene-expression...Done.
Tidying module names...Done.
Plotting adjacency matrix for module-module similarity...

Done.

2.5 Visualize the network

Show code

plot_adj_as_network(
  layout = igraph::layout_in_circle,
  matrix = adj_matrix_ME$ME,
  min_edge = 0.5,
  node_label_size = 1.2,
  node_size = 35,
  edge_size = 5,
  node_frame_col = "grey20",
  node_fill_col = "grey80",
  vertex.frame.width = 4
)

Visualizing a simplified representation of the circadian GCN

Save module identity

Obtain a list of genes in each module

Show code

module_genes <- tidy_modules(
  merged_modules = merge[["colors"]],
  mapping_tbl = adj_matrix_ME$mapping_tbl,
  data = datExpr
)

# saveRDS(
#   module_genes,
#   here::here(
#     glue::glue(
#       "./data/tmp/{which.sample}_module_genes.RDS"
#     )
#   )
# )

# TO DO:
# Save the `dat_module_gene` as a database table in the respective database.

Compare GCN vs. ants

Show code

genes_across_species <- module_genes |> 
  convert_id(
    from = "mouse",
    to = c("cflo", "drosophila")
  )

genes_across_species |> 
  group_by(gene_name, module_identity) |> 
  reframe(
    dmel = length(
      unique(
        na.omit(drosophila_ID)
      )
    ),
    cflo = length(
      unique(
        na.omit(cflo_ID)
      )
    )
  ) |> 
  arrange(
    module_identity,
    desc(dmel),
    desc(cflo)
  ) |> 
  mutate(
    across(
      c(dmel, cflo),
      ~ case_when(
        .x > 1 ~ "multi",
        .x == 1 ~ "single",
        .x == 0 ~ "x",
        .default = NA
      ) |> 
        factor(
          levels = c("multi", "single", "x")
        )
    )
  ) |> 
  group_by(module_identity, dmel, cflo) |> 
  tally() |> 
  arrange(module_identity, desc(n)) |> 
  ungroup() |> 
  DT::datatable(
    rownames = FALSE
  )

Identify rhythmic modules

Show code

db_rhy <- load_rhy_genes(
  sample = which.sample
)
###-###-###-###-###-###-###-###-
# Set your p-value of choice
# col_pval = "BH.Q"
col_pval = "default.pvalue"
# col_pval = "raw.pvalue"
###-###-###-###-###-###-###-###-
l_module_genes <- module_genes |> 
  arrange(module_identity) |> 
  group_split(module_identity) |> 
  purrr::map(
    ~ .x |> pull(gene_name)
  ) |> 
  setNames(unique(module_genes[["module_identity"]]))

l_rhy_genes <- db_rhy |> 
  purrr::map(
    ~ .x |> 
      filter(
        if_all(
          c(col_pval),
          ~ .x < 0.05
        )
      ) |> 
      filter(
        ID %in% unlist(l_module_genes)
      ) |> 
      pull(1) |> 
      unique()
  ) |> 
  purrr::compact()

Comparison

Modules vs. Rhythmic genes

Show code

writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")

#####################################################
How many genes are in each of my geneset of interest?
#####################################################

Show code

## MAKE YOUR LIST OF GENES OF INTEREST ##

# LIST ONE - WGCNA modules
list1 <- l_module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")

List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules

Show code

sapply(list1, length)

  C1   C2   C3   C4   C5 
1250  654  503  636  118

Show code

## LIST TWO - rhythmic genes
list2 <- l_rhy_genes
writeLines("List of interesting genes #2
----------------------------
Rhythmic genes identified by different algorithms")

List of interesting genes #2
----------------------------
Rhythmic genes identified by different algorithms

Show code

sapply(list2, length)

      ARS    empJTK GeneCycle       JTK    meta2d      RAIN 
      110       289       164        73       115       267

Show code

## CHECK FOR OVERLAP
# define size of genome
size = length(unique(c(unlist(list1), unlist(list2))))
# make a GOM object
gom.1v2 <- GeneOverlap::newGOM(
  list2, 
  list1,
  genome.size = size
)
# png(paste0(path_to_repo, "/results/figures/",
#            "02_pogo_GCN/",
#            sample.name[1],"_gom_1v2.png"),
#     width = 35, height = 15, units = "cm", res = 300)
GeneOverlap::drawHeatmap(
  gom.1v2,
  adj.p = TRUE,
  cutoff=0.05,
  what="odds.ratio",
  # what="Jaccard",
  log.scale = T,
  note.col = "black",
  grid.col = "Oranges"
)

Show code

# trash <- dev.off()

 # writeLines("How many genes exactly are overlapping between the pairwise comparisons")
# getMatrix(gom.1v4, name = "intersection") %>% t()

writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")

Visualizing the significant overlaps between your lists of interesting genes and the identified modules

Network statistics

From WGCNA-tutorial

Intramodular connectivity

“We begin by calculating the intramodular connectivity for each gene. (In network literature, connectivity is often referred to as”degree”.) The function intramodularConnectivity computes:

the whole network connectivity kTotal,
the within (intra)module connectivity kWithin,
the extra-modular connectivity kOut=kTotal-kWithin, and
the difference of the intra- and extra-modular connectivities kDiff = kIn - kOut = 2*kIN-kTotal

Show code

# From what I can tell, colorh1 in the tutorial refers to moduleColors
colorh1 <- merge$colors

# Calculate the connectivities of the genes
Alldegrees1 = WGCNA::intramodularConnectivity(
  adjMat = adj_matrix, 
  colors = colorh1
) |> 
  tibble::rownames_to_column("gene_name") |>  
    mutate(
      across(
        matches("^k"),
        ~ round(.x, 2)
      )
    )

Plotting the mean (± 95% CI) connectivity of genes in different modules

Show code

pd <- position_dodge(0.1)

# which_var <- "kTotal"
which_var <- c("kTotal", "kWithin", "kOut", "kDiff")

Alldegrees1 |>  
  # rownames_to_column("gene_name") %>% 
  left_join(
    module_genes, 
    join_by(gene_name)
  ) |> 
  # PLOT FROM RAW DATA
  mutate(
    module_identity = factor(
      module_identity, 
      levels = paste0(
        "C",
        sort(
          unique(module_genes$module_identity) |> 
            stringr::str_replace("C", "") |> 
            as.integer()
        )
      ) |> rev()
    )
  ) |> 
  summarySE(
    measurevar = which_var, 
    groupvars = "module_identity"
  ) |> 
  mutate(
    type = factor(
      type,
      levels = which_var
    )
  ) |> 
  # Plot
  ggplot(
    aes(
      y = module_identity, 
      x = mean,
      group = interaction(module_identity, type)
    )
  ) +
  geom_vline(xintercept = 0, col = "maroon", alpha = 0.7) +
  labs(
    y = "",
    x = glue::glue(
      "Connectivity"
    ),
    title = ""
  ) +
  ## Add error bar here
  geom_errorbar(
    aes(xmin = mean-ci, xmax = mean+ci),
    width = 0.3, 
    position=pd, 
    lwd = 1.3,
    col="black", 
    alpha = 1
  ) +
  # Add the points on top of the error bars
  geom_point(
    position = pd, 
    size = 3,
    col = "black", 
    fill = "orange",
    show.legend = F, 
    pch = 21,
    alpha = 0.9
  ) +
  facet_wrap(
    ~ type,
    nrow = 2,
    scales = "free_x"
  ) +
  scale_x_continuous(
    n.breaks = 4
  ) +
  theme_bw(25) +
  # scale_color_manual(values=c("#F20505","#F5D736","#0FBF67")) +
  theme(
    legend.position = "none"
  )