• Tempo of Gene Expression
  • Build GCN (stepwise)
  • Build GCN (oneshot)
  • Rhy genes across tissues
    • Mice
    • Baboon
    • Flies
    • Mosquitoes
  • Contact

On this page

  • (INT) Create databases
  • ONE-SHOT (make_modules)
    • Input data
    • Make and plot modules
    • Modify network plot
  • STEP-BY-STEP
  • (PREP) Log-transform data
  • (SUBSET) Filter data
  • (SUMMARIZE) One series per gene
  • (WGCNA) Format data
    • QC
  • Calculate gene-gene similarity
    • QC
  • Create adjacency matrix
    • USER INPUT REQUIRED —-
  • Identify gene clusters
    • 2.1 Create topological overalp matrix
    • 2.2 Identify clusters
    • 2.3 Merge similar modules
    • 2.4 Calculate module-module similarity
    • 2.5 Visualize the network
  • Save module identity
  • Identify rhythmic modules
    • Comparison
  • Network statistics
    • Intramodular connectivity

Build gene co-expression network (GCN) from time-course gene expression data


Last updated on 2025-10-30.


Show code
library(dplyr)
library(dbplyr)
library(ggplot2)
for (i in list.files(here::here("R"), full.names = TRUE)) {
  source(i)
}

# SAMPLE NAME
## specify sample name
sample.names <- c(
  # dmel
  "dmel-head",
  # mmus
  "mmus-brain_stem", 
  # panu
  "panu-hypothalamus"
)
# sample.cycles <- c("LD", "DD")

## SPECIFY THE DATASET TO BUILD GCN WITH
which.sample <- sample.names[1]

writeLines(
  glue::glue("Sample: {which.sample}")
)
Sample: dmel-head

(INT) Create databases

Show code
data.db <- load_data(
  sample_names = sample.names
)

cat("Structure of input data:")
Structure of input data:
Show code
data.db[[which.sample]] |> 
  head(5) |> 
  mutate_if(
    is.numeric,
    ~ round(.x, 1)
  ) |> 
  DT::datatable(
    caption = "Column 1: Name of feature (gene), Columns 2-N: Timepoints, Value: Expression / Activity",
    rownames = FALSE
  )
Show code
# cat("Here are the tables, in each database.")
# purrr::map(
#   data.db,
#   ~ src_dbi(.x)
# )

ONE-SHOT (make_modules)

Input data

|gene_name   |  ZT00|  ZT03|  ZT06|  ZT09|  ZT12|  ZT15|  ZT18|  ZT21|
|:-----------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
|FBgn0000008 | 4.852| 5.215| 4.281| 9.320| 7.119| 2.778| 7.372| 5.757|
|FBgn0000008 | 4.663| 5.024| 4.115| 8.958| 6.904| 2.769| 7.086| 5.533|
|FBgn0000014 | 0.039| 0.072| 3.509| 0.000| 0.056| 0.000| 0.016| 0.358|
|FBgn0000015 | 0.000| 0.015| 4.928| 0.000| 0.000| 0.135| 0.000| 1.598|

Make and plot modules

Show code
mods <- timecourseRnaseq::make_modules(
  # data = data.db[[which.sample]],
  data = data.db[[3]],
  log2 = TRUE,
  id_column = "gene_name",
  min_expression = NULL,  # automatically estimated
  min_timepoints = NULL,  # automatically estimated
  method = "wgcna",
  qc = FALSE,
  sim_method = "kendall",
  soft_power = NULL,      # automatically estimated
  min_module_size = 50,
  max_modules = 15,
  merge_cutoff_similarity = 0.9,
  plot_network_min_edge = 0.5,
  plot_network = TRUE,
  tidy_modules = FALSE
  )

mods |> str(max.level = 1)

Modify network plot

Show code
plot_adj_as_network(
  matrix = mods[["adj_matrix_ME"]][["ME"]],
  # layout = igraph::layout.sugiyama,
  layout = igraph::layout_in_circle, # changed layout
  min_edge = 0.6,
  node_label_size = 1.2,
  node_size = 20,
  edge_size = 3,
  node_frame_col = "grey20",
  node_fill_col = "grey80",
  vertex.frame.width = 3
)

STEP-BY-STEP

(PREP) Log-transform data

Show code
# Define the function
dat <- log2_transform_data(
  data = data.db[[which.sample]],
  id_column = "gene_name"
)
Applying log2-transformation...Done.

(SUBSET) Filter data

Show code
# Default: min_expression and min_timepoints
min_expression <- dat |> 
  select(-gene_name) |> 
  summarize(across(everything(), ~ mean(.x, na.rm = TRUE))) |> 
  rowMeans() |> 
  ceiling()

min_timepoints <- ceiling(ncol(dat)*(2/3))
Show code
# Which genes are expressed throughout the day in forager heads?
  # count the number of time points that has ≥ 1 FPKM
  # subset the data and only keep the filtered genes
  
dat_sub <- dat |> 
  subset_data(
    min_expression = min_expression,
    min_timepoints = min_timepoints,
    id_column = "gene_name"
  )
Subsetting data...Done.
[ NOTE ]: After subsetting, 5568 of 12451 rows remain.
Show code
writeLines("Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]")
Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]
Show code
dim(dat[-1])
[1] 12451     8

(SUMMARIZE) One series per gene

Show code
dat_sub_unique <- dat_sub |>
  group_by(gene_name) |> 
  reframe(
    across(
      everything(),
      ~ mean(.x, na.rm = TRUE)
    )
  )

This is our cleaned, input data file for building the circadian GCN.

(WGCNA) Format data

  • transpose the dataset: rows = timepoints, cols = genes
Show code
datExpr <- dat_sub_unique %>% 
  transpose_data() 

QC

Show code
datExpr %>%
  check_sample_quality()
 Flagging genes and samples with too many missing values...
  ..step 1
All okay!
Show code
datExpr %>% 
  plot_sample_expression()
Visualizing the log-transformed data

Calculate gene-gene similarity

Show code
# Calculate Kendall's tau-b correlation for each gene-gene pair
sim_matrix <- calculate_gene_gene_sim(
  data = datExpr,
  name = which.sample,
  cache = FALSE
)
Running gene-gene similarity...Done.

QC

Show code
cat("Before power transformation:")
Before power transformation:
Show code
plot_sim_matrix(
  matrix = sim_matrix
)
Plotting a chunk of the gene-gene similarity matrix with 200 genes.

Create adjacency matrix

USER INPUT REQUIRED —-

To create the adjacency matrix, we need to first identify the soft-thresholding power (see WGCNA for more info).

Show code
sft <- analyze_network_topology(
  data = datExpr
)
Performing network topology analysis to pick 
  soft-thresholding power...
   Power SFT.R.sq    slope truncated.R.sq mean.k. median.k. max.k.
1      1 0.968000  2.93000          0.959    1650      1730   2060
2      2 0.951000  1.24000          0.937    1120      1180   1620
3      3 0.918000  0.65200          0.896     851       879   1360
4      4 0.878000  0.33300          0.849     683       691   1190
5      5 0.650000  0.13500          0.578     569       572   1060
6      6 0.000434 -0.00168         -0.237     486       483    966
7      7 0.638000 -0.10600          0.549     422       416    887
8      8 0.775000 -0.19400          0.710     373       364    822
9      9 0.851000 -0.26000          0.808     333       322    766
10    10 0.879000 -0.30900          0.846     300       287    718
11    12 0.882000 -0.39800          0.852     248       230    638
12    15 0.886000 -0.49000          0.870     195       171    550
13    18 0.886000 -0.56000          0.888     159       134    483
14    21 0.883000 -0.62100          0.892     133       106    431

Plotting resutls from the network topology analysis...

Done.
[ NOTE, FIGURE ]: Red horizontal line indicates a signed R^2 of 0.9

NOTE: The scale-free topology fit index reaches close to 1 (red horizontal line) at a soft-thresholding-power of 1.

Show code
my_estimate <- sft$fitIndices |> 
  filter(
    SFT.R.sq > 0.8 &
      SFT.R.sq <= 0.9,
    Power > 6 &
      Power < 14
  ) |> 
  arrange(
    Power
  ) |> 
  head(1) |> 
  pull(1)

Now, we can go ahead and create our adjacency matrix by power-transforming the similarity matrix (see WGCNA for more info).

Show code
## Specify the soft-thresholding-power
soft.power = if_else(
  sft$powerEstimate < 16,
  max(sft$powerEstimate, my_estimate),
  my_estimate
)

cat("Selected soft-power:", soft.power)
Selected soft-power: 9
Show code
# Construct adjacency matrix
# TO DO: make function ----
adj_matrix <- WGCNA::adjacency.fromSimilarity(
  sim_matrix,
  power = soft.power,
  type = "signed"
) |> 
  as.matrix()

cat("After power transformation:")
After power transformation:
Show code
plot_sim_matrix(
  matrix = adj_matrix
)
Plotting a chunk of the gene-gene similarity matrix with 200 genes.

Identify gene clusters

The following steps are performed as per guidelines from the WGCNA package and several tutorials made available online.

2.1 Create topological overalp matrix

Show code
# Turn adjacency into topological overlap
dissTOM = 1 - WGCNA::TOMsimilarity(adj_matrix)
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
Show code
# Call the hierarchical clustering function
geneTree = perform_hclust(
  data = dissTOM
)

2.2 Identify clusters

User defined parameters:

  • minimum size (number of genes) of modules

We like large modules, so we set the minimum module size relatively high (50).

Show code
minModuleSize = 50
# Module identification using dynamic tree cut:
modules <- create_modules(
  tree = geneTree,
  dissTOM = dissTOM,
  data = datExpr,
  min_module_size = minModuleSize
)

Merging modules that have a correlation ≥ 0.9 ...Done.
[ NOTE, FIGURE ] Plotting identified clusters before and after merging.

Module (cluster) size:
mergedColors
        black          blue         brown     darkgreen darkturquoise 
          150           210           185           260           130 
        green        grey60     lightcyan    lightgreen   lightyellow 
          155           990           238           314           213 
 midnightblue     royalblue 
          118           198 

2.3 Merge similar modules

User defined parameters:

  • minimum correlation between two modules above which they are merged into one | var-name: MEDissThres
Show code
cutoff <- 0.8

merge = create_modules(
  tree = geneTree,
  dissTOM = dissTOM,
  min_module_size = minModuleSize,
  merge_cutoff_similarity = 0.8, # 70 % similarity
  data = datExpr
)

Merging modules that have a correlation ≥ 0.8 ...Done.
[ NOTE, FIGURE ] Plotting identified clusters before and after merging.

Module (cluster) size:
mergedColors
       black       grey60   lightgreen  lightyellow midnightblue 
         503         1250          654          636          118 
Show code
n_modules <- ncol(merge$modules$newMEs)

while (n_modules > 15) {
  cutoff <- cutoff - 0.05
  merge <- create_modules(
    tree = geneTree,
    dissTOM = dissTOM,
    min_module_size = minModuleSize,
    merge_cutoff_similarity = cutoff, # 70 % similarity
    data = datExpr
  )
  n_modules <- ncol(merge$modules$newMEs)
}

cat("Cutoff:", cutoff)
Cutoff: 0.8
Show code
cat("Number of modules:", n_modules)
Number of modules: 5

2.4 Calculate module-module similarity

Show code
adj_matrix_ME <- calculate_module_module_sim(
  merged_modules = merge[["modules"]]
)
Calculating module-module similarity based 
  on module-eigengene-expression...Done.
Tidying module names...Done.
Plotting adjacency matrix for module-module similarity...

Done.

2.5 Visualize the network

Show code
plot_adj_as_network(
  layout = igraph::layout_in_circle,
  matrix = adj_matrix_ME$ME,
  min_edge = 0.5,
  node_label_size = 1.2,
  node_size = 35,
  edge_size = 5,
  node_frame_col = "grey20",
  node_fill_col = "grey80",
  vertex.frame.width = 4
)
Visualizing a simplified representation of the circadian GCN

Save module identity

Obtain a list of genes in each module

Show code
module_genes <- tidy_modules(
  merged_modules = merge[["colors"]],
  mapping_tbl = adj_matrix_ME$mapping_tbl,
  data = datExpr
)

# saveRDS(
#   module_genes,
#   here::here(
#     glue::glue(
#       "./data/tmp/{which.sample}_module_genes.RDS"
#     )
#   )
# )

# TO DO:
# Save the `dat_module_gene` as a database table in the respective database.
NoteCompare GCN vs. ants
Show code
genes_across_species <- module_genes |> 
  convert_id(
    from = "mouse",
    to = c("cflo", "drosophila")
  )

genes_across_species |> 
  group_by(gene_name, module_identity) |> 
  reframe(
    dmel = length(
      unique(
        na.omit(drosophila_ID)
      )
    ),
    cflo = length(
      unique(
        na.omit(cflo_ID)
      )
    )
  ) |> 
  arrange(
    module_identity,
    desc(dmel),
    desc(cflo)
  ) |> 
  mutate(
    across(
      c(dmel, cflo),
      ~ case_when(
        .x > 1 ~ "multi",
        .x == 1 ~ "single",
        .x == 0 ~ "x",
        .default = NA
      ) |> 
        factor(
          levels = c("multi", "single", "x")
        )
    )
  ) |> 
  group_by(module_identity, dmel, cflo) |> 
  tally() |> 
  arrange(module_identity, desc(n)) |> 
  ungroup() |> 
  DT::datatable(
    rownames = FALSE
  )

Identify rhythmic modules

Show code
db_rhy <- load_rhy_genes(
  sample = which.sample
)
###-###-###-###-###-###-###-###-
# Set your p-value of choice
# col_pval = "BH.Q"
col_pval = "default.pvalue"
# col_pval = "raw.pvalue"
###-###-###-###-###-###-###-###-
l_module_genes <- module_genes |> 
  arrange(module_identity) |> 
  group_split(module_identity) |> 
  purrr::map(
    ~ .x |> pull(gene_name)
  ) |> 
  setNames(unique(module_genes[["module_identity"]]))

l_rhy_genes <- db_rhy |> 
  purrr::map(
    ~ .x |> 
      filter(
        if_all(
          c(col_pval),
          ~ .x < 0.05
        )
      ) |> 
      filter(
        ID %in% unlist(l_module_genes)
      ) |> 
      pull(1) |> 
      unique()
  ) |> 
  purrr::compact()

Comparison

Modules vs. Rhythmic genes

Show code
writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")
#####################################################
How many genes are in each of my geneset of interest?
#####################################################
Show code
## MAKE YOUR LIST OF GENES OF INTEREST ##

# LIST ONE - WGCNA modules
list1 <- l_module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")
List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules
Show code
sapply(list1, length)
  C1   C2   C3   C4   C5 
1250  654  503  636  118 
Show code
## LIST TWO - rhythmic genes
list2 <- l_rhy_genes
writeLines("List of interesting genes #2
----------------------------
Rhythmic genes identified by different algorithms")
List of interesting genes #2
----------------------------
Rhythmic genes identified by different algorithms
Show code
sapply(list2, length)
      ARS    empJTK GeneCycle       JTK    meta2d      RAIN 
      110       289       164        73       115       267 
Show code
## CHECK FOR OVERLAP
# define size of genome
size = length(unique(c(unlist(list1), unlist(list2))))
# make a GOM object
gom.1v2 <- GeneOverlap::newGOM(
  list2, 
  list1,
  genome.size = size
)
# png(paste0(path_to_repo, "/results/figures/",
#            "02_pogo_GCN/",
#            sample.name[1],"_gom_1v2.png"),
#     width = 35, height = 15, units = "cm", res = 300)
GeneOverlap::drawHeatmap(
  gom.1v2,
  adj.p = TRUE,
  cutoff=0.05,
  what="odds.ratio",
  # what="Jaccard",
  log.scale = T,
  note.col = "black",
  grid.col = "Oranges"
)

Show code
# trash <- dev.off()

 # writeLines("How many genes exactly are overlapping between the pairwise comparisons")
# getMatrix(gom.1v4, name = "intersection") %>% t()

writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")
Visualizing the significant overlaps between your lists of interesting genes and the identified modules

Network statistics

From WGCNA-tutorial

Intramodular connectivity

“We begin by calculating the intramodular connectivity for each gene. (In network literature, connectivity is often referred to as”degree”.) The function intramodularConnectivity computes:

  • the whole network connectivity kTotal,
  • the within (intra)module connectivity kWithin,
  • the extra-modular connectivity kOut=kTotal-kWithin, and
  • the difference of the intra- and extra-modular connectivities kDiff = kIn - kOut = 2*kIN-kTotal
Show code
# From what I can tell, colorh1 in the tutorial refers to moduleColors
colorh1 <- merge$colors

# Calculate the connectivities of the genes
Alldegrees1 = WGCNA::intramodularConnectivity(
  adjMat = adj_matrix, 
  colors = colorh1
) |> 
  tibble::rownames_to_column("gene_name") |>  
    mutate(
      across(
        matches("^k"),
        ~ round(.x, 2)
      )
    )

Plotting the mean (± 95% CI) connectivity of genes in different modules

Show code
pd <- position_dodge(0.1)

# which_var <- "kTotal"
which_var <- c("kTotal", "kWithin", "kOut", "kDiff")

Alldegrees1 |>  
  # rownames_to_column("gene_name") %>% 
  left_join(
    module_genes, 
    join_by(gene_name)
  ) |> 
  # PLOT FROM RAW DATA
  mutate(
    module_identity = factor(
      module_identity, 
      levels = paste0(
        "C",
        sort(
          unique(module_genes$module_identity) |> 
            stringr::str_replace("C", "") |> 
            as.integer()
        )
      ) |> rev()
    )
  ) |> 
  summarySE(
    measurevar = which_var, 
    groupvars = "module_identity"
  ) |> 
  mutate(
    type = factor(
      type,
      levels = which_var
    )
  ) |> 
  # Plot
  ggplot(
    aes(
      y = module_identity, 
      x = mean,
      group = interaction(module_identity, type)
    )
  ) +
  geom_vline(xintercept = 0, col = "maroon", alpha = 0.7) +
  labs(
    y = "",
    x = glue::glue(
      "Connectivity"
    ),
    title = ""
  ) +
  ## Add error bar here
  geom_errorbar(
    aes(xmin = mean-ci, xmax = mean+ci),
    width = 0.3, 
    position=pd, 
    lwd = 1.3,
    col="black", 
    alpha = 1
  ) +
  # Add the points on top of the error bars
  geom_point(
    position = pd, 
    size = 3,
    col = "black", 
    fill = "orange",
    show.legend = F, 
    pch = 21,
    alpha = 0.9
  ) +
  facet_wrap(
    ~ type,
    nrow = 2,
    scales = "free_x"
  ) +
  scale_x_continuous(
    n.breaks = 4
  ) +
  theme_bw(25) +
  # scale_color_manual(values=c("#F20505","#F5D736","#0FBF67")) +
  theme(
    legend.position = "none"
  )