KATP Gene Correlations

Libraries
Cached-object strategy
Metadata
Transcript-to-gene mapping with gene symbols
Transcript quantifications
Transcript annotation (for collapsing to genes)
KATP gene-level TPM
KATP expression matrix
Pairwise gene correlations
Plot correlation matrix

#Explanation This script will show KATP channel subunit correlations in our mouse skeletal muscle (AT) data. From this paper:

Staunton et al

The data is freely available: E-MTAB-10601

In this script we use only control samples, but young and old. FULLRUN == TRUE would require the data downloaded and the gtfs installed in an equivilent location to our set up BUT we publish here as the default FULLRUN == FALSE so you can run it locally from Github with just the included rds files, Where preprocessing has already been done.

NB using LFS for the large rds files: git lfs track “txi_transcripts.rds”

GAI statement: This entire script was passed with Copilot to strip-out the unnecessary chunks we had in our original.

Libraries

library(tximport)
library(tximeta)
library(GenomicFeatures)
library(AnnotationDbi)
library(dplyr)
library(tidyr)
library(ggplot2)
library(viridis)

Cached-object strategy

When FULLRUN <- FALSE, this notebook reads only these cached RDS files:

metadata_gene_corr.rds
tx2gene_mouse_symbols.rds
txi_transcripts.rds

Metadata

if (FULLRUN) {
  metadata <- read.csv(METADATA_PATH, stringsAsFactors = FALSE)
  rownames(metadata) <- metadata$sample_id
  metadata$Age <- factor(metadata$Age, levels = c("Young", "Old"))
  saveRDS(metadata, METADATA_RDS)
} else {
  metadata <- readRDS(METADATA_RDS)
}

cat("Loaded", nrow(metadata), "samples\n")

## Loaded 10 samples

print(table(metadata$Age))

## 
## Young   Old 
##     5     5

Transcript-to-gene mapping with gene symbols

For gene level analysis: Only the mapping needed to collapse transcript abundance to gene-level TPM is retained.

if (FULLRUN) {
  cat("Building transcript-to-gene mapping from GTF...\n")
  txdb <- makeTxDbFromGFF(MOUSE_GTF, format = "gtf")
  k <- keys(txdb, keytype = "TXNAME")
  tx2gene <- AnnotationDbi::select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME")
  tx2gene$TXNAME_clean <- sub("\\..*", "", tx2gene$TXNAME)
  tx2gene$GENEID_clean <- sub("\\..*", "", tx2gene$GENEID)

  # Restore your preferred symbol annotation step here if rebuilding from scratch,
  # then save the gene-symbol-annotated object.
  stop("Add/restore your preferred gene-symbol annotation step here, then save to TX2GENE_SYMBOLS_RDS")

  saveRDS(tx2gene, TX2GENE_SYMBOLS_RDS)
} else {
  tx2gene <- readRDS(TX2GENE_SYMBOLS_RDS)
}

cat("Loaded mapping for", nrow(tx2gene), "transcripts\n")

## Loaded mapping for 142333 transcripts

Transcript quantifications

if (FULLRUN) {
  quant_files <- sapply(metadata$sample_id, function(sid) {
    sample_base <- sub("_R1\\.fastq\\.gz$", "", sid)
    
    sample_base <- tolower(sample_base)
    quant_dir <- file.path(BASE_PATH, paste0("quants_", sample_base))
    quant_file <- file.path(quant_dir, "quant.sf")
    if (file.exists(quant_file)) {
      quant_file
    } else {
      warning(paste("Missing quant file for:", sid))
      NA_character_
    }
  })

  names(quant_files) <- metadata$sample_id
  valid_samples <- names(quant_files)[!is.na(quant_files)]
  metadata <- metadata[valid_samples, , drop = FALSE]
  saveRDS(metadata, METADATA_RDS)
  quant_files <- quant_files[valid_samples]

  cat("Found", length(quant_files), "quant files\n")

  txi <- tximport(
    quant_files,
    type = "salmon",
    txOut = TRUE,
    ignoreTxVersion = TRUE,
    dropInfReps = FALSE
  )

  saveRDS(txi, TXI_TRANSCRIPTS_RDS)
} else {
  txi <- readRDS(TXI_TRANSCRIPTS_RDS)
}

cat("Imported", nrow(txi$counts), "transcripts across", ncol(txi$counts), "samples\n")

## Imported 116546 transcripts across 10 samples

Transcript annotation (for collapsing to genes)

tx_ids <- rownames(txi$counts)
tx_ids_clean <- sub("\\..*", "", tx_ids)

tx_annotation <- data.frame(
  transcript_id = tx_ids,
  transcript_id_clean = tx_ids_clean,
  stringsAsFactors = FALSE
) %>%
  left_join(
    tx2gene %>%
      dplyr::select(TXNAME_clean, GENEID_clean, gene_symbol) %>%
      distinct(),
    by = c("transcript_id_clean" = "TXNAME_clean")
  ) %>%
  dplyr::rename(
    gene_id = GENEID_clean,
    symbol = gene_symbol
  )

cat("Annotated", nrow(tx_annotation), "transcripts\n")

## Annotated 116546 transcripts

cat("Transcripts with gene symbols:", sum(!is.na(tx_annotation$symbol)), "\n")

## Transcripts with gene symbols: 115881

cat("Unknown transcripts:", sum(is.na(tx_annotation$symbol)), "\n")

## Unknown transcripts: 665

KATP gene-level TPM

expr_long <- as.data.frame(t(txi$abundance)) %>%
  mutate(sample_id = rownames(.)) %>%
  pivot_longer(cols = -sample_id, names_to = "transcript_id", values_to = "TPM") %>%
  left_join(metadata %>% dplyr::select(sample_id, Age), by = "sample_id") %>%
  left_join(tx_annotation %>% dplyr::select(transcript_id, gene_id, symbol), by = "transcript_id")

katp_gene_tpm <- expr_long %>%
  dplyr::filter(symbol %in% KATP_GENES) %>%
  group_by(sample_id, Age, gene_id, symbol) %>%
  summarise(TPM = sum(TPM), .groups = "drop") %>%
  mutate(channel = KATP_LABELS[symbol])

print(katp_gene_tpm %>% dplyr::select(sample_id, Age, gene_id, symbol, channel, TPM) %>% head())

## # A tibble: 6 × 6
##   sample_id                                  Age   gene_id symbol channel    TPM
##   <chr>                                      <fct> <chr>   <chr>  <chr>    <dbl>
## 1 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.… Old   ENSMUS… Kcnj8  Kir6.1   2.04 
## 2 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.… Old   ENSMUS… Abcc9  SUR2     3.35 
## 3 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.… Old   ENSMUS… Abcc8  SUR1     0.572
## 4 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.… Old   ENSMUS… Kcnj11 Kir6.2  20.1  
## 5 24-Old_control_AT_4_AATCCGGA-AACTGTAG_R1.… Old   ENSMUS… Kcnj8  Kir6.1   9.01 
## 6 24-Old_control_AT_4_AATCCGGA-AACTGTAG_R1.… Old   ENSMUS… Abcc9  SUR2    15.0

KATP expression matrix

katp_wide <- katp_gene_tpm %>%
  dplyr::select(sample_id, channel, TPM) %>%
  pivot_wider(names_from = channel, values_from = TPM) %>%
  as.data.frame()

rownames(katp_wide) <- katp_wide$sample_id
katp_wide$sample_id <- NULL

katp_mat <- log2(as.matrix(katp_wide) + 1)

want <- c("Kir6.1", "Kir6.2", "SUR1", "SUR2")
katp_mat <- katp_mat[, intersect(want, colnames(katp_mat)), drop = FALSE]

cat("Subunits in matrix:", paste(colnames(katp_mat), collapse = ", "), "\n")

## Subunits in matrix: Kir6.1, Kir6.2, SUR1, SUR2

print(head(katp_mat))

##                                                        Kir6.1   Kir6.2
## 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.fastq.gz    1.602020 4.400538
## 24-Old_control_AT_4_AATCCGGA-AACTGTAG_R1.fastq.gz    3.323142 5.831692
## 25-Old_control_AT_5_TAATACAG-GTGAATAT_R1.fastq.gz    2.627225 5.557657
## 27-Old_control_AT_7_CGGCGTGA-ACAGGCGC_R1.fastq.gz    2.898426 5.383647
## 29-Old_control_AT_9_ATGTAAGT-CATAGAGT_R1.fastq.gz    2.800623 5.383050
## 39-Young_control_AT_11_GCAGAATT-TGGCCGGT_R1.fastq.gz 3.331374 5.893688
##                                                           SUR1     SUR2
## 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.fastq.gz    0.6527701 2.121844
## 24-Old_control_AT_4_AATCCGGA-AACTGTAG_R1.fastq.gz    1.3540769 4.003786
## 25-Old_control_AT_5_TAATACAG-GTGAATAT_R1.fastq.gz    1.4302874 3.332094
## 27-Old_control_AT_7_CGGCGTGA-ACAGGCGC_R1.fastq.gz    1.0360623 3.249208
## 29-Old_control_AT_9_ATGTAAGT-CATAGAGT_R1.fastq.gz    0.9005805 3.325907
## 39-Young_control_AT_11_GCAGAATT-TGGCCGGT_R1.fastq.gz 1.0071424 4.554527

Pairwise gene correlations

## Correlation coefficients (pearson):

##        Kir6.1 Kir6.2  SUR1  SUR2
## Kir6.1  1.000  0.854 0.379 0.906
## Kir6.2  0.854  1.000 0.570 0.895
## SUR1    0.379  0.570 1.000 0.414
## SUR2    0.906  0.895 0.414 1.000

## 
## p-values:

##          Kir6.1   Kir6.2   SUR1     SUR2
## Kir6.1 0.000000 1.65e-03 0.2810 0.000302
## Kir6.2 0.001650 1.06e-62 0.0857 0.000468
## SUR1   0.281000 8.57e-02 0.0000 0.234000
## SUR2   0.000302 4.68e-04 0.2340 0.000000