#Explanation This script will show KATP channel subunit correlations in our mouse skeletal muscle (AT) data. From this paper:
The data is freely available: E-MTAB-10601
In this script we use only control samples, but young and old. FULLRUN == TRUE would require the data downloaded and the gtfs installed in an equivilent location to our set up BUT we publish here as the default FULLRUN == FALSE so you can run it locally from Github with just the included rds files, Where preprocessing has already been done.
NB using LFS for the large rds files: git lfs track “txi_transcripts.rds”
GAI statement: This entire script was passed with Copilot to strip-out the unnecessary chunks we had in our original.
library(tximport)
library(tximeta)
library(GenomicFeatures)
library(AnnotationDbi)
library(dplyr)
library(tidyr)
library(ggplot2)
library(viridis)
When FULLRUN <- FALSE, this notebook reads only these
cached RDS files:
metadata_gene_corr.rdstx2gene_mouse_symbols.rdstxi_transcripts.rdsif (FULLRUN) {
metadata <- read.csv(METADATA_PATH, stringsAsFactors = FALSE)
rownames(metadata) <- metadata$sample_id
metadata$Age <- factor(metadata$Age, levels = c("Young", "Old"))
saveRDS(metadata, METADATA_RDS)
} else {
metadata <- readRDS(METADATA_RDS)
}
cat("Loaded", nrow(metadata), "samples\n")
## Loaded 10 samples
print(table(metadata$Age))
##
## Young Old
## 5 5
For gene level analysis: Only the mapping needed to collapse transcript abundance to gene-level TPM is retained.
if (FULLRUN) {
cat("Building transcript-to-gene mapping from GTF...\n")
txdb <- makeTxDbFromGFF(MOUSE_GTF, format = "gtf")
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- AnnotationDbi::select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME")
tx2gene$TXNAME_clean <- sub("\\..*", "", tx2gene$TXNAME)
tx2gene$GENEID_clean <- sub("\\..*", "", tx2gene$GENEID)
# Restore your preferred symbol annotation step here if rebuilding from scratch,
# then save the gene-symbol-annotated object.
stop("Add/restore your preferred gene-symbol annotation step here, then save to TX2GENE_SYMBOLS_RDS")
saveRDS(tx2gene, TX2GENE_SYMBOLS_RDS)
} else {
tx2gene <- readRDS(TX2GENE_SYMBOLS_RDS)
}
cat("Loaded mapping for", nrow(tx2gene), "transcripts\n")
## Loaded mapping for 142333 transcripts
if (FULLRUN) {
quant_files <- sapply(metadata$sample_id, function(sid) {
sample_base <- sub("_R1\\.fastq\\.gz$", "", sid)
sample_base <- tolower(sample_base)
quant_dir <- file.path(BASE_PATH, paste0("quants_", sample_base))
quant_file <- file.path(quant_dir, "quant.sf")
if (file.exists(quant_file)) {
quant_file
} else {
warning(paste("Missing quant file for:", sid))
NA_character_
}
})
names(quant_files) <- metadata$sample_id
valid_samples <- names(quant_files)[!is.na(quant_files)]
metadata <- metadata[valid_samples, , drop = FALSE]
saveRDS(metadata, METADATA_RDS)
quant_files <- quant_files[valid_samples]
cat("Found", length(quant_files), "quant files\n")
txi <- tximport(
quant_files,
type = "salmon",
txOut = TRUE,
ignoreTxVersion = TRUE,
dropInfReps = FALSE
)
saveRDS(txi, TXI_TRANSCRIPTS_RDS)
} else {
txi <- readRDS(TXI_TRANSCRIPTS_RDS)
}
cat("Imported", nrow(txi$counts), "transcripts across", ncol(txi$counts), "samples\n")
## Imported 116546 transcripts across 10 samples
tx_ids <- rownames(txi$counts)
tx_ids_clean <- sub("\\..*", "", tx_ids)
tx_annotation <- data.frame(
transcript_id = tx_ids,
transcript_id_clean = tx_ids_clean,
stringsAsFactors = FALSE
) %>%
left_join(
tx2gene %>%
dplyr::select(TXNAME_clean, GENEID_clean, gene_symbol) %>%
distinct(),
by = c("transcript_id_clean" = "TXNAME_clean")
) %>%
dplyr::rename(
gene_id = GENEID_clean,
symbol = gene_symbol
)
cat("Annotated", nrow(tx_annotation), "transcripts\n")
## Annotated 116546 transcripts
cat("Transcripts with gene symbols:", sum(!is.na(tx_annotation$symbol)), "\n")
## Transcripts with gene symbols: 115881
cat("Unknown transcripts:", sum(is.na(tx_annotation$symbol)), "\n")
## Unknown transcripts: 665
expr_long <- as.data.frame(t(txi$abundance)) %>%
mutate(sample_id = rownames(.)) %>%
pivot_longer(cols = -sample_id, names_to = "transcript_id", values_to = "TPM") %>%
left_join(metadata %>% dplyr::select(sample_id, Age), by = "sample_id") %>%
left_join(tx_annotation %>% dplyr::select(transcript_id, gene_id, symbol), by = "transcript_id")
katp_gene_tpm <- expr_long %>%
dplyr::filter(symbol %in% KATP_GENES) %>%
group_by(sample_id, Age, gene_id, symbol) %>%
summarise(TPM = sum(TPM), .groups = "drop") %>%
mutate(channel = KATP_LABELS[symbol])
print(katp_gene_tpm %>% dplyr::select(sample_id, Age, gene_id, symbol, channel, TPM) %>% head())
## # A tibble: 6 × 6
## sample_id Age gene_id symbol channel TPM
## <chr> <fct> <chr> <chr> <chr> <dbl>
## 1 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.… Old ENSMUS… Kcnj8 Kir6.1 2.04
## 2 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.… Old ENSMUS… Abcc9 SUR2 3.35
## 3 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.… Old ENSMUS… Abcc8 SUR1 0.572
## 4 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.… Old ENSMUS… Kcnj11 Kir6.2 20.1
## 5 24-Old_control_AT_4_AATCCGGA-AACTGTAG_R1.… Old ENSMUS… Kcnj8 Kir6.1 9.01
## 6 24-Old_control_AT_4_AATCCGGA-AACTGTAG_R1.… Old ENSMUS… Abcc9 SUR2 15.0
katp_wide <- katp_gene_tpm %>%
dplyr::select(sample_id, channel, TPM) %>%
pivot_wider(names_from = channel, values_from = TPM) %>%
as.data.frame()
rownames(katp_wide) <- katp_wide$sample_id
katp_wide$sample_id <- NULL
katp_mat <- log2(as.matrix(katp_wide) + 1)
want <- c("Kir6.1", "Kir6.2", "SUR1", "SUR2")
katp_mat <- katp_mat[, intersect(want, colnames(katp_mat)), drop = FALSE]
cat("Subunits in matrix:", paste(colnames(katp_mat), collapse = ", "), "\n")
## Subunits in matrix: Kir6.1, Kir6.2, SUR1, SUR2
print(head(katp_mat))
## Kir6.1 Kir6.2
## 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.fastq.gz 1.602020 4.400538
## 24-Old_control_AT_4_AATCCGGA-AACTGTAG_R1.fastq.gz 3.323142 5.831692
## 25-Old_control_AT_5_TAATACAG-GTGAATAT_R1.fastq.gz 2.627225 5.557657
## 27-Old_control_AT_7_CGGCGTGA-ACAGGCGC_R1.fastq.gz 2.898426 5.383647
## 29-Old_control_AT_9_ATGTAAGT-CATAGAGT_R1.fastq.gz 2.800623 5.383050
## 39-Young_control_AT_11_GCAGAATT-TGGCCGGT_R1.fastq.gz 3.331374 5.893688
## SUR1 SUR2
## 21-Old_control_AT_1_GGCTTAAG-GGTCACGA_R1.fastq.gz 0.6527701 2.121844
## 24-Old_control_AT_4_AATCCGGA-AACTGTAG_R1.fastq.gz 1.3540769 4.003786
## 25-Old_control_AT_5_TAATACAG-GTGAATAT_R1.fastq.gz 1.4302874 3.332094
## 27-Old_control_AT_7_CGGCGTGA-ACAGGCGC_R1.fastq.gz 1.0360623 3.249208
## 29-Old_control_AT_9_ATGTAAGT-CATAGAGT_R1.fastq.gz 0.9005805 3.325907
## 39-Young_control_AT_11_GCAGAATT-TGGCCGGT_R1.fastq.gz 1.0071424 4.554527
## Correlation coefficients (pearson):
## Kir6.1 Kir6.2 SUR1 SUR2
## Kir6.1 1.000 0.854 0.379 0.906
## Kir6.2 0.854 1.000 0.570 0.895
## SUR1 0.379 0.570 1.000 0.414
## SUR2 0.906 0.895 0.414 1.000
##
## p-values:
## Kir6.1 Kir6.2 SUR1 SUR2
## Kir6.1 0.000000 1.65e-03 0.2810 0.000302
## Kir6.2 0.001650 1.06e-62 0.0857 0.000468
## SUR1 0.281000 8.57e-02 0.0000 0.234000
## SUR2 0.000302 4.68e-04 0.2340 0.000000