Title: | Explore, Analyze and Visualize Catalogs and Patterns of Copy Number Variation in Cancer Genomics |
---|---|
Description: | Provides functionality for exploring, analyzing and visualizing the copy number variation (CNV) motifs in cancer genomics. |
Authors: | Shixiang Wang [aut, cre] |
Maintainer: | Shixiang Wang <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.1.0 |
Built: | 2024-11-19 03:26:13 UTC |
Source: | https://github.com/ShixiangWang/CNVMotif |
Build a Substitution Matrix
build_sub_matrix(simple_version = FALSE, max_len_score = 4L)
build_sub_matrix(simple_version = FALSE, max_len_score = 4L)
simple_version |
if |
max_len_score |
the maximum score for segment length (should be >= 4). The maximum score for copy number value is 6 (cannot be changed). |
a list
.
sub_list <- build_sub_matrix() sub_list2 <- build_sub_matrix(simple_version = TRUE)
sub_list <- build_sub_matrix() sub_list2 <- build_sub_matrix(simple_version = TRUE)
cluster::clusGap()
cannot be used here for distance matrix, so
it is removed.
cluster_pam_estimate( x, method = c("silhouette", "wss"), k.max = 10, verbose = interactive(), barfill = "steelblue", barcolor = "steelblue", linecolor = "steelblue", FUNcluster = cluster::pam, seed = 1234L, clean_memory = FALSE, ... ) cluster_pam(x, k, ...)
cluster_pam_estimate( x, method = c("silhouette", "wss"), k.max = 10, verbose = interactive(), barfill = "steelblue", barcolor = "steelblue", linecolor = "steelblue", FUNcluster = cluster::pam, seed = 1234L, clean_memory = FALSE, ... ) cluster_pam(x, k, ...)
x |
a dissimilarity matrix. |
method |
the method to be used for estimating the optimal number of clusters. Possible values are "silhouette" (for average silhouette width) and "wss" (for total within-cluster sum of squares). "gap_stat" (for gap statistics) is not available here, because cluster::clusGap() cannot be used with a distance matrix. |
k.max |
the maximum number of clusters to consider, must be at least two. |
verbose |
logical value. If TRUE, progress information is printed. |
barfill |
fill color and outline color for bars |
barcolor |
fill color and outline color for bars |
linecolor |
color for lines |
FUNcluster |
a partitioning function which accepts as first argument a
(data) matrix like x, second argument, say k, k >= 2, the number of
clusters desired, and returns a list with a component named cluster which
contains the grouping of observations. Allowed values include: kmeans,
cluster::pam, cluster::clara, cluster::fanny, hcut, etc. This argument is
not required when x is an output of the function
|
seed |
random seed. |
clean_memory |
logical. If |
... |
other parameters passing to cluster::pam. |
k |
positive integer specifying the number of clusters, less than the number of observations. |
a ggplot
object.
a PAM clustering result object.
data("iris") head(iris) iris.scaled <- scale(iris[, -5]) iris.dist <- dist(iris.scaled) %>% as.matrix() p <- cluster_pam_estimate(iris.dist) p2 <- cluster_pam_estimate(iris.dist, method = "wss") cl <- cluster_pam(iris.dist, 3)
data("iris") head(iris) iris.scaled <- scale(iris[, -5]) iris.dist <- dist(iris.scaled) %>% as.matrix() p <- cluster_pam_estimate(iris.dist) p2 <- cluster_pam_estimate(iris.dist, method = "wss") cl <- cluster_pam(iris.dist, 3)
Split Cluster Sequence into List
cluster_split(x, s = NULL, block_size = 10)
cluster_split(x, s = NULL, block_size = 10)
x |
a named integer vector from |
s |
default is |
block_size |
block size used to split, only used when |
a list
.
Run Modified Multiple Sequence Alignment
do_msa( x, substitutionMatrix = NULL, gapOpening = 6, gapExtension = 1, verbose = FALSE, ... )
do_msa( x, substitutionMatrix = NULL, gapOpening = 6, gapExtension = 1, verbose = FALSE, ... )
x |
a character vector. |
substitutionMatrix |
substitution matrix for scoring matches and mismatches.
Default is |
gapOpening |
gap opening penalty; Note that the sign of this parameter is ignored. |
gapExtension |
gap extension penalty; Note that the sign of this parameter is ignored. |
verbose |
if |
... |
other arguments passing to msa::msa |
a list
.
r <- do_msa(c("ABCDF", "BCDEF")) r
r <- do_msa(c("ABCDF", "BCDEF")) r
See get_score_matrix()
for examples.
The result sequences are unique and sorted.
extract_seqs( dt, len = 5L, step = 1L, local_cutoff = 1e+07, flexible_approach = FALSE, return_dt = FALSE )
extract_seqs( dt, len = 5L, step = 1L, local_cutoff = 1e+07, flexible_approach = FALSE, return_dt = FALSE )
dt |
a |
len |
cut length. |
step |
step size to move on each chromosome sequence. |
local_cutoff |
any segment with length greater than this cutoff will be filtered out and
used as cutpoint, default is |
flexible_approach |
if |
return_dt |
if |
a list
.
Get Copy Number Sequence Similarity or Distance Matrix
get_score_matrix( x, sub_mat = NULL, simple_version = FALSE, block_size = NULL, dislike = FALSE, cores = 1L, verbose = FALSE )
get_score_matrix( x, sub_mat = NULL, simple_version = FALSE, block_size = NULL, dislike = FALSE, cores = 1L, verbose = FALSE )
x |
a coding copy number sequence (valid letters are A to X). |
sub_mat |
default is |
simple_version |
if |
block_size |
the block size used for aggregation. This is designed for big data: results from adjacent sequences are aggregated by their mean to reduce the size of the result matrix. |
dislike |
if |
cores |
computer cores, default is |
verbose |
if |
a score matrix.
load(system.file("extdata", "toy_segTab.RData", package = "CNVMotif", mustWork = TRUE )) x <- transform_seqs(segTabs) x seqs <- extract_seqs(x$dt) seqs seqs2 <- extract_seqs(x$dt, flexible_approach = TRUE) seqs2 mat <- get_score_matrix(seqs$keep, x$mat, verbose = TRUE) mat mat2 <- get_score_matrix(seqs$keep, x$mat, dislike = TRUE) identical(mat2, 120L - mat) mat_b <- get_score_matrix(seqs$keep, x$mat, block_size = 2L) ## block1 represents the first 2 sequences ## block2 represents the 3rd, 4th sequences ## ... mat_b mat_c <- get_score_matrix(seqs$keep) mat_c mat_d <- get_score_matrix(seqs$keep, dislike = TRUE) mat_d if (requireNamespace("doParallel")) { mock_seqs <- sapply(1:10000, function(x) { paste(sample(LETTERS[1:24], 5, replace = TRUE), collapse = "") }) system.time( y1 <- get_score_matrix(mock_seqs, x$mat, cores = 1) ) system.time( y2 <- get_score_matrix(mock_seqs, x$mat, cores = 2) ) all.equal(y1, y2) }
load(system.file("extdata", "toy_segTab.RData", package = "CNVMotif", mustWork = TRUE )) x <- transform_seqs(segTabs) x seqs <- extract_seqs(x$dt) seqs seqs2 <- extract_seqs(x$dt, flexible_approach = TRUE) seqs2 mat <- get_score_matrix(seqs$keep, x$mat, verbose = TRUE) mat mat2 <- get_score_matrix(seqs$keep, x$mat, dislike = TRUE) identical(mat2, 120L - mat) mat_b <- get_score_matrix(seqs$keep, x$mat, block_size = 2L) ## block1 represents the first 2 sequences ## block2 represents the 3rd, 4th sequences ## ... mat_b mat_c <- get_score_matrix(seqs$keep) mat_c mat_d <- get_score_matrix(seqs$keep, dislike = TRUE) mat_d if (requireNamespace("doParallel")) { mock_seqs <- sapply(1:10000, function(x) { paste(sample(LETTERS[1:24], 5, replace = TRUE), collapse = "") }) system.time( y1 <- get_score_matrix(mock_seqs, x$mat, cores = 1) ) system.time( y2 <- get_score_matrix(mock_seqs, x$mat, cores = 2) ) all.equal(y1, y2) }
ggseqlogo
is a shortcut for generating sequence logos.
It adds the ggseqlogo theme theme_logo
by default, and facets when multiple input data are provided.
It serves as a convenient wrapper, so to customise logos beyond the defaults here, please use geom_logo
.
ggseqlogo2( data, facet = "wrap", scales = "free_x", ncol = NULL, nrow = NULL, idor = NULL, ... ) geom_logo2( data = NULL, method = "bits", seq_type = "auto", namespace = NULL, font = "roboto_medium", stack_width = 0.95, rev_stack_order = F, col_scheme = "auto", low_col = "black", high_col = "yellow", na_col = "grey20", plot = TRUE, idor = NULL, ... )
ggseqlogo2( data, facet = "wrap", scales = "free_x", ncol = NULL, nrow = NULL, idor = NULL, ... ) geom_logo2( data = NULL, method = "bits", seq_type = "auto", namespace = NULL, font = "roboto_medium", stack_width = 0.95, rev_stack_order = F, col_scheme = "auto", low_col = "black", high_col = "yellow", na_col = "grey20", plot = TRUE, idor = NULL, ... )
data |
Character vector of sequences or named list of sequences. All sequences must have same width |
facet |
Facet type, can be 'wrap' or 'grid' |
scales |
Facet scales, see |
ncol |
Number of columns, works only when |
nrow |
Number of rows, same as |
idor |
a named vector (like a dictionary) used to replace letters one-to-one in the plot. |
... |
Additional arguments passed to |
method |
Height method, can be one of "bits" or "probability" (default: "bits") |
seq_type |
Sequence type, can be one of "auto", "aa", "dna", "rna" or "other" (default: "auto", sequence type is automatically guessed) |
namespace |
Character vector of single letters to be used for custom namespaces. Can be alphanumeric, including Greek characters. |
font |
Name of font. See |
stack_width |
Width of letter stack between 0 and 1 (default: 0.95) |
rev_stack_order |
If |
col_scheme |
Color scheme applied to the sequence logo. See |
low_col |
Colors for low and high ends of the gradient if a quantitative color scheme is used (default: "black" and "yellow"). |
high_col |
Colors for low and high ends of the gradient if a quantitative color scheme is used (default: "black" and "yellow"). |
na_col |
Color for letters missing in color scheme (default: "grey20") |
plot |
If |
library(ggseqlogo) data(ggseqlogo_sample) ## Same as ggseqlogo() p1 <- ggseqlogo2(seqs_dna[[1]]) p1 ## Extra feature idor <- as.character(1:4) names(idor) <- c("A", "C", "G", "T") p2 <- ggseqlogo2(seqs_dna[[1]], idor = idor) p2
library(ggseqlogo) data(ggseqlogo_sample) ## Same as ggseqlogo() p1 <- ggseqlogo2(seqs_dna[[1]]) p1 ## Extra feature idor <- as.character(1:4) names(idor) <- c("A", "C", "G", "T") p2 <- ggseqlogo2(seqs_dna[[1]], idor = idor) p2
Show Copy Number Sequence Logos
show_seq_logo( x, method = c("prob", "bits"), simple_version = FALSE, ncol = NULL, nrow = NULL, recode = FALSE, indicator = NULL, ... )
show_seq_logo( x, method = c("prob", "bits"), simple_version = FALSE, ncol = NULL, nrow = NULL, recode = FALSE, indicator = NULL, ... )
x |
a character vector of sequences or named list of sequences. All sequences must have same width. |
method |
Height method, can be one of "prob" (probability) or "bits" (default: "prob") |
simple_version |
if |
ncol |
Number of columns, works only when |
nrow |
Number of rows, same as |
recode |
if |
indicator |
a named vector (like a dictionary) used to replace letters one-to-one in the plot. |
... |
Additional arguments passed to |
a ggplot
object
p1 <- show_seq_logo(sapply(split(LETTERS[1:24], 1:4), function(x) paste0(x, collapse = ""))) p1 p2 <- show_seq_logo(sapply(split(LETTERS[1:24], 1:4), function(x) paste0(x, collapse = "")), recode = TRUE ) p2 p3 <- show_seq_logo(sapply(split(LETTERS[1:6], 1:2), function(x) paste0(x, collapse = "")), simple_version = TRUE )
p1 <- show_seq_logo(sapply(split(LETTERS[1:24], 1:4), function(x) paste0(x, collapse = ""))) p1 p2 <- show_seq_logo(sapply(split(LETTERS[1:24], 1:4), function(x) paste0(x, collapse = "")), recode = TRUE ) p2 p3 <- show_seq_logo(sapply(split(LETTERS[1:6], 1:2), function(x) paste0(x, collapse = "")), simple_version = TRUE )
Show Copy Number Sequence Shapes
show_seq_shape( x, map = NULL, simple_version = FALSE, line_size_scale = 3, x_lab = ifelse(simple_version, "Assumed equal length", "Estimated segment length"), y_lab = "Copy number", nrow = NULL, ncol = NULL, scales = "free_x" )
show_seq_shape( x, map = NULL, simple_version = FALSE, line_size_scale = 3, x_lab = ifelse(simple_version, "Assumed equal length", "Estimated segment length"), y_lab = "Copy number", nrow = NULL, ncol = NULL, scales = "free_x" )
x |
a character vector of sequences or named list of sequences. All sequences must have same width. |
map |
default is |
simple_version |
if |
line_size_scale |
the scale size for line width. |
x_lab |
x lab. |
y_lab |
y lab. |
nrow |
Number of rows, same as |
ncol |
Number of columns, works only when |
scales |
Should scales be fixed ( |
a ggplot
object.
p <- show_seq_shape(c("ADGHK")) p x <- list(a = c("ABCDE", "AXFDP"), b = c("KKDFH", "GKDFM")) p2 <- show_seq_shape(x) p2 p3 <- show_seq_shape(c("ABCD"), simple_version = TRUE) p3
p <- show_seq_shape(c("ADGHK")) p x <- list(a = c("ABCDE", "AXFDP"), b = c("KKDFH", "GKDFM")) p2 <- show_seq_shape(x) p2 p3 <- show_seq_shape(c("ABCD"), simple_version = TRUE) p3
See get_score_matrix()
for examples. See details for full description
of implementation.
transform_seqs(x, simple_version = FALSE, max_len_score = 4L)
transform_seqs(x, simple_version = FALSE, max_len_score = 4L)
x |
a |
simple_version |
if |
max_len_score |
the maximum score for segment length (should be >= 4). The maximum score for copy number value is 6 (cannot be changed). |
For complicated cases, letters are grouped by segment length as short (<50kb), mid (<500kb), long (<5Mb) and extremely long (>5Mb) segments.
A B C D for copy number 0.
E F G H for copy number 1.
I J K L for copy number 2.
M N O P for copy number 3.
Q R S T for copy number 4.
U V W X for copy number 5+.
For simplified cases, letters are used to code only segment copy number value.
A for copy number 0.
B for copy number 1.
C for copy number 2.
D for copy number 3.
E for copy number 4.
F for copy number 5+.
a list
.