---
title: "Ludwig_et_al_example_notebook"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Ludwig_et_al_example_notebook}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width=7, 
  fig.height=5
)


```

```{r setup}
library(MitoHEAR)
```

## Dataset GSE115210

In the first part of the vignette it is shown the analysis performed on the dataset with GEO accession number GSE115210 from [Ludwig et al, 2019](https://doi.org/10.1016/j.cell.2019.01.022) used in fig. 2/S2. Individually sorted cells from clonally derived TF1 clones (C9, D6, and G10) were processed with single cell RNA-seq (Smart-seq2)

### Get counts for the four alleles in each base-cell pair

```{r}
load(system.file("extdata", "name_cells_fig_2.Rda", package = "MitoHEAR"))
load(system.file("extdata", "name_cells_fig_2_analysis.Rda", package = "MitoHEAR"))
```

We don't execute the function *get_raw_counts_allele* here and  we directly load his output. 
A command line implementation of the function *get_raw_counts_allele* is also available (see github README file for info).


```{r}

load(system.file("extdata", "output_SNP_mt_fig_2_new.Rda", package = "MitoHEAR"))

output_SNP_mt_fig_2  <-  result
matrix_allele_counts  <-  output_SNP_mt_fig_2[[1]]
name_position_allele  <-  output_SNP_mt_fig_2[[2]]
name_position  <-  output_SNP_mt_fig_2[[3]]
```

```{r}

my.clusters  <-  rep(0, length(name_cells_fig_2))
my.clusters[grep("_G10_", name_cells_fig_2)] <- "G10"
my.clusters[grep("_D6_", name_cells_fig_2)]  <- "D6"
my.clusters[grep("_C9_", name_cells_fig_2)]  <- "C9"


tfi_fig_2 <- get_heteroplasmy(matrix_allele_counts, name_position_allele, name_position, number_reads = 50, number_positions = 2000, filtering  =  1, my.clusters)

```


```{r}
sum_matrix <- tfi_fig_2[[1]]
sum_matrix_qc <- tfi_fig_2[[2]]
heteroplasmy_matrix_ci <- tfi_fig_2[[3]]
allele_matrix_ci<- tfi_fig_2[[4]]
cluster_ci <- my.clusters[row.names(sum_matrix)%in%row.names(sum_matrix_qc)]

index_ci <- tfi_fig_2[[5]]
```

```{r}

name_position_allele_qc  <- name_position_allele[name_position%in%colnames(sum_matrix_qc)]
name_position_qc <- name_position[name_position%in%colnames(sum_matrix_qc)]

```

```{r}
relevant_bases <- filter_bases(heteroplasmy_matrix_ci, min_heteroplasmy = 0.01, min_cells = 10, index_ci)
```


### Identification of most different bases according to heteroplasmy between clones

```{r warning=FALSE}
p_value_wilcox_test_1 <- get_wilcox_test(heteroplasmy_matrix_ci[, relevant_bases], cluster_ci, "C9", "D6" , index_ci)

p_value_wilcox_test_2 <- get_wilcox_test(heteroplasmy_matrix_ci[, relevant_bases], cluster_ci, "C9", "G10" , index_ci)

p_value_wilcox_test_3 <- get_wilcox_test(heteroplasmy_matrix_ci[, relevant_bases], cluster_ci, "D6", "G10" , index_ci)


```


```{r}
p_value_wilcox_test_sort_1 <- sort(p_value_wilcox_test_1, decreasing  =  F)
p_value_wilcox_test_sort_2 <- sort(p_value_wilcox_test_2, decreasing  =  F)
p_value_wilcox_test_sort_3 <- sort(p_value_wilcox_test_3, decreasing  =  F)

p_value_wilcox_test_sort_small_1 <- p_value_wilcox_test_sort_1[p_value_wilcox_test_sort_1<0.05]
p_value_wilcox_test_sort_small_2 <- p_value_wilcox_test_sort_2[p_value_wilcox_test_sort_2<0.05]
p_value_wilcox_test_sort_small_3 <- p_value_wilcox_test_sort_3[p_value_wilcox_test_sort_3<0.05]
```

```{r}
q  <- list()
for ( i in 1:length(p_value_wilcox_test_sort_small_3[1:2])){
p  <- plot_heteroplasmy(names(p_value_wilcox_test_sort_small_3)[i], heteroplasmy_matrix_ci, cluster_ci, index_ci)+ggplot2::ggtitle(paste(names(p_value_wilcox_test_sort_small_3)[i], round(p_value_wilcox_test_sort_small_3[i], 4), sep = "-"))
q  <- list(q, p)
}
q

q  <- list()
for ( i in names(p_value_wilcox_test_sort_small_3[1:2])){
p  <-  plot_allele_frequency(i, heteroplasmy_matrix_ci, allele_matrix_ci, cluster_ci, name_position_qc, name_position_allele_qc, 5, index_ci)  
q  <- list(q, p)
}
q


```


### Supervised cluster analysis among cells based on allele frequency values


```{r}

features_cluster  <-  c(names(p_value_wilcox_test_sort_small_1), names(p_value_wilcox_test_sort_small_2), names(p_value_wilcox_test_sort_small_3))
features_cluster  <-  unique(features_cluster)
result_clustering_sc  <- clustering_angular_distance(heteroplasmy_matrix_ci, allele_matrix_ci, cluster_ci, length(colnames(heteroplasmy_matrix_ci)), deepSplit_param = 1, minClusterSize_param = 15, 0.2, min_value = 0.001, index = index_ci, relevant_bases = features_cluster)


old_new_classification  <-result_clustering_sc[[1]]
dist_matrix_sc  <- result_clustering_sc[[2]]
top_dist  <- result_clustering_sc[[3]]
common_idx  <-  result_clustering_sc[[4]]

old_classification  <- as.vector(old_new_classification[, 1])
new_classification  <- as.vector(old_new_classification[, 2])
```


Comparison between the ground truth and the new partition obtained with supervised cluster analysis

```{r}
plot_heatmap( new_classification, old_classification, (dist_matrix_sc), cluster_columns = F, cluster_rows = F, "Euclidean distance")
```

### Unsupervised cluster analysis among cells based on allele frequency values

```{r}

result_clustering_sc  <-  clustering_angular_distance(heteroplasmy_matrix_ci, allele_matrix_ci, cluster_ci, length(colnames(heteroplasmy_matrix_ci)), deepSplit_param = 1, minClusterSize_param = 15, 0.2, min_value = 0.001, index = index_ci, relevant_bases = NULL)


old_new_classification  <- result_clustering_sc[[1]]
dist_matrix_sc  <-  result_clustering_sc[[2]]
top_dist  <-  result_clustering_sc[[3]]
common_idx  <-  result_clustering_sc[[4]]

old_classification  <-  as.vector(old_new_classification[, 1])
new_classification  <-  as.vector(old_new_classification[, 2])

```


Comparison between the ground truth and the new partition obtained with unsupervised cluster analysis

```{r}
plot_heatmap(new_classification, old_classification, (dist_matrix_sc), cluster_columns = F, cluster_rows = F, "Euclidean distance")

```


Below the bases selected for the unsupervised cluster analysis

```{r}
q  <-  list()
for ( i in 1:length(top_dist)){
p  <-  plot_heteroplasmy(top_dist[i], heteroplasmy_matrix_ci, cluster_ci, index_ci)
q  <-  list(q, p)
}
q


```


## Dataset GSE115214

In the second part of the vignette it is shown the analysis performed on the dataset with GEO accession number GSE115214 from [Ludwig et al,  2019](https://doi.org/10.1016/j.cell.2019.01.022) used in fig. 5.
Primary human cells( CD34+ hematopoietic stem and progenitor cells HSPCs) from two independent donors were processed with single cell RNA-seq (Smart-seq2)
 

```{r}
load(system.file("extdata", "name_cells_fig_5_all.Rda", package = "MitoHEAR"))
load(system.file("extdata", "name_cells_fig_5_all_analysis.Rda", package = "MitoHEAR"))

```


```{r}
path_meta_data <- system.file("extdata", "CD34_colonies_table.txt", package = "MitoHEAR")
cell_convert <- read.table(path_meta_data, header = T)
cell_old <- cell_convert$ID1
cell_new <- cell_convert$ID2

cell_old_summary <- rep(0, length(name_cells_fig_5_all))

for (i in 1:length(cell_old_summary)){
  
cell_old_summary[i] <- c(paste(strsplit(name_cells_fig_5_all, "_")[[i]][3:6], collapse = "_"))
}
change <- cell_old_summary[grep("_colonies", cell_old_summary)]
cell_old_summary[grep("_colonies", cell_old_summary)] <- substr(change, 1, nchar(change)-9)


```


We don't execute the function *get_raw_counts_allele* here and  we directly load his output. 
A command line implementation of the function *get_raw_counts_allele* is also available (see github README file for info).


```{r}
path_meta_data <- load(system.file("extdata", "output_SNP_mt_fig_5_new.Rda", package = "MitoHEAR"))
output_SNP_mt_fig_5 <- result
matrix_allele_counts <- output_SNP_mt_fig_5[[1]]
name_position_allele <- output_SNP_mt_fig_5[[2]]
name_position <- output_SNP_mt_fig_5[[3]]
```

```{r}
common_old <- intersect(cell_old, cell_old_summary)
row.names(matrix_allele_counts) <- cell_old_summary

matrix_allele_counts <- matrix_allele_counts[common_old, ]

meta_old <- data.frame(cell_old, cell_new)
row.names(meta_old) <- cell_old
meta_old <- meta_old[common_old, ]
new_small <- meta_old[, 2]

row.names(matrix_allele_counts) <- new_small

```


```{r}


donor_all <- rep(0, length(row.names(matrix_allele_counts)))
donor_all[grep("Donor1_", row.names(matrix_allele_counts))] <- "Donor_1"
donor_all[grep("Donor2_", row.names(matrix_allele_counts))] <- "Donor_2"


donor_1_2 <- get_heteroplasmy(matrix_allele_counts, name_position_allele, name_position, number_reads = 50, number_positions = 2000, filtering = 2, donor_all)

```


```{r}
sum_matrix <- donor_1_2[[1]]
sum_matrix_qc <- donor_1_2[[2]]
heteroplasmy_matrix_ci <- donor_1_2[[3]]
allele_matrix_ci <- donor_1_2[[4]]

cluster_ci <- donor_all[row.names(sum_matrix)%in%row.names(sum_matrix_qc)]

index_ci <- donor_1_2[[5]]
```

```{r}

name_position_allele_qc <- name_position_allele[name_position%in%colnames(sum_matrix_qc)]
name_position_qc <- name_position[name_position%in%colnames(sum_matrix_qc)]

```

```{r}
relevant_bases <- filter_bases(heteroplasmy_matrix_ci, min_heteroplasmy = 0.01, min_cells = 50, index_ci)
```


### Identification of most different bases according to heteroplasmy between donor1 and donor2

```{r}
p_value_wilcox_test_1 <- get_wilcox_test(heteroplasmy_matrix_ci[, relevant_bases], cluster_ci, "Donor_1", "Donor_2" , index_ci)


```


```{r}
p_value_wilcox_test_sort_1 <- sort(p_value_wilcox_test_1, decreasing = F)

p_value_wilcox_test_sort_small_1 <- p_value_wilcox_test_sort_1[p_value_wilcox_test_sort_1<0.05]

p_value_wilcox_test_sort_top <- p_value_wilcox_test_sort_small_1[1:5]

```

```{r}
q <- list()
for ( i in 1:length(p_value_wilcox_test_sort_top[1:2])){
p <- plot_heteroplasmy(names(p_value_wilcox_test_sort_top)[i], heteroplasmy_matrix_ci, cluster_ci, index_ci)+ggplot2::ggtitle(paste(names(p_value_wilcox_test_sort_top)[i], round(p_value_wilcox_test_sort_small_1[i], 4), sep = "-"))
q <- list(q, p)
}
q


q <- list()
for ( i in names(p_value_wilcox_test_sort_top[1:2])){
p <- plot_allele_frequency(i, heteroplasmy_matrix_ci, allele_matrix_ci, cluster_ci, name_position_qc, name_position_allele_qc, 5, index_ci)  
q <- list(q, p)
}
q
```


```{r}

heteroplasmy_matrix_ci_small <- heteroplasmy_matrix_ci[, relevant_bases]

allele_matrix_ci_small <- allele_matrix_ci[, name_position_qc%in%relevant_bases]

```


### Unsupervised cluster analysis among cells based on allele frequency values

```{r}
result_clustering_sc <- clustering_angular_distance(heteroplasmy_matrix_ci_small, allele_matrix_ci_small, cluster_ci, length(colnames(heteroplasmy_matrix_ci_small)), deepSplit_param = 0, minClusterSize_param = 100, 0.2, min_value = 0.001, index = index_ci, relevant_bases = NULL)


old_new_classification <- result_clustering_sc[[1]]
dist_matrix_sc <- result_clustering_sc[[2]]
top_dist <- result_clustering_sc[[3]]
common_idx <- result_clustering_sc[[4]]

old_classification <- as.vector(old_new_classification[, 1])
new_classification <- as.vector(old_new_classification[, 2])


```


The unsupervised cluster analysis divides cells in two groups that perfectly coincide with donor 1 and donor 2 

```{r }
plot_heatmap(new_classification, old_classification, (dist_matrix_sc), cluster_columns = F, cluster_rows = F, "Euclidean distance")
```

The final number of clusters obtained does not change with the number of top bases used (determined with the parameter *min_value* in the function *choose_features_clustering*.
The number in the clustree plot (1, 2, 3, 4) refers to the index of the vector min_frac. So in this case 1 refers to 0.1,  2 to 0.01 and so on.

```{r, eval = FALSE}
min_frac <- c(0.1, 0.01, 0.001, 0.0001)
choose_features_clustering(heteroplasmy_matrix_ci_small, allele_matrix_ci_small, cluster_ci, top_pos = length(colnames(heteroplasmy_matrix_ci_small)), deepSplit_param = 0, minClusterSize_param = 100, min_frac, 0.2, index = index_ci)
```


## Dataset GSE115214 only with donor 1

In the second part of the vignette it is shown the analysis performed on the dataset with GEO accession number GSE115214 [Ludwig et al, 2019](https://doi.org/10.1016/j.cell.2019.01.022) used in fig. 5.
Primary human cells( CD34+ hematopoietic stem and progenitor cells HSPCs) from two independent donors were processed with single cell RNA-seq (Smart-seq2). 
Below we focus only on some colonies from donor 1.

```{r}
donor_1 <- donor_all[donor_all == "Donor_1"]
```


```{r}
colony_1 <- rep(0, length(seq(101, 135)))
for (i in 1:length(seq(101, 135))){
  colony_1[i] <- paste0("C", seq(101, 135)[i], collapse = "")
}
colony_1_all <- rep(0, length(row.names(matrix_allele_counts)))
for (i in colony_1 ){
colony_1_all[grep(i, row.names(matrix_allele_counts))] <- i
}


cluster_ci <- colony_1_all[grep("Donor1_", row.names(matrix_allele_counts))]

only_col <- c("C101", "C103", "C107", "C109", "C112", "C114", "C116", "C118", "C120", "C122", "C124", "C132", "C135")

```

```{r}

matrix_allele_counts_1 <- matrix_allele_counts[grep("Donor1_", row.names(matrix_allele_counts)), ]


donor_1 <- get_heteroplasmy(matrix_allele_counts_1[cluster_ci%in%only_col, ], name_position_allele, name_position, number_reads = 50, number_positions = 2000, filtering = 2, donor_1[cluster_ci%in%only_col])
```


```{r}
sum_matrix <- donor_1[[1]]
sum_matrix_qc <- donor_1[[2]]
heteroplasmy_matrix_ci <- donor_1[[3]]
allele_matrix_ci <- donor_1[[4]]

cluster_ci <- cluster_ci[cluster_ci%in%only_col]

cluster_ci <- cluster_ci[row.names(sum_matrix)%in%row.names(sum_matrix_qc)]

index_ci <- donor_1[[5]]
```


```{r}

name_position_allele_qc <- name_position_allele[name_position%in%colnames(sum_matrix_qc)]
name_position_qc <- name_position[name_position%in%colnames(sum_matrix_qc)]

```


```{r}
relevant_bases <- filter_bases(heteroplasmy_matrix_ci, min_heteroplasmy = 0.01, min_cells = 10, index_ci)
```

```{r}
heteroplasmy_matrix_ci_small <- heteroplasmy_matrix_ci[, relevant_bases]

allele_matrix_ci_small <- allele_matrix_ci[, name_position_qc%in%relevant_bases]

```

### Unsupervised cluster analysis among cells based on allele frequency values

```{r}
result_clustering_sc <- clustering_angular_distance(heteroplasmy_matrix_ci_small, allele_matrix_ci_small, cluster_ci, length(row.names(heteroplasmy_matrix_ci_small)), deepSplit_param = 0, minClusterSize_param = 10, 0.2, min_value = 0.001, index = index_ci, relevant_bases = NULL)


old_new_classification <- result_clustering_sc[[1]]
dist_matrix_sc <- result_clustering_sc[[2]]
top_dist <- result_clustering_sc[[3]]
common_idx <- result_clustering_sc[[4]]

old_classification <- as.vector(old_new_classification[, 1])
new_classification <- as.vector(old_new_classification[, 2])

```


Comparison between the ground truth and the new partition obtained with unsupervised cluster analysis

```{r }
plot_heatmap(new_classification, old_classification, (dist_matrix_sc), cluster_columns = F, cluster_rows = F, "Euclidean distance")
```


Below the top 4 bases selected for the unsupervised cluster analysis.

```{r}
q <- list()
for ( i in 1:length(top_dist[1:4])){
p <- plot_heteroplasmy(top_dist[i], heteroplasmy_matrix_ci, cluster_ci, index_ci)
q <- list(q, p)
}
q


```
```{r}
utils::sessionInfo()
```