import os
os.getcwd()
'/sfs/gpfs/tardis/project/iprime_storage/myles_kim/_data/LutgensLab/MM_Aotas_scRNA'
data preparation and results. more detail @github.com/wdnlotm/bulk2sc_GMVAE
import os
os.getcwd()
'/sfs/gpfs/tardis/project/iprime_storage/myles_kim/_data/LutgensLab/MM_Aotas_scRNA'
= sc.read_10x_mtx(path2mtx, var_names='gene_symbols', make_unique=True)
adata adata
AnnData object with n_obs × n_vars = 10360 × 16327
var: 'gene_ids', 'feature_types'
sc.pl.violin(
adata,"pct_counts_mt", "pct_counts_in_top_20_genes"], #"pct_counts_ribo", "pct_counts_hb",
[=0.4,
jitter=True,
multi_panel="vlplot_save.png"
save )
= sns.displot(adata.obs["n_genes_by_counts"], bins=100, kde=False) p0
def is_outlier(adata, metric: str, nmads: int):
= adata.obs[metric]
M = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
outlier + nmads * median_abs_deviation(M) < M
np.median(M)
)return outlier
"outlier"] = (
adata.obs["log1p_total_counts", 5)
is_outlier(adata, | is_outlier(adata, "log1p_n_genes_by_counts", 5)
| is_outlier(adata, "pct_counts_in_top_20_genes", 5)
)
adata.obs.outlier.value_counts()
"mt_outlier"] = (adata.obs["pct_counts_mt"] > 15.9)
adata.obs[
adata.obs.mt_outlier.value_counts()
print(f"Total number of cells: {adata.n_obs}")
= adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()
adata
print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")
Total number of cells: 10360
Number of cells after filtering of low quality cells: 10003
= sns.displot(adata.obs["total_counts"], bins=100, kde=False)
p1 = sc.pl.violin(
p2
adata,"n_genes_by_counts", "total_counts", "pct_counts_mt"],
[=0.4,
jitter=True,
multi_panel
)= sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt") p3
adata
AnnData object with n_obs × n_vars = 10003 × 16322
obs: 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
predicted_doublet
False 10002
True 1
Name: count, dtype: int64
predicted_doublet
False 10002
Name: count, dtype: int64
/tmp/ipykernel_461757/2031553131.py:3: ImplicitModificationWarning: Setting element `.layers['counts']` of view, initializing view as actual.
adata.layers["counts"] = adata.X.copy()
AnnData object with n_obs × n_vars = 10002 × 16322
obs: 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'doublet_score', 'predicted_doublet'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
uns: 'scrublet', 'log1p'
layers: 'counts'
= plt.subplots(1, 2, figsize=(10, 5))
fig, axes = sns.histplot(adata.obs["total_counts"], bins=100, kde=False, ax=axes[0])
p1 0].set_title("Total counts")
axes[= sns.histplot(adata.X.sum(1), bins=100, kde=False, ax=axes[1])
p2 1].set_title("Shifted logarithm")
axes[ plt.show()
sc.tl.pca(adata)
# sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)
# adata
=20, n_pcs=20)
sc.pp.neighbors(adata, n_neighbors="igraph", n_iterations=100, resolution=0.5)
sc.tl.leiden(adata, flavor=0.5)
sc.tl.umap(adata, min_dist=["leiden"], alpha=0.7, palette=color25) sc.pl.umap(adata, color
"leiden"].value_counts().sort_index() adata.obs[
leiden
0 1044
1 796
2 852
3 1998
4 739
5 774
6 717
7 140
8 270
9 390
10 528
11 272
12 648
13 87
14 269
15 302
16 176
Name: count, dtype: int64
sc.tl.rank_genes_groups(="leiden", method="wilcoxon", key_added="dea_leiden"
adata, groupby )
= sc.pl.rank_genes_groups_dotplot(
fig ="leiden", standard_scale="var", n_genes=7, key="dea_leiden", # save="manual_qc_dotplot.png"
adata, groupby )
= pd.DataFrame(adata.uns['dea_leiden']['names'])
gene_table
= []
list_of_list for ii in range(7):
f'{ii}'][0:6].to_list()) list_of_list.append(gene_table[
= list(set([item for sublist in list_of_list for item in sublist])) markers
= [gene for gene in markers if gene[0:3]!="RPS"]
markers = [gene for gene in markers if gene[0:3]!="RPL"]
markers = [gene for gene in markers if gene[0:2]!="Hb"]
markers = [gene for gene in markers if gene[0:3]!="MT-"] markers
='leiden', swap_axes=True, save="gene_heatmap.png") sc.pl.heatmap(adata, markers, groupby
sc.pl.umap(
adata, =markers,
color=0, vmax="p99",
vmin=False, frameon=True,
sort_order="Reds",
cmap# save="_lung_gene_umap.png",
)
TAPE
model is adoptedThis is achieved using the TAPE model, specially its package scTAPE
. However, to make more hyperparameters accessible through the interface, a slightly modified version of the package has been developed. It is available here.
Component III, generator, starts with Component II and connects to the decoder (right half) of Component I.