import os
os.getcwd()'/sfs/gpfs/tardis/project/iprime_storage/myles_kim/_data/LutgensLab/MM_Aotas_scRNA'
data preparation and results. more detail @github.com/wdnlotm/bulk2sc_GMVAE
import os
os.getcwd()'/sfs/gpfs/tardis/project/iprime_storage/myles_kim/_data/LutgensLab/MM_Aotas_scRNA'
adata = sc.read_10x_mtx(path2mtx, var_names='gene_symbols', make_unique=True)
adataAnnData object with n_obs × n_vars = 10360 × 16327
var: 'gene_ids', 'feature_types'
sc.pl.violin(
adata,
["pct_counts_mt", "pct_counts_in_top_20_genes"], #"pct_counts_ribo", "pct_counts_hb",
jitter=0.4,
multi_panel=True,
save="vlplot_save.png"
)p0 = sns.displot(adata.obs["n_genes_by_counts"], bins=100, kde=False)def is_outlier(adata, metric: str, nmads: int):
M = adata.obs[metric]
outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
np.median(M) + nmads * median_abs_deviation(M) < M
)
return outlier
adata.obs["outlier"] = (
is_outlier(adata, "log1p_total_counts", 5)
| is_outlier(adata, "log1p_n_genes_by_counts", 5)
| is_outlier(adata, "pct_counts_in_top_20_genes", 5)
)
adata.obs.outlier.value_counts()
adata.obs["mt_outlier"] = (adata.obs["pct_counts_mt"] > 15.9)
adata.obs.mt_outlier.value_counts()
print(f"Total number of cells: {adata.n_obs}")
adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()
print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")Total number of cells: 10360
Number of cells after filtering of low quality cells: 10003
p1 = sns.displot(adata.obs["total_counts"], bins=100, kde=False)
p2 = sc.pl.violin(
adata,
["n_genes_by_counts", "total_counts", "pct_counts_mt"],
jitter=0.4,
multi_panel=True,
)
p3 = sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")adataAnnData object with n_obs × n_vars = 10003 × 16322
obs: 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
predicted_doublet
False 10002
True 1
Name: count, dtype: int64
predicted_doublet
False 10002
Name: count, dtype: int64
/tmp/ipykernel_461757/2031553131.py:3: ImplicitModificationWarning: Setting element `.layers['counts']` of view, initializing view as actual.
adata.layers["counts"] = adata.X.copy()
AnnData object with n_obs × n_vars = 10002 × 16322
obs: 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'doublet_score', 'predicted_doublet'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
uns: 'scrublet', 'log1p'
layers: 'counts'
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
p1 = sns.histplot(adata.obs["total_counts"], bins=100, kde=False, ax=axes[0])
axes[0].set_title("Total counts")
p2 = sns.histplot(adata.X.sum(1), bins=100, kde=False, ax=axes[1])
axes[1].set_title("Shifted logarithm")
plt.show()sc.tl.pca(adata)# sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)# adatasc.pp.neighbors(adata, n_neighbors=20, n_pcs=20)
sc.tl.leiden(adata, flavor="igraph", n_iterations=100, resolution=0.5)
sc.tl.umap(adata, min_dist=0.5)
sc.pl.umap(adata, color=["leiden"], alpha=0.7, palette=color25)adata.obs["leiden"].value_counts().sort_index()leiden
0 1044
1 796
2 852
3 1998
4 739
5 774
6 717
7 140
8 270
9 390
10 528
11 272
12 648
13 87
14 269
15 302
16 176
Name: count, dtype: int64
sc.tl.rank_genes_groups(
adata, groupby="leiden", method="wilcoxon", key_added="dea_leiden"
)fig = sc.pl.rank_genes_groups_dotplot(
adata, groupby="leiden", standard_scale="var", n_genes=7, key="dea_leiden", # save="manual_qc_dotplot.png"
)gene_table = pd.DataFrame(adata.uns['dea_leiden']['names'])
list_of_list = []
for ii in range(7):
list_of_list.append(gene_table[f'{ii}'][0:6].to_list())markers = list(set([item for sublist in list_of_list for item in sublist]))markers = [gene for gene in markers if gene[0:3]!="RPS"]
markers = [gene for gene in markers if gene[0:3]!="RPL"]
markers = [gene for gene in markers if gene[0:2]!="Hb"]
markers = [gene for gene in markers if gene[0:3]!="MT-"]sc.pl.heatmap(adata, markers, groupby='leiden', swap_axes=True, save="gene_heatmap.png")sc.pl.umap(
adata,
color=markers,
vmin=0, vmax="p99",
sort_order=False, frameon=True,
cmap="Reds",
# save="_lung_gene_umap.png",
)TAPE model is adoptedThis is achieved using the TAPE model, specially its package scTAPE. However, to make more hyperparameters accessible through the interface, a slightly modified version of the package has been developed. It is available here.
Component III, generator, starts with Component II and connects to the decoder (right half) of Component I.