Commit 94b25a28 authored by joweb106's avatar joweb106
Browse files

added filter_rename_loom.py and its results

parent 6d2eae55
import numpy as np
import pandas as pd
import loompy
new_names = pd.read_csv("../data/EPDC.SCTransform.integrated_minus9_13_6_CellNames.csv").values[:,0]
new_names_renamed = np.array(["52_NGS_MI"+n.split("-")[1]+"_EPDC:"+n.split("-")[0]+"x" for n in new_names])
# input
ds = loompy.connect("../data/52_NGS_EPDC_merged.loom")
# output
dsout = loompy.new("../data/52_NGS_EPDC_merged_filtered_renamed.loom")
# write chosen cells into new loom file
mask = np.array([e in new_names_renamed for e in ds.ca["CellID"]])
cells = np.where(mask)[0]
for (ix, selection, view) in ds.scan(items=cells, axis=1, key="Accession"):
dsout.add_columns(view.layers, col_attrs=view.ca, row_attrs=view.ra)
# rename cells in new loom file
dsout.ca["CellID"] = np.array(new_names[[np.where(e == new_names_renamed)[0][0] for e in dsout.ca["CellID"]]])
dsout.close()
ds.close()
......@@ -26,6 +26,7 @@ def save_state(vlm, name):
def set_clusters_from_10x(vlm, path_to_csv):
print("set clusters from 10x reanalyze ... ", end = "")
# read the clusters from the csv file
# and rearange them to fit the cell ordering of the loom file
......@@ -33,10 +34,19 @@ def set_clusters_from_10x(vlm, path_to_csv):
clusters = np.char.mod('%d', cluster_data[:,1]) # elements to strings
names = vlm.ca["CellID"]
#new_names = pd.read_csv("../data/EPDC.SCTransform.integrated_minus9_13_6_CellNames.csv").values[:,0]
mask = np.array([e in names for e in cluster_data[:,0]])
"""
names_cl = np.array(["52_NGS_MI"+n.split("-")[1]+"_EPDC:"+n.split("-")[0]+"x" for n in cluster_data[:,0]])
ind = np.array([np.where(names_cl == n) for n in names])
clusters = clusters[ind].reshape((clusters.shape[0]))
"""
ind = np.array([np.where(cluster_data[:,0][mask] == n) for n in names])
clusters = clusters[mask]
clusters = clusters[ind].reshape((clusters.shape[0]))
# colors of the clusters
......@@ -97,25 +107,39 @@ def plot_PCA(vlm, fname):
plt.savefig(fname)
print("Finished")
def filter_data(vlm):
print("filter dataset ... ", end = "")
def filter_data(vlm, verbose = False):
if verbose:
print("filter dataset")
genes, cells = vlm.S.shape
# remove the cells with extremely low unspliced detection
vlm.filter_cells(bool_array=vlm.initial_Ucell_size > np.percentile(vlm.initial_Ucell_size, 0.5))
if verbose:
print(cells, "->", vlm.S.shape[1], "cells")
# select the genes that are expressed above a threshold of total number of molecules in any of the clusters
vlm.score_detection_levels(min_expr_counts=40, min_cells_express=30)
vlm.filter_genes(by_detection_levels=True)
print("Finished")
if verbose:
print(genes, "->", vlm.S.shape[0], "genes ...",end="")
print("Finished")
def feature_selection(vlm):
print("feature selection (score_cv_vs_mean) ... ", end = "")
def feature_selection(vlm, verbose = False):
if verbose:
print("feature selection (score_cv_vs_mean)")
genes = vlm.S.shape[0]
# remove the cells with extremely low unspliced detection
vlm.score_cv_vs_mean(3000,plot=True, max_expr_avg=35)
#vlm.score_cv_vs_mean(3000,plot=True, max_expr_avg=35)
vlm.score_cv_vs_mean(6000,plot=True, max_expr_avg=35)
vlm.filter_genes(by_cv_vs_mean=True)
plt.savefig('../plots/feature_selection.png')
print("Finished")
if verbose:
print(genes, "->", vlm.S.shape[0], "genes ...",end="")
print("Finished")
def predict_velocity(vlm):
print("predict velocity ... ", end = "")
......@@ -126,22 +150,37 @@ def predict_velocity(vlm):
print("Finished")
def print_time(start):
done = time.time()
elapsed = done - start
print("time:",elapsed)
###########################
# Start a new analysis
start = time.time()
vlm = load_data() #load loom file
###########################
total_start = time.time()
vlm = load_data(fname="../data/52_NGS_EPDC_merged_filtered_renamed.loom") #load loom file
#vlm.plot_fractions("test.png")
#plt.savefig("test2.png")
set_clusters_from_10x(vlm, "../data/52_NGS_EPDC_reanalyze/outs/analysis/clustering/graphclust/clusters.csv")
filter_data(vlm)
feature_selection(vlm)
filter_data(vlm, verbose = True)
feature_selection(vlm, verbose = True)
normalize(vlm)
print("Perform PCA ... ", end = "")
start = time.time()
vlm.perform_PCA()
print("Finished")
print_time(start)
#print("Plot PCA ... ", end = "")
#plot_PCA(vlm, '../plots/PCA.png')
......@@ -164,19 +203,11 @@ predict_velocity(vlm)
# TSNE
Tstart = time.time()
print("Calculate TSNE ... ", end = "")
bh_tsne = TSNE(n_jobs=16)
vlm.ts = bh_tsne.fit_transform(vlm.pcs[:, :25])
print("Finished")
Tdone = time.time()
Telapsed = Tdone - Tstart
print("TSNE time:",Telapsed)
print("Plot TSNE ... ", end = "")
plt.figure(None,(8,8))
......@@ -187,7 +218,7 @@ print("Finished")
# project on vlm.ts by calling estimate_transition_prob.
print("Projection into embedding ... ", end = "")
vlm.estimate_transition_prob(hidim="Sx_sz", embed="ts", transform="sqrt", psc=1, n_neighbors=3500, knn_random=True, sampled_fraction=0.5)
vlm.estimate_transition_prob(hidim="Sx_sz", embed="ts", transform="sqrt", psc=1, n_neighbors=3500, knn_random=True, sampled_fraction=0.5,n_jobs=16)
vlm.calculate_embedding_shift(sigma_corr = 0.2, expression_scaling=True)
print("Finished")
......@@ -238,19 +269,27 @@ print("Finished")
# initial divide by mean
# orgiginal min_mass=24
print("Plot velocity grid arrows ... ", end = "")
plt.figure(None,(20,10))
#plt.figure(None,(20,10))
plt.figure(None,(10,10))
vlm.plot_grid_arrows(quiver_scale=0.48,
scatter_kwargs_dict={"alpha":0.35, "lw":0.35, "edgecolor":"0.4", "s":38, "rasterized":True}, min_mass=7, angles='xy', scale_units='xy',
headaxislength=2.75, headlength=5, headwidth=4.8, minlength=1.5,
plot_random=True, scale_type="absolute")
plot_random=False, scale_type="absolute")
#vlm.plot_grid_arrows(quiver_scale=0.48,
# scatter_kwargs_dict={"alpha":0.35, "lw":0.35, "edgecolor":"0.4", "s":38, "rasterized":True}, min_mass=7, angles='xy', scale_units='xy',
# headaxislength=2.75, headlength=5, headwidth=4.8, minlength=1.5,
# plot_random=True, scale_type="absolute")
plt.savefig("../plots/velocity_gridarrows.pdf")
print("Finished")
print("Final cell count",vlm.S.shape[0])
print("tSNE shape",vlm.ts.shape)
print("Overall: ", end = "")
print_time(total_start) # 4659 for no feature sel
done = time.time()
elapsed = done - start
print("overall time:",elapsed)
# 31053, 15539 shape of vlm.S at the beginning
# 15445 cells in the end
# This does not work sadly
#save_state(vlm, "../data/after_TSNE.hdf5")
......
This diff is collapsed.
plots/TSNE.png

242 KB | W: | H:

plots/TSNE.png

261 KB | W: | H:

plots/TSNE.png
plots/TSNE.png
plots/TSNE.png
plots/TSNE.png
  • 2-up
  • Swipe
  • Onion skin
plots/feature_selection.png

64.1 KB | W: | H:

plots/feature_selection.png

62 KB | W: | H:

plots/feature_selection.png
plots/feature_selection.png
plots/feature_selection.png
plots/feature_selection.png
  • 2-up
  • Swipe
  • Onion skin
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment