import cell2cell as c2c
import scanpy as sc
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import os
data_folder = '/data/'
directory = os.fsencode(data_folder)
output_folder = '/results/Brain-ASD/'
if not os.path.isdir(output_folder):
os.mkdir(output_folder)
RNA-seq data
Expression matrix was obtained from: https://cells.ucsc.edu/autism/exprMatrix.tsv.gz
Values are 10x UMI counts from cellranger, log2-transformed
Metadata was obtained from: https://cells.ucsc.edu/autism/meta.tsv
# rnaseq = sc.read_text(data_folder + '/Brain-ASD/exprMatrix.tsv.gz')
# rnaseq = rnaseq.transpose()
# meta = pd.read_csv(data_folder + '/Brain-ASD/meta.tsv', sep='\t', index_col=0)
# rnaseq.obs = rnaseq.obs.join(meta)
# rnaseq.write_h5ad(data_folder + '/Brain-ASD/Brain_ASD.h5ad')
rnaseq = sc.read_h5ad(data_folder + '/Brain-ASD/Brain_ASD.h5ad')
rnaseq
AnnData object with n_obs × n_vars = 104559 × 36501 obs: 'cluster', 'sample', 'individual', 'region', 'age', 'sex', 'diagnosis', 'Capbatch', 'Seqbatch', 'post-mortem interval (hours)', 'RNA Integrity Number', 'genes', 'UMIs', 'RNA mitochondr. percent', 'RNA ribosomal percent'
brain_area = 'PFC'
rnaseq = rnaseq[rnaseq.obs.region == brain_area]
rnaseq
View of AnnData object with n_obs × n_vars = 62166 × 36501 obs: 'cluster', 'sample', 'individual', 'region', 'age', 'sex', 'diagnosis', 'Capbatch', 'Seqbatch', 'post-mortem interval (hours)', 'RNA Integrity Number', 'genes', 'UMIs', 'RNA mitochondr. percent', 'RNA ribosomal percent'
Ligand-Receptor pairs
lr_pairs = pd.read_csv('https://raw.githubusercontent.com/LewisLabUCSD/Ligand-Receptor-Pairs/master/Human/Human-2020-Jin-LR-pairs.csv')
lr_pairs = lr_pairs.astype(str)
lr_pairs.head(2)
interaction_name | pathway_name | ligand | receptor | agonist | antagonist | co_A_receptor | co_I_receptor | evidence | annotation | interaction_name_2 | ligand_symbol | receptor_symbol | ligand_ensembl | receptor_ensembl | interaction_symbol | interaction_ensembl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TGFB1_TGFBR1_TGFBR2 | TGFb | TGFB1 | TGFbR1_R2 | TGFb agonist | TGFb antagonist | nan | TGFb inhibition receptor | KEGG: hsa04350 | Secreted Signaling | TGFB1 - (TGFBR1+TGFBR2) | TGFB1 | TGFBR1&TGFBR2 | ENSG00000105329 | ENSG00000106799&ENSG00000163513 | TGFB1^TGFBR1&TGFBR2 | ENSG00000105329^ENSG00000106799&ENSG00000163513 |
1 | TGFB2_TGFBR1_TGFBR2 | TGFb | TGFB2 | TGFbR1_R2 | TGFb agonist | TGFb antagonist | nan | TGFb inhibition receptor | KEGG: hsa04350 | Secreted Signaling | TGFB2 - (TGFBR1+TGFBR2) | TGFB2 | TGFBR1&TGFBR2 | ENSG00000092969 | ENSG00000106799&ENSG00000163513 | TGFB2^TGFBR1&TGFBR2 | ENSG00000092969^ENSG00000106799&ENSG00000163513 |
# interaction columns:
int_columns = ('ligand_symbol', 'receptor_symbol')
Remove bidirectionality in the list of ligand-receptor pairs. That is, remove repeated interactions where both interactions are the same but in different order:
From this list:
Ligand | Receptor |
---|---|
Protein A | Protein B |
Protein B | Protein A |
We will have:
Ligand | Receptor |
---|---|
Protein A | Protein B |
lr_pairs = c2c.preprocessing.ppi.remove_ppi_bidirectionality(ppi_data=lr_pairs,
interaction_columns=int_columns
)
Removing bidirectionality of PPI network
lr_pairs.shape
(1988, 17)
Generate a dictionary with function info for each LR pairs. Keys are LIGAND_NAME^RECEPTOR_NAME and values are the function in the annotation column in the dataframe containing ligand-receptor pairs.
ppi_functions = dict()
for idx, row in lr_pairs.iterrows():
ppi_label = row[int_columns[0]] + '^' + row[int_columns[1]]
ppi_functions[ppi_label] = row['annotation']
Convert interactions from ensembl to symbol
ensembl_symbol = dict()
for idx, row in lr_pairs.iterrows():
ensembl_symbol[row['interaction_ensembl']] = row['interaction_symbol']
Organize data to create tensor
First, generate a dictionary indicating what condition is associated to each sample
context_dict = dict()
for diag, df in rnaseq.obs.groupby('diagnosis'):
for donor in df['sample'].unique():
context_dict[donor] = diag
context_dict
{'5144_PFC': 'ASD', '5278_PFC': 'ASD', '5294_BA9': 'ASD', '5403_PFC': 'ASD', '5419_PFC': 'ASD', '5531_BA9': 'ASD', '5565_BA9': 'ASD', '5841_BA9': 'ASD', '5864_BA9': 'ASD', '5939_BA9': 'ASD', '5945_PFC': 'ASD', '5978_BA9': 'ASD', '6033_BA9': 'ASD', '4341_BA46': 'Control', '5387_BA9': 'Control', '5408_PFC_Nova': 'Control', '5538_PFC_Nova': 'Control', '5577_BA9': 'Control', '5879_PFC_Nova': 'Control', '5893_PFC': 'Control', '5936_PFC_Nova': 'Control', '5958_BA9': 'Control', '5976_BA9': 'Control'}
context_names = list(context_dict.keys())
Generate list of RNA-seq data containing all contexts (samples)
rnaseq_matrices = []
for context in tqdm(context_names):
meta_context = rnaseq.obs.loc[rnaseq.obs['sample'] == context]
cells = list(meta_context.index)
meta_context.index.name = 'barcode'
tmp_data = rnaseq[cells]
# Keep genes in each sample with at least 4 single cells expressing it
sc.pp.filter_genes(tmp_data, min_cells=4)
# Aggregate gene expression of single cells into cell types
exp_df = c2c.preprocessing.aggregate_single_cells(rnaseq_data=tmp_data.to_df(),
metadata=meta_context,
barcode_col='barcode',
celltype_col='cluster',
method='nn_cell_fraction',
)
rnaseq_matrices.append(exp_df)
Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying. Trying to set attribute `.var` of view, copying.
# Change gene names to ensembl
matrices = []
for rna in rnaseq_matrices:
tmp = rna.copy()
tmp.index = [idx.split('|')[0] for idx in rna.index]
matrices.append(tmp)
genes = set()
for i, rna in enumerate(rnaseq_matrices):
if i == 0:
genes = genes.union(set(rna.index))
else:
genes.intersection(set(rna.index))
len(genes)
24298
Build 4D-Communication Tensor
how='inner'
is used to keep only cell types and genes that are across all contexts.
complex_sep='&'
is used to specify that the list of ligand-receptor pairs contains protein complexes and that subunits are separated by '&'. If the list does not have complexes, use complex_sep=None
instead.
tensor = c2c.tensor.InteractionTensor(rnaseq_matrices=matrices,
ppi_data=lr_pairs,
context_names=context_names,
how='inner',
complex_sep='&',
interaction_columns=('ligand_ensembl', 'receptor_ensembl'),
communication_score='expression_mean',
)
Getting expression values for protein complexes Building tensor for the provided context
tensor.tensor.shape
(23, 749, 16, 16)
# Put LR pair names from ensembl to symbols
tensor.order_names[1] = [ensembl_symbol[lr] for lr in tensor.order_names[1]]
Generate a list containing metadata for each tensor order/dimension - Later used for coloring factor plots
meta_tf = c2c.tensor.generate_tensor_metadata(interaction_tensor=tensor,
metadata_dicts=[context_dict, ppi_functions, None, None],
fill_with_order_elements=True
)
Elbow analysis for selecting the number of factors to use
elbow, error = tensor.elbow_rank_selection(upper_rank=25,
runs=1,
init='random',
automatic_elbow=False,
random_state=888,
)
_ = plt.plot(*error[5], 'ro')
plt.savefig(output_folder + 'Elbow.svg', dpi=300, bbox_inches='tight')
Perform tensor factorization
tensor.compute_tensor_factorization(rank=6,
init='svd',
random_state=888
)
Plot factor loadings
# Color palettes for
cmaps = ['viridis', 'Dark2_r', 'tab20', 'tab20']
factors, axes = c2c.plotting.tensor_factors_plot(interaction_tensor=tensor,
metadata = meta_tf,
sample_col='Element',
group_col='Category',
meta_cmaps=cmaps,
fontsize=14,
filename=output_folder + 'Tensor-Factorization.svg'
)
Top-5 LR pairs
for i in range(tensor.rank):
print(tensor.get_top_factor_elements('Ligand-Receptor Pairs', 'Factor {}'.format(i+1), 5))
print('')
NEGR1^NEGR1 0.245580 NRXN3^NLGN1 0.236890 NRXN1^NLGN1 0.234088 CNTN1^NRCAM 0.224620 NCAM1^NCAM1 0.217748 Name: Factor 1, dtype: float64 LAMA2^SV2B 0.169643 LAMA4^SV2B 0.165446 LAMB2^SV2B 0.162802 LAMA1^SV2B 0.159729 LAMA3^SV2B 0.158929 Name: Factor 2, dtype: float64 NRG1^ERBB2&ERBB4 0.130066 NRG1^ERBB2&ERBB3 0.129851 NRG1^ERBB3 0.129237 NRG1^ERBB4 0.119872 EFNA5^EPHB2 0.114581 Name: Factor 3, dtype: float64 PTN^ALK 0.203844 PTPRM^PTPRM 0.174730 HBEGF^ERBB4 0.162865 NRG3^ERBB4 0.162113 BTC^ERBB4 0.159788 Name: Factor 4, dtype: float64 NRG3^ERBB4 0.252508 NCAM1^NCAM2 0.234108 CADM1^CADM1 0.215948 NCAM1^NCAM1 0.211533 CNTN1^NRCAM 0.180136 Name: Factor 5, dtype: float64 VEGFA^FLT1 0.184078 PTPRM^PTPRM 0.182046 VEGFB^FLT1 0.172513 NRXN1^NLGN2 0.129017 PTN^NCL 0.128965 Name: Factor 6, dtype: float64
Export Loadings
tensor.export_factor_loadings(output_folder + 'Loadings.xlsx')
Loadings of the tensor factorization were successfully saved into /results/Brain-ASD/Loadings.xlsx
boxplot = c2c.plotting.factor_plot.context_boxplot(tensor.factors['Contexts'],
metadict=context_dict,
nrows=2,
figsize=(12, 6),
group_order=['ASD', 'Control'],
statistical_test='t-test_ind',
pval_correction='bonferroni',
cmap='viridis',
verbose=False,
filename=output_folder + 'Sample-Boxplots.svg'
)
networks = c2c.analysis.tensor_downstream.get_factor_specific_ccc_networks(tensor.factors,
sender_label='Sender Cells',
receiver_label='Receiver Cells')
Generate matrix of cell-cell pairs by factors
network_by_factors = c2c.analysis.tensor_downstream.flatten_factor_ccc_networks(networks, orderby='receivers')
Select cell-cell pairs with high potential of interaction
_ = plt.hist(network_by_factors.values.flatten(), bins = 50)
# To consider only cells with high potential to interact in each factor
cci_threshold = 0.075
Visualize networks of factors with significant differences between groups
import networkx as nx
fig, axes = plt.subplots(1, 2, figsize=(24, 12))
for i, factor in enumerate(['Factor 3', 'Factor 4']):
ax = axes.flatten()[i]
df = networks[factor].gt(cci_threshold).astype(int).multiply(networks[factor])
#Networkx Directed Network
G = nx.convert_matrix.from_pandas_adjacency(df, create_using=nx.DiGraph())
# Layout for visualization - Node positions
pos = nx.spring_layout(G,
k=1.,
seed=888
)
# Weights for edge thickness
weights = np.asarray([G.edges[e]['weight'] for e in G.edges()])
# Visualize network
nx.draw_networkx_edges(G,
pos,
alpha=0.25,
arrowsize=30,
width=weights*50,
edge_color="m",
connectionstyle="arc3,rad=-0.3",
ax=ax
)
nx.draw_networkx_nodes(G,
pos,
node_color="#210070",
node_size=1000,
alpha=0.9,
ax=ax
)
label_options = {"ec": "k", "fc": "white", "alpha": 0.7}
_ = nx.draw_networkx_labels(G,
{k : v + np.array([0.05, -0.1]) for k,v in pos.items()},
font_size=20,
bbox=label_options,
ax=ax
)
ax.set_frame_on(False)
plt.tight_layout()
plt.savefig(output_folder + 'Factor-Networks.svg', dpi=300, bbox_inches='tight')
Cluster cell-cell pairs by their potential across factors
cci_cm = c2c.plotting.loading_clustermap(network_by_factors,
use_zscore=True,
loading_threshold=cci_threshold,
row_cluster=True,
figsize=(20, 9),
tick_fontsize=10,
filename=output_folder + 'Clustermap-Factor-specific-CCI.svg'
)
Cluster LR pairs by their importance across factors
lr_cm = c2c.plotting.loading_clustermap(tensor.factors['Ligand-Receptor Pairs'],
loading_threshold=0.1, # To consider only top LRs
use_zscore=True,
figsize=(16, 9),
filename=output_folder + 'Clustermap-Factor-specific-LRs.svg'
)
Cluster samples by their importance across factors
# Generate color by ASD condition for each sample
color_dict = c2c.plotting.aesthetics.get_colors_from_labels(meta_tf[0]['Category'].unique(),
cmap='viridis')
col_colors = c2c.plotting.aesthetics.map_colors_to_metadata(metadata=meta_tf[0],
sample_col='Element',
group_col='Category',
colors=color_dict
)
col_colors = col_colors.to_frame()
# Clinical Scores
clinical_scores = pd.read_excel(data_folder + '/Brain-ASD/Clinical_Score.xlsx', index_col=0)
score_colors = c2c.plotting.aesthetics.get_colors_from_labels(clinical_scores['Clinical-Score'].dropna().astype(str).values,
cmap='Oranges')
# Generate color by clinical score for each sample
def color_score(x):
if x != 'NA':
return score_colors[str(x)]
else: return (0.7, 0.7, 0.7)
col_colors['Clinical-Score'] = [clinical_scores.loc[int(idx.split('_')[0]), 'Clinical-Score'] for idx in col_colors.index]
col_colors = col_colors.fillna('NA')
col_colors['Clinical-Score'] = col_colors['Clinical-Score'].apply(lambda x: color_score(x))
sample_cm = c2c.plotting.loading_clustermap(tensor.factors['Contexts'],
use_zscore=True,
col_colors=col_colors,
figsize=(16, 6),
dendrogram_ratio=0.3,
cbar_fontsize=12,
tick_fontsize=14,
#cmap='viridis'
)
labels = [item.get_text().split('_')[0] for item in sample_cm.ax_heatmap.xaxis.get_majorticklabels()]
sample_cm.ax_heatmap.set_xticklabels(labels)
#Insert legend
plt.sca(sample_cm.ax_heatmap)
c2c.plotting.aesthetics.generate_legend(color_dict=color_dict,
bbox_to_anchor=(1.1, 0.5),
title='Category'
)
plt.savefig(output_folder + 'Clustermap-Factor-specific-Samples.svg',
dpi=300, bbox_inches='tight')
sns.palplot(sns.color_palette("Oranges", 10))
plt.tick_params(axis=u'both', which=u'both',length=0)
plt.savefig(output_folder + 'Clinical_Score_colormap.svg',
dpi=300, bbox_inches='tight')
Evaluate Clinical Scores in each cluster
clusters = c2c.clustering.get_clusters_from_linkage(sample_cm.dendrogram_col.linkage, 4)
sample_labels = [idx.split('_')[0] for idx in tensor.factors['Contexts'].index]
cluster_map = {sample_labels[i] : k for k, v in clusters.items() for i in v}
clinical_scores['Cluster'] = [cluster_map[str(idx)] if (str(idx) in sample_labels) \
else 'NA' for idx in clinical_scores.index]
comp_scores = clinical_scores.loc[(clinical_scores['Cluster'] == 3) | (clinical_scores['Cluster'] == 4),
['Clinical-Score', 'Cluster']].dropna()
comp_scores.groupby('Cluster').describe()
Clinical-Score | ||||||||
---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | |
Cluster | ||||||||
3 | 5.0 | 25.00 | 7.483315 | 17.0 | 21.0 | 25.0 | 25.00 | 37.0 |
4 | 4.0 | 22.75 | 5.123475 | 16.0 | 20.5 | 23.5 | 25.75 | 28.0 |
import gseapy
from statsmodels.stats.multitest import fdrcorrection
Creating directory /root/.config/bioservices
lr_set = c2c.io.load_variable_with_pickle(data_folder + '/LR-Pairs/CellChat-LR-KEGG-set.pkl')
weight = 1
min_size = 15
permutations = 999
loadings = tensor.factors['Ligand-Receptor Pairs'].reset_index()
Pathways of both ligand and receptor
for factor in loadings.columns[1:]:
test = loadings[['index', factor]]
test.columns = [0, 1]
test = test.sort_values(by=1, ascending=False)
test.reset_index(drop=True, inplace=True)
gseapy.prerank(rnk=test,
gene_sets=lr_set,
min_size=min_size,
weighted_score_type=weight,
processes=6,
permutation_num=permutations, # reduce number to speed up testing
outdir=output_folder + '/GSEA/' + factor, format='png', seed=6)
Adjust P-values across all factors (GSEA only adjusts within factor)
pvals = []
terms = []
factors = []
nes = []
for factor in loadings.columns[1:]:
p_report = pd.read_csv(output_folder + '/GSEA/' + factor + '/gseapy.prerank.gene_sets.report.csv')
pval = p_report['pval'].values.tolist()
pvals.extend(pval)
terms.extend(p_report.Term.values.tolist())
factors.extend([factor] * len(pval))
nes.extend(p_report['nes'].values.tolist())
pval_df = pd.DataFrame(np.asarray([factors, terms, nes, pvals]).T, columns=['Factor', 'Term', 'NES', 'P-value'])
pval_df = pval_df.loc[pval_df['P-value'] != 'nan']
pval_df['P-value'] = pd.to_numeric(pval_df['P-value'])
pval_df['P-value'] = pval_df['P-value'].replace(0., 1./(permutations+1))
pval_df['NES'] = pd.to_numeric(pval_df['NES'])
pval_df['Adj. P-value'] = fdrcorrection(pval_df['P-value'].values, alpha=0.05)[1]
Enriched LR pairs
pval_df.loc[(pval_df['Adj. P-value'] < 0.05) & (pval_df['NES'] > 0.)]
Factor | Term | NES | P-value | Adj. P-value | |
---|---|---|---|---|---|
1 | Factor 1 | KEGG_ERBB_SIGNALING_PATHWAY | 1.454242 | 0.001000 | 0.008709 |
2 | Factor 1 | KEGG_CELL_ADHESION_MOLECULES_CAMS | 1.398443 | 0.001000 | 0.008709 |
3 | Factor 1 | KEGG_AXON_GUIDANCE | 1.241130 | 0.006006 | 0.029029 |
17 | Factor 2 | KEGG_AXON_GUIDANCE | 1.563928 | 0.001000 | 0.008709 |
18 | Factor 2 | KEGG_CELL_ADHESION_MOLECULES_CAMS | 1.309822 | 0.005005 | 0.029029 |
31 | Factor 3 | KEGG_ERBB_SIGNALING_PATHWAY | 1.486924 | 0.003003 | 0.018662 |
32 | Factor 3 | KEGG_AXON_GUIDANCE | 1.388006 | 0.001001 | 0.008709 |
34 | Factor 3 | KEGG_ECM_RECEPTOR_INTERACTION | 1.327261 | 0.001000 | 0.008709 |
35 | Factor 3 | KEGG_SMALL_CELL_LUNG_CANCER | 1.335054 | 0.008008 | 0.033176 |
36 | Factor 3 | KEGG_FOCAL_ADHESION | 1.279068 | 0.001001 | 0.008709 |
45 | Factor 4 | KEGG_ERBB_SIGNALING_PATHWAY | 1.741709 | 0.001000 | 0.008709 |
46 | Factor 4 | KEGG_CELL_ADHESION_MOLECULES_CAMS | 1.287627 | 0.003003 | 0.018662 |
47 | Factor 4 | KEGG_AXON_GUIDANCE | 1.254568 | 0.007007 | 0.032085 |
61 | Factor 5 | KEGG_CELL_ADHESION_MOLECULES_CAMS | 1.422309 | 0.001000 | 0.008709 |
62 | Factor 5 | KEGG_MAPK_SIGNALING_PATHWAY | 1.310704 | 0.008008 | 0.033176 |
63 | Factor 5 | KEGG_REGULATION_OF_ACTIN_CYTOSKELETON | 1.312981 | 0.002002 | 0.015834 |
64 | Factor 5 | KEGG_ERBB_SIGNALING_PATHWAY | 1.326159 | 0.006006 | 0.029029 |
75 | Factor 6 | KEGG_CELL_ADHESION_MOLECULES_CAMS | 1.297625 | 0.003003 | 0.018662 |
78 | Factor 6 | KEGG_FOCAL_ADHESION | 1.190575 | 0.006006 | 0.029029 |
Depleted LR pairs
pval_df.loc[(pval_df['Adj. P-value'] < 0.05) & (pval_df['NES'] < 0.)]
Factor | Term | NES | P-value | Adj. P-value | |
---|---|---|---|---|---|
0 | Factor 1 | KEGG_NOTCH_SIGNALING_PATHWAY | -1.380117 | 0.001 | 0.008709 |
15 | Factor 2 | KEGG_NOTCH_SIGNALING_PATHWAY | -2.423423 | 0.001 | 0.008709 |
pval_df.to_excel(output_folder + '/GSEA-Adj-Pvals.xlsx')
DataFrame
pval_pivot = pval_df.pivot(index="Term", columns="Factor", values="Adj. P-value").fillna(1.)
scores = pval_df.pivot(index="Term", columns="Factor", values="NES").fillna(0)
Dot Plot
with sns.axes_style("darkgrid"):
dotplot = c2c.plotting.pval_plot.generate_dot_plot(pval_df=pval_pivot,
score_df=scores,
significance=0.05,
xlabel='',
ylabel='KEGG Pathways',
cbar_title='NES',
cmap='PuOr',
figsize=(9,16),
label_size=20,
title_size=20,
tick_size=14,
filename=output_folder + '/GSEA-Dotplot.svg'
)