import cell2cell as c2c
import scanpy as sc

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import warnings
warnings.filterwarnings('ignore')


import os

data_folder = '/data/'
directory = os.fsencode(data_folder)

output_folder = '/results/Brain-ASD/'
if not os.path.isdir(output_folder):
    os.mkdir(output_folder)


# rnaseq = sc.read_text(data_folder + '/Brain-ASD/exprMatrix.tsv.gz')
# rnaseq = rnaseq.transpose()
# meta = pd.read_csv(data_folder + '/Brain-ASD/meta.tsv', sep='\t', index_col=0)
# rnaseq.obs = rnaseq.obs.join(meta)
# rnaseq.write_h5ad(data_folder + '/Brain-ASD/Brain_ASD.h5ad')


rnaseq = sc.read_h5ad(data_folder + '/Brain-ASD/Brain_ASD.h5ad')


rnaseq

AnnData object with n_obs × n_vars = 104559 × 36501
    obs: 'cluster', 'sample', 'individual', 'region', 'age', 'sex', 'diagnosis', 'Capbatch', 'Seqbatch', 'post-mortem interval (hours)', 'RNA Integrity Number', 'genes', 'UMIs', 'RNA mitochondr. percent', 'RNA ribosomal percent'


brain_area = 'PFC'


rnaseq = rnaseq[rnaseq.obs.region == brain_area]


rnaseq

View of AnnData object with n_obs × n_vars = 62166 × 36501
    obs: 'cluster', 'sample', 'individual', 'region', 'age', 'sex', 'diagnosis', 'Capbatch', 'Seqbatch', 'post-mortem interval (hours)', 'RNA Integrity Number', 'genes', 'UMIs', 'RNA mitochondr. percent', 'RNA ribosomal percent'


lr_pairs = pd.read_csv('https://raw.githubusercontent.com/LewisLabUCSD/Ligand-Receptor-Pairs/master/Human/Human-2020-Jin-LR-pairs.csv')
lr_pairs = lr_pairs.astype(str)


lr_pairs.head(2)


# interaction columns:
int_columns = ('ligand_symbol', 'receptor_symbol')


lr_pairs = c2c.preprocessing.ppi.remove_ppi_bidirectionality(ppi_data=lr_pairs, 
                                                             interaction_columns=int_columns
                                                             )

Removing bidirectionality of PPI network


lr_pairs.shape

(1988, 17)


ppi_functions = dict()

for idx, row in lr_pairs.iterrows():
    ppi_label = row[int_columns[0]] + '^' + row[int_columns[1]]
    ppi_functions[ppi_label] = row['annotation']


ensembl_symbol = dict()

for idx, row in lr_pairs.iterrows():
    ensembl_symbol[row['interaction_ensembl']] = row['interaction_symbol']


context_dict = dict()

for diag, df in rnaseq.obs.groupby('diagnosis'):
    for donor in df['sample'].unique():
        context_dict[donor] = diag


context_dict

{'5144_PFC': 'ASD',
 '5278_PFC': 'ASD',
 '5294_BA9': 'ASD',
 '5403_PFC': 'ASD',
 '5419_PFC': 'ASD',
 '5531_BA9': 'ASD',
 '5565_BA9': 'ASD',
 '5841_BA9': 'ASD',
 '5864_BA9': 'ASD',
 '5939_BA9': 'ASD',
 '5945_PFC': 'ASD',
 '5978_BA9': 'ASD',
 '6033_BA9': 'ASD',
 '4341_BA46': 'Control',
 '5387_BA9': 'Control',
 '5408_PFC_Nova': 'Control',
 '5538_PFC_Nova': 'Control',
 '5577_BA9': 'Control',
 '5879_PFC_Nova': 'Control',
 '5893_PFC': 'Control',
 '5936_PFC_Nova': 'Control',
 '5958_BA9': 'Control',
 '5976_BA9': 'Control'}


context_names = list(context_dict.keys())


rnaseq_matrices = []

for context in tqdm(context_names):
        meta_context = rnaseq.obs.loc[rnaseq.obs['sample'] == context]
        cells = list(meta_context.index)
        
        meta_context.index.name = 'barcode'
        tmp_data = rnaseq[cells]
        # Keep genes in each sample with at least 4 single cells expressing it
        sc.pp.filter_genes(tmp_data, min_cells=4)
        
        # Aggregate gene expression of single cells into cell types
        exp_df = c2c.preprocessing.aggregate_single_cells(rnaseq_data=tmp_data.to_df(),
                                                          metadata=meta_context,
                                                          barcode_col='barcode',
                                                          celltype_col='cluster',
                                                          method='nn_cell_fraction',
                                                         )
        
        rnaseq_matrices.append(exp_df)

Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.


# Change gene names to ensembl
matrices = []
for rna in rnaseq_matrices:
    tmp = rna.copy()
    tmp.index = [idx.split('|')[0] for idx in rna.index]
    matrices.append(tmp)


genes = set()
for i, rna in enumerate(rnaseq_matrices):
    if i == 0:
        genes = genes.union(set(rna.index))
    else:
        genes.intersection(set(rna.index))


len(genes)

24298


tensor = c2c.tensor.InteractionTensor(rnaseq_matrices=matrices,
                                      ppi_data=lr_pairs,
                                      context_names=context_names,
                                      how='inner',
                                      complex_sep='&',
                                      interaction_columns=('ligand_ensembl', 'receptor_ensembl'),
                                      communication_score='expression_mean',
                                     )

Getting expression values for protein complexes
Building tensor for the provided context


tensor.tensor.shape

(23, 749, 16, 16)


# Put LR pair names from ensembl to symbols
tensor.order_names[1] = [ensembl_symbol[lr] for lr in tensor.order_names[1]]


meta_tf = c2c.tensor.generate_tensor_metadata(interaction_tensor=tensor,
                                              metadata_dicts=[context_dict, ppi_functions, None, None],
                                              fill_with_order_elements=True
                                             )


elbow, error = tensor.elbow_rank_selection(upper_rank=25,
                                           runs=1,
                                           init='random',
                                           automatic_elbow=False,
                                           random_state=888,
                                          )

_ = plt.plot(*error[5], 'ro')
plt.savefig(output_folder + 'Elbow.svg', dpi=300, bbox_inches='tight')


tensor.compute_tensor_factorization(rank=6,
                                    init='svd',
                                    random_state=888
                                   )


# Color palettes for 
cmaps = ['viridis', 'Dark2_r', 'tab20', 'tab20']


factors, axes = c2c.plotting.tensor_factors_plot(interaction_tensor=tensor,
                                                 metadata = meta_tf,
                                                 sample_col='Element',
                                                 group_col='Category',
                                                 meta_cmaps=cmaps,
                                                 fontsize=14,
                                                 filename=output_folder + 'Tensor-Factorization.svg'
                                                )


for i in range(tensor.rank):
        print(tensor.get_top_factor_elements('Ligand-Receptor Pairs', 'Factor {}'.format(i+1), 5))
        print('')

NEGR1^NEGR1    0.245580
NRXN3^NLGN1    0.236890
NRXN1^NLGN1    0.234088
CNTN1^NRCAM    0.224620
NCAM1^NCAM1    0.217748
Name: Factor 1, dtype: float64

LAMA2^SV2B    0.169643
LAMA4^SV2B    0.165446
LAMB2^SV2B    0.162802
LAMA1^SV2B    0.159729
LAMA3^SV2B    0.158929
Name: Factor 2, dtype: float64

NRG1^ERBB2&ERBB4    0.130066
NRG1^ERBB2&ERBB3    0.129851
NRG1^ERBB3          0.129237
NRG1^ERBB4          0.119872
EFNA5^EPHB2         0.114581
Name: Factor 3, dtype: float64

PTN^ALK        0.203844
PTPRM^PTPRM    0.174730
HBEGF^ERBB4    0.162865
NRG3^ERBB4     0.162113
BTC^ERBB4      0.159788
Name: Factor 4, dtype: float64

NRG3^ERBB4     0.252508
NCAM1^NCAM2    0.234108
CADM1^CADM1    0.215948
NCAM1^NCAM1    0.211533
CNTN1^NRCAM    0.180136
Name: Factor 5, dtype: float64

VEGFA^FLT1     0.184078
PTPRM^PTPRM    0.182046
VEGFB^FLT1     0.172513
NRXN1^NLGN2    0.129017
PTN^NCL        0.128965
Name: Factor 6, dtype: float64


tensor.export_factor_loadings(output_folder + 'Loadings.xlsx')

Loadings of the tensor factorization were successfully saved into /results/Brain-ASD/Loadings.xlsx


boxplot = c2c.plotting.factor_plot.context_boxplot(tensor.factors['Contexts'],
                                                   metadict=context_dict,
                                                   nrows=2,
                                                   figsize=(12, 6),
                                                   group_order=['ASD', 'Control'],
                                                   statistical_test='t-test_ind',
                                                   pval_correction='bonferroni',
                                                   cmap='viridis',
                                                   verbose=False,
                                                   filename=output_folder + 'Sample-Boxplots.svg'
                                                  )


networks = c2c.analysis.tensor_downstream.get_factor_specific_ccc_networks(tensor.factors, 
                                                                           sender_label='Sender Cells',
                                                                           receiver_label='Receiver Cells')


network_by_factors = c2c.analysis.tensor_downstream.flatten_factor_ccc_networks(networks, orderby='receivers')


_ = plt.hist(network_by_factors.values.flatten(), bins = 50)


# To consider only cells with high potential to interact in each factor
cci_threshold = 0.075


import networkx as nx


fig, axes = plt.subplots(1, 2, figsize=(24, 12))

for i, factor in enumerate(['Factor 3', 'Factor 4']):
    ax = axes.flatten()[i]
    df = networks[factor].gt(cci_threshold).astype(int).multiply(networks[factor])
    #Networkx Directed Network
    G = nx.convert_matrix.from_pandas_adjacency(df, create_using=nx.DiGraph())

    # Layout for visualization - Node positions
    pos = nx.spring_layout(G, 
                           k=1.,
                           seed=888
                           )

    # Weights for edge thickness
    weights = np.asarray([G.edges[e]['weight'] for e in G.edges()])

    # Visualize network
    nx.draw_networkx_edges(G,
                           pos,
                           alpha=0.25,
                           arrowsize=30,
                           width=weights*50,
                           edge_color="m",
                           connectionstyle="arc3,rad=-0.3",
                           ax=ax
                          )
    nx.draw_networkx_nodes(G,
                           pos,
                           node_color="#210070", 
                           node_size=1000,
                           alpha=0.9,
                           ax=ax
                          )
    label_options = {"ec": "k", "fc": "white", "alpha": 0.7}
    _ = nx.draw_networkx_labels(G, 
                                {k : v + np.array([0.05, -0.1]) for k,v in pos.items()},
                                font_size=20,
                                bbox=label_options,
                                ax=ax
                               )
    
    ax.set_frame_on(False)
    
plt.tight_layout()
plt.savefig(output_folder + 'Factor-Networks.svg', dpi=300, bbox_inches='tight')


cci_cm = c2c.plotting.loading_clustermap(network_by_factors,
                                         use_zscore=True, 
                                         loading_threshold=cci_threshold,
                                         row_cluster=True,
                                         figsize=(20, 9),
                                         tick_fontsize=10,
                                         filename=output_folder + 'Clustermap-Factor-specific-CCI.svg'
                                        )


lr_cm = c2c.plotting.loading_clustermap(tensor.factors['Ligand-Receptor Pairs'],
                                        loading_threshold=0.1, # To consider only top LRs
                                        use_zscore=True,
                                        figsize=(16, 9),
                                        filename=output_folder + 'Clustermap-Factor-specific-LRs.svg'
                                       )


# Generate color by ASD condition for each sample
color_dict = c2c.plotting.aesthetics.get_colors_from_labels(meta_tf[0]['Category'].unique(), 
                                                            cmap='viridis')
col_colors = c2c.plotting.aesthetics.map_colors_to_metadata(metadata=meta_tf[0], 
                                                            sample_col='Element',
                                                            group_col='Category',
                                                            colors=color_dict
                                                           )
col_colors = col_colors.to_frame()


# Clinical Scores
clinical_scores = pd.read_excel(data_folder + '/Brain-ASD/Clinical_Score.xlsx', index_col=0)
score_colors = c2c.plotting.aesthetics.get_colors_from_labels(clinical_scores['Clinical-Score'].dropna().astype(str).values, 
                                                              cmap='Oranges')


# Generate color by clinical score for each sample
def color_score(x):
    if x != 'NA':
        return score_colors[str(x)] 
    else: return (0.7, 0.7, 0.7)

col_colors['Clinical-Score'] = [clinical_scores.loc[int(idx.split('_')[0]), 'Clinical-Score'] for idx in col_colors.index]
col_colors = col_colors.fillna('NA')
col_colors['Clinical-Score'] = col_colors['Clinical-Score'].apply(lambda x: color_score(x))


sample_cm = c2c.plotting.loading_clustermap(tensor.factors['Contexts'],
                                            use_zscore=True,
                                            col_colors=col_colors,
                                            figsize=(16, 6),
                                            dendrogram_ratio=0.3,
                                            cbar_fontsize=12,
                                            tick_fontsize=14,
                                            #cmap='viridis'
                                           )

labels = [item.get_text().split('_')[0] for item in sample_cm.ax_heatmap.xaxis.get_majorticklabels()]
sample_cm.ax_heatmap.set_xticklabels(labels)

#Insert legend
plt.sca(sample_cm.ax_heatmap)
c2c.plotting.aesthetics.generate_legend(color_dict=color_dict,
                                        bbox_to_anchor=(1.1, 0.5),
                                        title='Category'
                                       )

plt.savefig(output_folder + 'Clustermap-Factor-specific-Samples.svg',
            dpi=300, bbox_inches='tight')


sns.palplot(sns.color_palette("Oranges", 10))
plt.tick_params(axis=u'both', which=u'both',length=0)
plt.savefig(output_folder + 'Clinical_Score_colormap.svg',
            dpi=300, bbox_inches='tight')


clusters = c2c.clustering.get_clusters_from_linkage(sample_cm.dendrogram_col.linkage, 4)


sample_labels = [idx.split('_')[0] for idx in tensor.factors['Contexts'].index]
cluster_map = {sample_labels[i] : k for k, v in clusters.items() for i in v}


clinical_scores['Cluster'] = [cluster_map[str(idx)] if (str(idx) in sample_labels) \
                              else 'NA' for idx in clinical_scores.index]


comp_scores = clinical_scores.loc[(clinical_scores['Cluster'] == 3) | (clinical_scores['Cluster'] == 4), 
                                  ['Clinical-Score', 'Cluster']].dropna()


comp_scores.groupby('Cluster').describe()


import gseapy
from statsmodels.stats.multitest import fdrcorrection

Creating directory /root/.config/bioservices


lr_set = c2c.io.load_variable_with_pickle(data_folder + '/LR-Pairs/CellChat-LR-KEGG-set.pkl')


weight = 1
min_size = 15
permutations = 999


loadings = tensor.factors['Ligand-Receptor Pairs'].reset_index()


for factor in loadings.columns[1:]:
    test = loadings[['index', factor]]
    test.columns = [0, 1]
    test = test.sort_values(by=1, ascending=False)
    test.reset_index(drop=True, inplace=True)

    gseapy.prerank(rnk=test,
                   gene_sets=lr_set,
                   min_size=min_size,
                   weighted_score_type=weight,
                   processes=6,
                   permutation_num=permutations, # reduce number to speed up testing
                   outdir=output_folder + '/GSEA/' + factor, format='png', seed=6)


pvals = []
terms = []
factors = []
nes = []
for factor in loadings.columns[1:]:
    p_report = pd.read_csv(output_folder + '/GSEA/' + factor + '/gseapy.prerank.gene_sets.report.csv')
    pval = p_report['pval'].values.tolist()
    pvals.extend(pval)
    terms.extend(p_report.Term.values.tolist())
    factors.extend([factor] * len(pval))
    nes.extend(p_report['nes'].values.tolist())
pval_df = pd.DataFrame(np.asarray([factors, terms, nes, pvals]).T, columns=['Factor', 'Term', 'NES', 'P-value'])
pval_df = pval_df.loc[pval_df['P-value'] != 'nan']
pval_df['P-value'] = pd.to_numeric(pval_df['P-value'])
pval_df['P-value'] = pval_df['P-value'].replace(0., 1./(permutations+1))
pval_df['NES'] = pd.to_numeric(pval_df['NES'])


pval_df['Adj. P-value'] = fdrcorrection(pval_df['P-value'].values, alpha=0.05)[1]


pval_df.loc[(pval_df['Adj. P-value'] < 0.05) & (pval_df['NES'] > 0.)]


pval_df.loc[(pval_df['Adj. P-value'] < 0.05) & (pval_df['NES'] < 0.)]


pval_df.to_excel(output_folder + '/GSEA-Adj-Pvals.xlsx')


pval_pivot = pval_df.pivot(index="Term", columns="Factor", values="Adj. P-value").fillna(1.)
scores = pval_df.pivot(index="Term", columns="Factor", values="NES").fillna(0)


with sns.axes_style("darkgrid"):
    dotplot = c2c.plotting.pval_plot.generate_dot_plot(pval_df=pval_pivot,
                                                       score_df=scores,
                                                       significance=0.05,
                                                       xlabel='',
                                                       ylabel='KEGG Pathways',
                                                       cbar_title='NES',
                                                       cmap='PuOr',
                                                       figsize=(9,16),
                                                       label_size=20,
                                                       title_size=20,
                                                       tick_size=14,
                                                       filename=output_folder + '/GSEA-Dotplot.svg'
                                                      )

	interaction_name	pathway_name	ligand	receptor	agonist	antagonist	co_A_receptor	co_I_receptor	evidence	annotation	interaction_name_2	ligand_symbol	receptor_symbol	ligand_ensembl	receptor_ensembl	interaction_symbol	interaction_ensembl
0	TGFB1_TGFBR1_TGFBR2	TGFb	TGFB1	TGFbR1_R2	TGFb agonist	TGFb antagonist	nan	TGFb inhibition receptor	KEGG: hsa04350	Secreted Signaling	TGFB1 - (TGFBR1+TGFBR2)	TGFB1	TGFBR1&TGFBR2	ENSG00000105329	ENSG00000106799&ENSG00000163513	TGFB1^TGFBR1&TGFBR2	ENSG00000105329^ENSG00000106799&ENSG00000163513
1	TGFB2_TGFBR1_TGFBR2	TGFb	TGFB2	TGFbR1_R2	TGFb agonist	TGFb antagonist	nan	TGFb inhibition receptor	KEGG: hsa04350	Secreted Signaling	TGFB2 - (TGFBR1+TGFBR2)	TGFB2	TGFBR1&TGFBR2	ENSG00000092969	ENSG00000106799&ENSG00000163513	TGFB2^TGFBR1&TGFBR2	ENSG00000092969^ENSG00000106799&ENSG00000163513

Ligand	Receptor
Protein A	Protein B
Protein B	Protein A

	Clinical-Score
	count	mean	std	min	25%	50%	75%	max
Cluster
3	5.0	25.00	7.483315	17.0	21.0	25.0	25.00	37.0
4	4.0	22.75	5.123475	16.0	20.5	23.5	25.75	28.0

	Factor	Term	NES	P-value	Adj. P-value
1	Factor 1	KEGG_ERBB_SIGNALING_PATHWAY	1.454242	0.001000	0.008709
2	Factor 1	KEGG_CELL_ADHESION_MOLECULES_CAMS	1.398443	0.001000	0.008709
3	Factor 1	KEGG_AXON_GUIDANCE	1.241130	0.006006	0.029029
17	Factor 2	KEGG_AXON_GUIDANCE	1.563928	0.001000	0.008709
18	Factor 2	KEGG_CELL_ADHESION_MOLECULES_CAMS	1.309822	0.005005	0.029029
31	Factor 3	KEGG_ERBB_SIGNALING_PATHWAY	1.486924	0.003003	0.018662
32	Factor 3	KEGG_AXON_GUIDANCE	1.388006	0.001001	0.008709
34	Factor 3	KEGG_ECM_RECEPTOR_INTERACTION	1.327261	0.001000	0.008709
35	Factor 3	KEGG_SMALL_CELL_LUNG_CANCER	1.335054	0.008008	0.033176
36	Factor 3	KEGG_FOCAL_ADHESION	1.279068	0.001001	0.008709
45	Factor 4	KEGG_ERBB_SIGNALING_PATHWAY	1.741709	0.001000	0.008709
46	Factor 4	KEGG_CELL_ADHESION_MOLECULES_CAMS	1.287627	0.003003	0.018662
47	Factor 4	KEGG_AXON_GUIDANCE	1.254568	0.007007	0.032085
61	Factor 5	KEGG_CELL_ADHESION_MOLECULES_CAMS	1.422309	0.001000	0.008709
62	Factor 5	KEGG_MAPK_SIGNALING_PATHWAY	1.310704	0.008008	0.033176
63	Factor 5	KEGG_REGULATION_OF_ACTIN_CYTOSKELETON	1.312981	0.002002	0.015834
64	Factor 5	KEGG_ERBB_SIGNALING_PATHWAY	1.326159	0.006006	0.029029
75	Factor 6	KEGG_CELL_ADHESION_MOLECULES_CAMS	1.297625	0.003003	0.018662
78	Factor 6	KEGG_FOCAL_ADHESION	1.190575	0.006006	0.029029

Load Data¶

Data Preprocessing¶

LR pairs¶

RNA-seq¶

Tensor-cell2cell Analysis¶

Run Analysis¶

Downstream Analyses¶

Boxplots to compare group of samples¶

Generate factor-specific networks¶

Clustermaps¶

GSEA¶

Visualization¶

	Factor	Term	NES	P-value	Adj. P-value
0	Factor 1	KEGG_NOTCH_SIGNALING_PATHWAY	-1.380117	0.001	0.008709
15	Factor 2	KEGG_NOTCH_SIGNALING_PATHWAY	-2.423423	0.001	0.008709