import pandas as pd
import cell2cell as c2c
from tqdm.auto import tqdm
import pickle
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
%matplotlib inline
c2c.__version__
'0.5.9'
import warnings
warnings.filterwarnings('ignore')
Directories
import os
data_folder = '/data/COVID-19/'
directory = os.fsencode(data_folder)
output_folder = '/results/COVID-19/'
if not os.path.isdir(output_folder):
os.mkdir(output_folder)
RNA-seq
Open aggregated expression matrices from previous notebook. It contains cell-type-based gene expressions as the fraction of single cells with non-zero counts.
data = dict()
for file in os.listdir(directory):
filename = os.fsdecode(file)
if filename.endswith(".CellFract.csv.gz"):
print(filename)
basename = os.path.basename(filename)
sample = basename.split('.')[0]
data[sample] = pd.read_csv(data_folder + filename, index_col=0)
else:
continue
C149.CellFract.csv.gz C142.CellFract.csv.gz C141.CellFract.csv.gz C146.CellFract.csv.gz C52.CellFract.csv.gz C148.CellFract.csv.gz C143.CellFract.csv.gz C100.CellFract.csv.gz C145.CellFract.csv.gz C152.CellFract.csv.gz C144.CellFract.csv.gz C51.CellFract.csv.gz
Metadata
meta_df = pd.read_csv(data_folder + '/metadata.txt', sep='\t')
meta_df = meta_df.sort_values(['sample_new', 'sample'])
meta_df.head()
ID | sample | sample_new | group | disease | hasnCoV | cluster | celltype | |
---|---|---|---|---|---|---|---|---|
0 | AAACCTGAGACACTAA_1 | C51 | HC1 | HC | N | N | 3 | Macrophages |
1 | AAACCTGAGGAGTACC_1 | C51 | HC1 | HC | N | N | 3 | Macrophages |
2 | AAACCTGAGGATATAC_1 | C51 | HC1 | HC | N | N | 3 | Macrophages |
3 | AAACCTGAGGTCATCT_1 | C51 | HC1 | HC | N | N | 3 | Macrophages |
4 | AAACCTGCACGGATAG_1 | C51 | HC1 | HC | N | N | 5 | Macrophages |
Generate metadata for samples
sample_meta = meta_df[['sample', 'sample_new', 'disease']].drop_duplicates().reset_index(drop=True)
sample_meta = sample_meta.loc[sample_meta['sample'].isin(list(data.keys()))]
sample_meta
sample | sample_new | disease | |
---|---|---|---|
0 | C51 | HC1 | N |
1 | C52 | HC2 | N |
2 | C100 | HC3 | N |
4 | C141 | M1 | Y |
5 | C142 | M2 | Y |
6 | C144 | M3 | Y |
7 | C145 | S1 | Y |
8 | C143 | S2 | Y |
9 | C146 | S3 | Y |
10 | C148 | S4 | Y |
11 | C149 | S5 | Y |
12 | C152 | S6 | Y |
def meta_disease(x):
if 'HC' in x:
return 'Control'
elif 'M' in x:
return 'Moderate COVID-19'
elif 'S' in x:
return 'Severe COVID-19'
else:
return 'NA'
sample_disease = dict()
for idx, row in sample_meta.iterrows():
sample_disease[row['sample_new']] = meta_disease(row['sample_new'])
Ligand-Receptor Pairs
lr_pairs = pd.read_csv('/data/LR-Pairs/Human-2020-Jin-LR-pairs.csv')
lr_pairs.head(2)
interaction_name | pathway_name | ligand | receptor | agonist | antagonist | co_A_receptor | co_I_receptor | evidence | annotation | interaction_name_2 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | TGFB1_TGFBR1_TGFBR2 | TGFb | TGFB1 | TGFbR1_R2 | TGFb agonist | TGFb antagonist | NaN | TGFb inhibition receptor | KEGG: hsa04350 | Secreted Signaling | TGFB1 - (TGFBR1+TGFBR2) |
1 | TGFB2_TGFBR1_TGFBR2 | TGFb | TGFB2 | TGFbR1_R2 | TGFb agonist | TGFb antagonist | NaN | TGFb inhibition receptor | KEGG: hsa04350 | Secreted Signaling | TGFB2 - (TGFBR1+TGFBR2) |
Change annotations of protein complexes: Use all capital letters for names of proteins and separate subunits by "&".
For example, complex composed by ProteinX and ProteinY would be PROTEINX&PROTEINY
# Change complex annotations
lr_pairs['ligand2'] = lr_pairs.interaction_name_2.apply(lambda x: x.split(' - ')[0].upper())
lr_pairs['receptor2'] = lr_pairs.interaction_name_2.apply(lambda x: x.split(' - ')[1].upper() \
.replace('(', '').replace(')', '').replace('+', '&'))
lr_pairs['c2c_interaction'] = lr_pairs.apply(lambda row: row['ligand2'] + '^' + row['receptor2'], axis=1)
Specify columns containing ligands and receptors
int_cols = ('ligand2', 'receptor2')
Remove bidirectionality in the list of ligand-receptor pairs. That is, remove repeated interactions where both interactions are the same but in different order:
From this list:
Ligand | Receptor |
---|---|
Protein A | Protein B |
Protein B | Protein A |
We will have:
Ligand | Receptor |
---|---|
Protein A | Protein B |
lr_pairs = c2c.preprocessing.ppi.remove_ppi_bidirectionality(ppi_data=lr_pairs,
interaction_columns=int_cols
)
Removing bidirectionality of PPI network
lr_pairs.shape
(1991, 14)
Generate a dictionary with function info for each LR pairs. Keys are LIGAND_NAME^RECEPTOR_NAME and values are the function in the annotation column in the dataframe containing ligand-receptor pairs.
ppi_functions = dict()
for idx, row in lr_pairs.iterrows():
ppi_label = row[int_cols[0]] + '^' + row[int_cols[1]]
ppi_functions[ppi_label] = row['annotation']
Generate list of gene expression matrices, sorted by severity group. Expression here is the cell fraction of non-zero counts for each cell type
rnaseq_matrices = []
for k, row in sample_meta.set_index('sample').iterrows():
df = data[k]
rnaseq_matrices.append(df.fillna(0))
Context Names for each of the rnaseq_matrices. Positions in this list are exactly the same as the one above
context_labels = sample_meta['sample_new'].tolist()
Build 4D-Communication Tensor
how='inner'
is used to keep only cell types that are across all samples/patients.
complex_sep='&'
is used to specify that the list of ligand-receptor pairs contains protein complexes and that subunits are separated by '&'. If the list does not have complexes, use complex_sep=None
instead.
tensor = c2c.tensor.InteractionTensor(rnaseq_matrices=rnaseq_matrices,
ppi_data=lr_pairs,
context_names=context_labels,
how='inner',
complex_sep='&',
interaction_columns=int_cols,
communication_score='expression_mean'
)
Getting expression values for protein complexes Building tensor for the provided context
tensor.tensor.shape
(12, 189, 6, 6)
Generate a list containing metadata for each tensor order/dimension - Later used for coloring factor plots
meta_tf = c2c.tensor.generate_tensor_metadata(interaction_tensor=tensor,
metadata_dicts=[sample_disease, ppi_functions, None, None],
fill_with_order_elements=True
)
Elbow analysis for selecting Rank for Tensor-Factorization
fig, error = tensor.elbow_rank_selection(upper_rank=25,
runs=1,
init='svd', # If you get a Memory error, replace by 'random'
automatic_elbow=False,
random_state=888)
_ = plt.plot(*error[9], 'ro')
plt.savefig(output_folder + '/BALF-Elbow.svg', dpi=300, bbox_inches='tight')
Perform tensor factorization
r = 10
tensor.compute_tensor_factorization(rank=r,
init='svd',
random_state=888,
var_ordered_factors=False
)
Export Tensor and Metadata
c2c.io.export_variable_with_pickle(tensor, '/results/COVID-19/Tensor-BALF.pkl')
/results/COVID-19/Tensor-BALF.pkl was correctly saved.
c2c.io.export_variable_with_pickle(meta_tf, '/results/COVID-19/Metadata-BALF.pkl')
/results/COVID-19/Metadata-BALF.pkl was correctly saved.
Plot Factors
Colors for metadatas
cmaps = ['plasma', 'Dark2_r', 'tab20', 'tab20']
fig, axes = c2c.plotting.tensor_factors_plot(interaction_tensor=tensor,
order_labels=['Samples', 'Ligand-Receptor Pairs', 'Sender Cells', 'Receiver Cells'],
metadata = meta_tf,
sample_col='Element',
group_col='Category',
meta_cmaps=cmaps,
fontsize=14,
filename=output_folder + '/BALF-TensorFactorization.svg'
)
Top-5 LR pairs for each factor
for i in range(tensor.rank):
print(tensor.get_top_factor_elements('Ligand-Receptor Pairs', 'Factor {}'.format(i+1), 5))
print('')
APP^CD74 0.267985 MDK^NCL 0.245786 MIF^CD74&CD44 0.230703 MDK^ITGA4&ITGB1 0.227003 CD99^CD99 0.220466 Name: Factor 1, dtype: float64 SEMA4D^PLXNB2 0.212349 SEMA4A^PLXNB2 0.202500 MDK^SDC4 0.201804 COL9A2^SDC4 0.186084 F11R^F11R 0.185700 Name: Factor 2, dtype: float64 SIGLEC1^SPN 0.194272 RETN^CAP1 0.181667 MDK^NCL 0.176625 FN1^ITGA4&ITGB1 0.175839 FN1^ITGA4&ITGB7 0.172482 Name: Factor 3, dtype: float64 MIF^CD74&CD44 0.321096 LGALS9^CD44 0.288686 COL9A2^CD44 0.245075 LAMB3^CD44 0.242210 LAMB2^CD44 0.234265 Name: Factor 4, dtype: float64 CD99^CD99 0.213391 ITGB2^CD226 0.211347 CD86^CTLA4 0.210150 ITGB2^ICAM2 0.200653 ITGB2^ICAM1 0.192306 Name: Factor 5, dtype: float64 CD99^CD99 0.332627 CCL5^CCR5 0.304814 CCL5^CCR1 0.292738 GZMA^F2R 0.240836 PTPRC^MRC1 0.221839 Name: Factor 6, dtype: float64 CD99^CD99 0.306729 MIF^CD74&CXCR4 0.240767 CD22^PTPRC 0.231365 MIF^CD74&CD44 0.228576 SELL^MADCAM1 0.190805 Name: Factor 7, dtype: float64 CCL2^CCR2 0.297309 CCL3^CCR1 0.275435 CCL8^CCR1 0.260609 CCL3^CCR5 0.238240 CCL3L1^CCR1 0.237707 Name: Factor 8, dtype: float64 FN1^CD44 0.274057 MIF^CD74&CD44 0.269390 APP^CD74 0.266376 PTPRC^MRC1 0.262805 RETN^CAP1 0.242692 Name: Factor 9, dtype: float64 CD99^PILRA 0.190719 LGALS9^HAVCR2 0.170104 ANXA1^FPR1 0.162624 MDK^LRP1 0.153972 PTPRC^MRC1 0.153037 Name: Factor 10, dtype: float64
Export All Loadings
tensor.export_factor_loadings(output_folder + 'BALF-Loadings.xlsx')
Loadings of the tensor factorization were successfully saved into /results/COVID-19/BALF-Loadings.xlsx
tensor.export_factor_loadings('/data/Classification/BALF-Loadings.xlsx')
Loadings of the tensor factorization were successfully saved into /data/Classification/BALF-Loadings.xlsx
severity = pd.concat([tensor.get_top_factor_elements('Contexts', 'Factor {}'.format(i), 12).to_frame() \
for i in range(1, r+1)], axis=1)
severity['Group'] = [sample_disease[idx] for idx in severity.index]
severity.index.name = 'Sample'
severity.reset_index(inplace=True)
severity.sort_values('Group', inplace=True)
Evaluate correlation
def sev_rank(x):
if 'HC' in x:
return 1
elif 'M' in x:
return 2
elif 'S' in x:
return 3
severity['Rank'] = severity['Sample'].apply(sev_rank)
with open(output_folder + '/BALF-SampleLoading-Correlation.txt', 'w') as file:
for k in range(1, tensor.rank+1):
factor = 'Factor {}'.format(k)
print(factor, scipy.stats.spearmanr(severity['Rank'].values, severity[factor].values))
file.write(factor + '\t{}'.format(str(scipy.stats.spearmanr(severity['Rank'].values, severity[factor].values))) + '\n')
Factor 1 SpearmanrResult(correlation=0.716928179098865, pvalue=0.008690936968371088) Factor 2 SpearmanrResult(correlation=0.614509867799027, pvalue=0.033491784149194105) Factor 3 SpearmanrResult(correlation=-0.2617356844329189, pvalue=0.41120815320396287) Factor 4 SpearmanrResult(correlation=0.38691362046605404, pvalue=0.21405134607765433) Factor 5 SpearmanrResult(correlation=0.3982934328327027, pvalue=0.19971505596788536) Factor 6 SpearmanrResult(correlation=0.2503558720662703, pvalue=0.4325597712381267) Factor 7 SpearmanrResult(correlation=0.5120915564991892, pvalue=0.08874053918241111) Factor 8 SpearmanrResult(correlation=0.9217648016985405, pvalue=2.02196826066901e-05) Factor 9 SpearmanrResult(correlation=-0.5120915564991892, pvalue=0.08874053918241111) Factor 10 SpearmanrResult(correlation=-0.022759624733297297, pvalue=0.9440286522660641)
Evaluate loading differences
boxplot = c2c.plotting.factor_plot.context_boxplot(tensor.factors['Contexts'],
metadict=sample_disease,
nrows=2,
figsize=(16, 9),
group_order=['Control', 'Moderate COVID-19', 'Severe COVID-19'],
statistical_test='t-test_ind',
pval_correction='bonferroni',
cmap='plasma',
verbose=False,
filename=output_folder + '/BALF-Severity-Boxplots.svg'
)
Explore changes in macrophage gene expression given severity
genes = ['LRP1',
'PILRA',
'SIGLEC1',
'FPR1',
'CCL2',
'CCL3',
'CCL20',
'CXCL1',
'CXCL3',
'CXCL10',
'IL1B',
'TNF',
'IL10',
'TGFB1'
]
expression = pd.DataFrame()
for sample, df in zip(tensor.order_names[0], rnaseq_matrices):
for gene in genes:
expression.at[sample, gene] = df.at[gene, 'Macrophages']
expression['Group'] = [sample_disease[idx] for idx in expression.index]
expression.index.name = 'Sample'
expression.reset_index(inplace=True)
def trim_axs(axs, N):
"""little helper to massage the axs list to have correct length..."""
axs = axs.flat
for ax in axs[N:]:
ax.remove()
return axs[:N]
with sns.axes_style("darkgrid"):
fig, axes = plt.subplots(nrows=int(len(genes) / 2),
ncols=2,
figsize=(12, 20),
)
axes = trim_axs(axes, len(genes))
data = expression
for k, gene in enumerate(genes):
x, y = 'Group', gene
order = ['Control', 'Moderate COVID-19', 'Severe COVID-19']
# Plot the orbital period with horizontal boxes
ax = sns.boxplot(x=x,
y=y,
data=data,
order=order,
whis=[0, 100],
width=.6,
palette="plasma",
boxprops=dict(alpha=.5),
ax=axes[k]
)
# Add annotations about statistical test
stat_test = add_stat_annotation(ax,
data=data,
x=x,
y=y,
order=order,
box_pairs=[(order[0], order[1]),
(order[0], order[2]),
(order[1], order[2])],
test='t-test_ind',
text_format='star',
loc='inside',
comparisons_correction='bonferroni',
verbose=(k == 0),
fontsize=14
)
# Add in points to show each observation
sns.stripplot(x=x,
y=y,
data=data,
size=6,
color='lightsalmon',
edgecolor='brown',
linewidth=0.6,
jitter=False,
ax=ax
)
ax.set_xlabel('')
ax.set_ylabel('Fraction of cells with\nnon-zero expression',fontsize=16)
ax.set_title(gene,fontsize=18)
ax.tick_params(labelsize=12)
fig.align_ylabels(axes)
plt.tight_layout()
plt.savefig(output_folder + '/BALF-Expression-Boxplots.svg', dpi=300, bbox_inches='tight')
p-value annotation legend: ns: 5.00e-02 < p <= 1.00e+00 *: 1.00e-02 < p <= 5.00e-02 **: 1.00e-03 < p <= 1.00e-02 ***: 1.00e-04 < p <= 1.00e-03 ****: p <= 1.00e-04 Control v.s. Moderate COVID-19: t-test independent samples with Bonferroni correction, P_val=9.837e-01 stat=-1.113e+00 Moderate COVID-19 v.s. Severe COVID-19: t-test independent samples with Bonferroni correction, P_val=6.120e-03 stat=4.768e+00 Control v.s. Severe COVID-19: t-test independent samples with Bonferroni correction, P_val=3.994e-01 stat=1.699e+00
def gini_coefficient(x):
"""
Compute Gini coefficient of array of values
Borrowed from: https://stackoverflow.com/questions/39512260/calculating-gini-coefficient-in-python-numpy
"""
diffsum = 0
for i, xi in enumerate(x[:-1], 1):
diffsum += np.sum(np.abs(xi - x[i:]))
return diffsum / (len(x)**2 * np.mean(x))
with open(output_folder + '/COVID-19-BALF-Gini.txt', 'w') as file:
for i in range(tensor.rank):
l_sender = tensor.factors['Sender Cells']['Factor {}'.format(i+1)].values
l_receiver = tensor.factors['Receiver Cells']['Factor {}'.format(i+1)].values
outer = np.outer(l_sender, l_receiver)
print('Factor {}'.format(i+1), np.round(gini_coefficient(outer.flatten()), 3))
file.write('Factor-{}'.format(i+1) + '\t{}'.format(np.round(gini_coefficient(outer.flatten()), 3)) + '\n')
Factor 1 0.5 Factor 2 0.761 Factor 3 0.745 Factor 4 0.482 Factor 5 0.593 Factor 6 0.649 Factor 7 0.676 Factor 8 0.243 Factor 9 0.091 Factor 10 0.74