import cell2cell as c2c
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
output = '/results/Celegans/'
if not os.path.isdir(output):
os.mkdir(output)
import warnings
warnings.filterwarnings('ignore')
RNA-seq
Available in the supplementary material from Warner AD, Gevirtzman L, Hillier LW, Ewing B, Waterston RH. The C. elegans embryonic transcriptome with tissue, time, and alternative splicing resolution. Genome Res. 2019 Jun;29(6):1036-1045. doi: 10.1101/gr.243394.118.: https://genome.cshlp.org/content/suppl/2019/05/14/gr.243394.118.DC1/Supplemental_Table_S22.txt
rnaseq_data = c2c.io.load_rnaseq('/data/Celegans/Celegans_Embryonic_RNASeqData_Warner.txt',
gene_column='CommonName',
drop_nangenes=True,
log_transformation=False,
format='csv',
sep=','
)
Opening RNAseq datasets from /data/Celegans/Celegans_Embryonic_RNASeqData_Warner.txt /data/Celegans/Celegans_Embryonic_RNASeqData_Warner.txt was correctly loaded
rnaseq_data.head()
ceh32_T0 | ceh32_T1 | ceh32_T2 | ceh32_T3 | ceh32_T4 | cnd1_T0 | cnd1_T1 | cnd1_T2 | cnd1_T3 | cnd1_T4 | ... | pha4_T0 | pha4_T1 | pha4_T2 | pha4_T3 | pha4_T4 | tbx37_T0 | tbx37_T1 | tbx37_T2 | tbx37_T3 | tbx37_T4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CommonName | |||||||||||||||||||||
aap-1 | 43.133459 | 50.658138 | 50.414970 | 35.766857 | 33.895195 | 58.599474 | 66.939721 | 61.222382 | 65.702870 | 47.982343 | ... | 52.616365 | 60.825204 | 51.979991 | 43.455773 | 30.967278 | 51.326500 | 50.654093 | 49.112858 | 43.996665 | 33.075259 |
aat-1 | 1.621625 | 3.189402 | 9.831904 | 23.548271 | 22.726582 | 4.778361 | 11.748554 | 20.488394 | 23.204268 | 30.890648 | ... | 6.237194 | 59.957473 | 130.003444 | 388.127157 | 433.615700 | 11.542751 | 15.744846 | 37.220783 | 102.917061 | 144.728664 |
aat-2 | 5.496731 | 12.937667 | 21.763256 | 29.827811 | 37.708772 | 12.952399 | 24.959862 | 25.108797 | 39.361573 | 44.727870 | ... | 11.464360 | 41.773713 | 51.860603 | 57.235038 | 45.479755 | 7.449374 | 12.206810 | 18.608906 | 31.577895 | 37.431819 |
aat-3 | 61.400820 | 136.940807 | 198.831633 | 197.640775 | 219.688245 | 181.914836 | 260.822516 | 225.358789 | 303.181266 | 218.492232 | ... | 111.998325 | 289.777602 | 288.678800 | 285.776295 | 186.243517 | 151.952176 | 203.938290 | 266.344493 | 303.632732 | 272.697353 |
aat-4 | 0.204774 | 1.159116 | 0.832428 | 0.591529 | 0.827419 | 0.285891 | 0.305862 | 0.466678 | 0.293390 | 0.913105 | ... | 0.106712 | 0.948339 | 0.866276 | 1.472586 | 1.381153 | 0.406641 | 0.378540 | 0.714584 | 0.477434 | 0.734048 |
5 rows × 35 columns
Ligand-Receptor pairs
lr_pairs = pd.read_excel('/data/LR-Pairs/Celegans-2020-Armingol-LR-pairs.xlsx')
lr_pairs.shape
(245, 9)
Specify columns containing ligands and receptors
int_cols = ('Ligand_symbol', 'Receptor_symbol')
Keep LR pairs that are present in the RNAseq dataset
lr_pairs = lr_pairs.loc[(lr_pairs[int_cols[0]].isin(rnaseq_data.index)) \
& (lr_pairs[int_cols[1]].isin(rnaseq_data.index))]
Filter LR pairs by functions. Keep Wnt, TGF-B, Hedgehog and Notch signalings.
function_filter = ['Wnt signaling', 'TGF-B signaling', 'Hedgehog signaling', 'Notch signaling']
lr_pairs = lr_pairs.loc[lr_pairs['LR Function'].isin(function_filter)]
lr_pairs.sort_values(by=['LR Function', int_cols[0]], inplace=True)
lr_pairs.head(2)
Ligand_WB | Receptor_WB | Ligand_symbol | Receptor_symbol | LR Function | L Function | R Function | Ligand_desc | Receptor_desc | |
---|---|---|---|---|---|---|---|---|---|
50 | WBGene00001700 | WBGene00004208 | grd-11 | ptc-1 | Hedgehog signaling | DHH | PTCH1 | Is an ortholog of human DHH (desert hedgehog s... | Is an ortholog of human PTCH1 (patched 1) and ... |
148 | WBGene00004264 | WBGene00004210 | qua-1 | ptc-3 | Hedgehog signaling | IHH | PTCH2 | Is an ortholog of human DHH (desert hedgehog s... | Is an ortholog of human PTCH1 (patched 1) and ... |
lr_pairs.shape
(94, 9)
Generate a dictionary with function info for each LR pairs. Keys are ligand_name^receptor_name and values are the function in the LR Function column in the dataframe containing ligand-receptor pairs.
lr_functions = {}
for idx, row in lr_pairs.iterrows():
lr_name = row[int_cols[0]] + '^' + row[int_cols[1]]
lr_functions[lr_name] = row['LR Function']
Metadata
markers = {'hlh1' : 'Muscles & Coelomocytes',
'end1' : 'Intestine',
'cnd1' : 'Neurons',
'pha4' : 'Pharynx',
'nhr25end1' : 'Hypodermis',
'ceh32' : 'ABala-derived (Neurons & Glial Cells)',
'tbx37' : 'ABa-derived (Neurons, Phrarynx & Hypodermis)'
}
Names for different time points and tissues. Later used for naming elements in respective orders/dimensions of the tensor.
times = ['T0', 'T1', 'T2', 'T3', 'T4']
time_labels = ['120 min', '210 min', '300 min', '390 min', '480 min']
time_dict = dict(zip(times, time_labels))
total_cells = ['cnd1',
'ceh32',
'tbx37',
'pha4',
'end1',
'nhr25end1',
'hlh1']
cell_labels = [markers[c] + ' - {}'.format(c) for c in total_cells]
Log1p transform RNA-seq data
continuous_rnaseq = np.log2(rnaseq_data+1.0)
Generate list of condition-specific expression matrices
rnaseq_matrices = []
for T in times:
cell_time = [c + '_' + T for c in total_cells]
df = continuous_rnaseq[cell_time]
df.columns = cell_labels
rnaseq_matrices.append(df)
Build 4D-Communication Tensor
how='inner'
is used to keep only cell types that are across all samples/patients.
upper_letter_comparison
is used to integrate list of LR pairs and gene expression matrix. Since here we are sure that both dataframes use exactly the same gene symbols, we disabled this option and avoided using all capital leters in their names.
tensor = c2c.tensor.InteractionTensor(rnaseq_matrices=rnaseq_matrices,
ppi_data=lr_pairs,
context_names=time_labels,
how='inner',
upper_letter_comparison=False,
interaction_columns=int_cols,
communication_score='expression_product'
)
Building tensor for the provided context
tensor.tensor.shape
(5, 94, 7, 7)
Generate a list containing metadata for each tensor order/dimension - Later used for coloring factor plots
meta = c2c.tensor.generate_tensor_metadata(interaction_tensor=tensor,
metadata_dicts=[None, lr_functions, None, None],
fill_with_order_elements=True
)
Elbow analysis for selecting Rank for Tensor-Factorization
fig, error = tensor.elbow_rank_selection(upper_rank=25,
runs=1,
init='svd',
automatic_elbow=False,
random_state=888)
_ = plt.plot(*error[9], 'ro')
plt.savefig(output + '/Celegans-Elbow.svg', dpi=300, bbox_inches='tight')
error[9]
(10, array(0.17345209))
Perform tensor factorization
r = 10
tensor.compute_tensor_factorization(rank=r, random_state=888)
Plot Factors
Colors for metadatas
cmaps = ['viridis', 'tab20', 'Pastel1', 'Pastel1']
fig, axes = c2c.plotting.tensor_factors_plot(interaction_tensor=tensor,
order_labels=['Time', 'Ligand-Receptor Pairs', 'Sender Tissues', 'Receiver Tissues'],
metadata = meta,
sample_col='Element',
group_col='Category',
meta_cmaps=cmaps,
fontsize=14,
filename=output + '/Celegans-TensorFactorization.svg'
)
Top-5 LR pairs for each factor
for i in range(tensor.rank):
print(tensor.get_top_factor_elements('Ligand-Receptor Pairs', 'Factor {}'.format(i+1), 5))
print('')
cwn-1^cam-1 0.346320 cwn-1^cfz-2 0.312025 cwn-1^vang-1 0.299701 cwn-1^lin-17 0.283951 cwn-1^mom-5 0.252656 Name: Factor 1, dtype: float64 cwn-2^cam-1 0.411229 cwn-2^lin-17 0.358747 cwn-2^cfz-2 0.355586 cwn-2^vang-1 0.345109 cwn-2^mig-1 0.324834 Name: Factor 2, dtype: float64 wrt-1^ptc-3 0.388149 wrt-2^ptc-3 0.348375 wrt-5^ptc-1 0.260977 apl-1^F14B4.1 0.244969 cwn-2^cam-1 0.222672 Name: Factor 3, dtype: float64 mom-2^cfz-2 0.323600 lin-44^cam-1 0.306039 mom-2^cam-1 0.302202 lin-44^cfz-2 0.294243 lin-44^vang-1 0.265855 Name: Factor 4, dtype: float64 apl-1^F14B4.1 0.689900 srp-7^F14B4.1 0.520561 srp-2^F14B4.1 0.233917 sfrp-1^cfz-2 0.159949 sup-17^glp-1 0.156285 Name: Factor 5, dtype: float64 dsl-2^lin-12 0.340190 cwn-2^vang-1 0.256561 cwn-2^cam-1 0.248999 dsl-3^lin-12 0.244976 cwn-2^mom-5 0.242987 Name: Factor 6, dtype: float64 sfrp-1^mig-1 0.412471 sfrp-1^cfz-2 0.408550 sfrp-1^mom-5 0.402333 dsl-3^lin-12 0.280181 sup-17^lin-12 0.257853 Name: Factor 7, dtype: float64 egl-20^cam-1 0.398236 egl-20^cfz-2 0.329903 egl-20^lin-17 0.329275 egl-20^fmi-1 0.319805 egl-20^vang-1 0.313279 Name: Factor 8, dtype: float64 mom-2^mig-1 0.393997 cwn-1^mig-1 0.310323 mom-2^lin-18 0.287104 cwn-2^mig-1 0.249182 sfrp-1^mig-1 0.239089 Name: Factor 9, dtype: float64 sfrp-1^cfz-2 0.385740 sfrp-1^mom-5 0.327037 dsl-2^glp-1 0.305708 mom-2^cfz-2 0.259814 sup-17^glp-1 0.251987 Name: Factor 10, dtype: float64
Export All Loadings
tensor.export_factor_loadings(output + 'Celegans-Loadings.xlsx')
Loadings of the tensor factorization were successfully saved into /results/Celegans/Celegans-Loadings.xlsx