B_HIT.sVDJ tutorial#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import B_HIT.sVDJ.tl as tl
from scipy.stats import pearsonr
BcellAggLoc = pd.read_csv('data/BcellAggLocAnno.tsv', sep='\t',index_col=0)
BcellAggLoc = BcellAggLoc[[ 'Bcell_aggregate_label', 'gex_x', 'gex_y', 'gex_Bx', 'gex_By']].copy()
BcellAggLoc['patient'] = BcellAggLoc['Bcell_aggregate_label'].str.split('-').str[0].str.split('_').str[0]
BcellAggLoc['tissue'] = BcellAggLoc['Bcell_aggregate_label'].str.split('-').str[0].str.split('_').str[1]
BcellAggLoc['on'] = BcellAggLoc['patient'].astype(str) + '-' + BcellAggLoc['tissue'].astype(str) +'_' + np.array(BcellAggLoc['gex_Bx'].astype(str)) +'_'+ np.array(BcellAggLoc['gex_By'].astype(str))
BcellAggLoc = BcellAggLoc.drop(['gex_x','gex_y','gex_Bx','gex_By','patient', 'tissue'], axis=1)
BcellAggLoc['BaggArea'] = BcellAggLoc['Bcell_aggregate_label'].map(BcellAggLoc['Bcell_aggregate_label'].value_counts().to_dict()) * 100
BcellAggLoc['Bagg_Anno_res'] = 'B Lymphocytes Aggregate'
rep_file = pd.read_csv('./data/rep2_filtered_loc_3_reg1.data.tsv', sep='\t', index_col=0)
rep_file['on'] = rep_file['sample_xy_orignial_tissueCut_B'].apply(lambda x: '_'.join([x.split('_')[0]] + x.split('_')[-2:])) 
rep_loc = pd.merge(rep_file, BcellAggLoc, on = 'on', how= 'left')  
rep_loc = rep_loc[rep_loc['Cregion_simple'].str.contains('IG')].copy() 
groupby_cols = ['sample', 'Cregion_simple', 'family_id', 'Bcell_aggregate_label']
extra_cols = ['Bx', 'By']
count_col = 'clone'
_Index_compute = tl.compute_clone_counts(rep_loc, groupby_cols, count_col, extra_cols)

rep_agg = rep_loc[groupby_cols + extra_cols + [count_col]].value_counts().reset_index(name='UmiCount')

_Index_compute['cloneRich'] = tl.compute_richness(rep_agg, clone_extra = ['sample', 'family_id', 'clone' ] 
                                                  clone_group = ['Bcell_aggregate_label', 'Cregion_simple'], 
                                                  richness_name = 'cloneRichness')

richness_name = 'cloneFamilyRichness'
cloneFamily_group = ['Bcell_aggregate_label', 'Cregion_simple']
cloneFamily_extra = ['sample', 'family_id']
_Index_compute['cloneFamilyRich'] = tl.compute_richness(rep_agg, cloneFamily_extra, 
                                                        cloneFamily_group, richness_name)

_Index_compute['CDR3nt'] = _Index_compute['clone'].str.split('_').str[0]
_Index_compute.head() 
sample Cregion_simple family_id Bcell_aggregate_label clone freq count cloneRich cloneFamilyRich CDR3nt
0 P0411-CC IGH IGH_family_0 P0411_CC-10 TGTGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGGGG... 1.000000 1 2088 253 TGTGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGGGG...
1 P0411-CC IGH IGH_family_0 P0411_CC-11 TGTGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGAGG... 0.500000 1 2088 253 TGTGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGAGG...
2 P0411-CC IGH IGH_family_0 P0411_CC-11 TGTGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGGGG... 0.500000 1 1721 372 TGTGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGGGG...
3 P0411-CC IGH IGH_family_0 P0411_CC-3 TGTGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGGGG... 0.168421 16 1925 223 TGTGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGGGG...
4 P0411-CC IGH IGH_family_0 P0411_CC-3 TGGGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGGGG... 0.052632 5 1925 223 TGGGCGAGACCAGATTTTGATATCGTGACTAATTATTATGGGGGGG...
rep_file_stat = rep_file[['sample', 'Cregion_simple', 'clone', 'family_id', 'Bx', 'By' ]].value_counts().reset_index(name='cloneUmiInBin20')
bin20clonecount = rep_file_stat.groupby(['sample', 'Cregion_simple', 'Bx', 'By']).size().reset_index(name='cloneRichInBin20') 
sns.boxplot(data=bin20clonecount, x='sample', y='cloneRichInBin20')
plt.yscale('log')
plt.xticks(rotation=90)
plt.show()
../_images/5ea1f786367b2eb21dc22112223acaba6559685dbdf84422a800a0a1b20ef304.png
rep_file_stat = rep_file[['sample', 'Cregion_simple', 'clone', 'family_id', 'Bx', 'By' ]].value_counts().reset_index(name='cloneUmiInBin20')
familyBin20Count = rep_file_stat.groupby(['sample', 'Cregion_simple', 'family_id']).size().reset_index(name='FamilyBin20Num') 

sns.boxplot(data=familyBin20Count, x='sample', y='FamilyBin20Num')
plt.yscale('log')
plt.xticks(rotation=90)
plt.show()
../_images/2550dd8513ef59af4b73343b6db6419cdcbed76b1558c89fe5df06dcab9fa7d3.png
rep_agg_all = rep_loc.copy() 
rep_agg_all['Bcell_aggregate_label_2'] = np.array(rep_agg_all['Bcell_aggregate_label']) 
rep_agg_all.loc[~rep_agg_all['Bcell_aggregate_label_2'].isna(), 'Bcell_aggregate_label_2'] = 'Aggregates' 
rep_agg_all.loc[rep_agg_all['Bcell_aggregate_label_2'].isna(), 'Bcell_aggregate_label_2'] = 'Scattered' 
rep_agg_all_1 = rep_agg_all[['family_id', 'sample', 'Cregion_simple', 'Bx', 'By']].value_counts().reset_index(name='UmiCount')  
_Index_compute_count = rep_agg_all_1.groupby(['sample', 'Cregion_simple'])['family_id'].value_counts().reset_index(name='bin20Count') 
family_id_dict = _Index_compute_count.set_index(['sample', 'Cregion_simple', 'family_id']).to_dict()['bin20Count'] 
rep_agg_all_1 = rep_agg_all[['sample', 'Cregion_simple', 'family_id', 'Bcell_aggregate_label_2']].drop_duplicates() 
rep_agg_all_1['family_id_bin20Count'] = rep_agg_all_1.apply(lambda row: family_id_dict.get((row['sample'], row['Cregion_simple'], row['family_id'])), axis=1) 

df = rep_agg_all_1.pivot_table(index=['sample', 'Cregion_simple', 'family_id'], columns=['Bcell_aggregate_label_2'])
df.columns = df.columns.get_level_values(1)
df['class'] = 'Not_InAgg'
df.loc[df['Scattered'].isna(),'class'] = 'InAgg'
df.loc[(~df['Scattered'].isna()) & (~df['Aggregates'].isna()), 'class'] = 'Shared' 
df = df.reset_index() 
df['family_id_bin20Count'] = df.apply(lambda row: family_id_dict.get((row['sample'], row['Cregion_simple'], row['family_id'])), axis=1) 
cloneLocClassDict = df[['sample', 'Cregion_simple', 'family_id', 'class']].set_index(['sample', 'Cregion_simple', 'family_id']).to_dict()['class']

rep_agg = rep_loc[['family_id', 'sample', 'Cregion_simple', 'Bx', 'By']].value_counts().reset_index(name='UmiCount') 
rep_agg['familyLocClass'] = rep_agg.apply(lambda row: cloneLocClassDict.get((row['sample'], row['Cregion_simple'], row['family_id'])), axis=1) 
_Index_compute_count = rep_agg.groupby(['sample', 'Cregion_simple', 'familyLocClass'])['family_id'].value_counts().reset_index(name='count')
_Index_compute_freq = rep_agg.groupby(['sample', 'Cregion_simple', 'familyLocClass'])['family_id'].value_counts(normalize=True).reset_index(name='freq') 
_Index_compute = pd.merge(_Index_compute_freq, _Index_compute_count, on = ['sample', 'Cregion_simple', 'familyLocClass', 'family_id']) 

_Index_compute = _Index_compute[~_Index_compute['sample'].str.contains('1204')]

# 变量名是familyLocClass
tmp_df = tl.compute_grouped_index(_Index_compute, index = 'gini_index', 
                                  groups = ['sample', 'Cregion_simple','familyLocClass'], 
                                  column_name = 'count', exclude_values=['Shared'], 
                                  exclude_name='familyLocClass')

fig, ax = plt.subplots(1, 1, figsize=(4,3))
sns.boxplot(data=tmp_df[tmp_df['sample'].str.contains('CC')], x='Cregion_simple', y='gini_index', hue='familyLocClass', ax=ax, palette='tab20')   
plt.title('Primary tumor')
plt.legend(bbox_to_anchor=(1,1))
plt.xlabel('') 
plt.ylabel('Gini index ')   #(clone family)
plt.xticks(rotation=0)
plt.show() 

fig, ax = plt.subplots(1, 1, figsize=(4,3))
sns.boxplot(data=tmp_df[tmp_df['sample'].str.contains('LM')], x='Cregion_simple', y='gini_index', hue='familyLocClass', ax=ax, palette='tab20')   
plt.title('Metastatic tumor')
plt.legend(bbox_to_anchor=(1,1))
plt.xlabel('') 
plt.ylabel('Gini index ')   #(clone family)
plt.xticks(rotation=0)
plt.show() 
../_images/a6ae634ec93c767c5f72d27c11ff80f7b6fec96c52cbbcc8d36e5a4fab0fd5bd.png ../_images/de79798bee02c8f68612eb9ed5c32e1e01e75c2ab936f33fae489a860cd7177f.png
tmp_df = tl.compute_grouped_index(_Index_compute, index = 'Clonality', 
                                  groups = ['sample', 'Cregion_simple','familyLocClass'], 
                                  column_name = 'freq', exclude_values=['Shared'], 
                                  exclude_name='familyLocClass')


fig, ax = plt.subplots(1, 1, figsize=(4,3))
sns.boxplot(data=tmp_df[tmp_df['sample'].str.contains('CC')], x='Cregion_simple', y='Clonality', hue='familyLocClass', ax=ax, palette='tab20')  
plt.legend(bbox_to_anchor=(1,1))
plt.title('Primary tumor')
plt.xlabel('') 
plt.ylabel('Clonality')   #  (clone family)
plt.xticks(rotation=0)
plt.show()

fig, ax = plt.subplots(1, 1, figsize=(4,3))
sns.boxplot(data=tmp_df[tmp_df['sample'].str.contains('LM')], x='Cregion_simple', y='Clonality', hue='familyLocClass', ax=ax, palette='tab20')  
plt.legend(bbox_to_anchor=(1,1))
plt.title('Metastatic tumor')
plt.xlabel('') 
plt.ylabel('Clonality')   #  (clone family)
plt.xticks(rotation=0)
plt.show()
../_images/c5c17fdeb66f1dcdbca82979463681b22a24fdc642bd3ccb66943a923b79786a.png ../_images/e8d31002e2b91c2c76e47f4fb962a403d795e44c55d2651f41079c6a56f2ad5e.png

Aggrates面积与克隆数量、扩增#

克隆数量#

rep_agg = rep_loc[['family_id', 'sample', 'Cregion_simple', 'Bcell_aggregate_label', 'Bagg_Anno_res', 'Bx', 'By', 'BaggArea']].value_counts().reset_index(name='UmiCount') 
cloneRich = tl.compute_richness(df = rep_agg, extra_cols = ['sample', 'family_id'], 
                             groupby_cols = ['Bcell_aggregate_label', 'Bagg_Anno_res', 'BaggArea', 'Cregion_simple'], 
                             richness_name = 'cloneFamilyRichness', default_value=0, return_df=True)

plt.figure(figsize=(5,4))
sns.scatterplot(x='BaggArea', y='cloneFamilyRichness', data=cloneRich, hue='Cregion_simple', palette={'IGH':'#B89AE3', 'IGK':'#f27979', 'IGL':'#79b9f2'}, edgecolor='black' ) 
plt.yscale('log') 
plt.xscale('log') 
plt.xlabel('Size of B cell aggregates (um^2)') 
plt.ylabel('Clonetype family number (cloneRichness)') 
plt.show()
../_images/632784cae4c76f16501f081373a2c7c6be12dccaf6faf7a8143f25625345c91c.png
cloneRich['tissue'] = cloneRich['Bcell_aggregate_label'].str.split('-').str[0].str.split('_').str[1]
corrDf = tl.compute_correlation(cloneRich, groupby_cols = ['Cregion_simple','tissue'], 
                                corr1 = 'BaggArea', corr2 = 'cloneFamilyRichness', 
                                save=False, path=None, compute_corr_matrix=False)

扩增#

_Index_compute = tl.compute_clone_counts(rep_loc, groupby_cols = ['sample', 'Cregion_simple', 'Bcell_aggregate_label','Bagg_Anno_res', 'BaggArea'], 
                                         count_col = 'family_id', extra_cols =['Bx', 'By'], 
                                         count_name='count', freq_name='freq', if_count=True, if_freq=True)
tmp_df = tl.compute_grouped_index(_Index_compute, index = 'gini_index', groups = ['sample', 'Cregion_simple', 'Bcell_aggregate_label', 'Bagg_Anno_res','BaggArea'], 
                                 column_name = 'count', exclude_values=None, exclude_name=None, check_column=None)

plt.figure(figsize=(5,4)) 
sns.scatterplot(x='BaggArea', y='gini_index', data=tmp_df, hue='Cregion_simple', palette={'IGH':'#B89AE3', 'IGK':'#f27979', 'IGL':'#79b9f2'}, edgecolor='black' )  
plt.xscale('log')
plt.xlabel('Size of B cell aggregates (um^2)') 
plt.ylabel('Gini index (clone family)')  
plt.show()
../_images/d2b4e45105065c668caa5276ad5e1f476d17aa027bf7774d96edd7e17616b417.png
tmp_df['tissue'] = tmp_df['sample'].str.split('-').str[1]

corrDf, corrmat, Pmat = tl.compute_correlation(cloneRich = tmp_df, groupby_cols =['Cregion_simple','tissue'], 
                                               corr1 = 'BaggArea', corr2 = 'gini_index', save=False, 
                                               path=None, compute_corr_matrix=True)

print(corrmat)
region        CC        LM
chain                     
IGH     0.341936  0.398674
IGK     0.370056  0.441915
IGL     0.362810  0.418741
fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(corrmat, annot=False, fmt=".2f",vmin=0,  square=True, linewidths=0.5, ax=ax, cmap='Blues') 
for i in range(corrmat.shape[0]):
    for j in range(corrmat.shape[1]):
        x, y = j + 0.5, i + 0.5 
        if Pmat.iloc[i, j] < 0.001:  # FDR > 0.1
            ax.text(x, y, "***", ha="center", va="center", color="red", fontsize=15)
        elif Pmat.iloc[i, j] < 0.01: 
            ax.text(x, y, "**", ha="center", va="center", color="red", fontsize=15)
        elif Pmat.iloc[i, j] < 0.05: 
            ax.text(x, y, "*", ha="center", va="center", color="red", fontsize=15)
plt.title('Correlation between \n gini index of BCR\nand region area', fontsize=10)
plt.xlabel('')
plt.show()
../_images/ee4a350786708146ba9458961900b6a1e9710e6bbc214c2d6f648f461ed2239d.png

clonal diversification index#

groupby_cols = ['sample', 'Cregion_simple', 'family_id','Bcell_aggregate_label', 'Bagg_Anno_res','BaggArea']
extra_cols = ['Bx', 'By']
count_col = 'clone'
_Index_compute = tl.compute_clone_counts(rep_loc, groupby_cols, count_col, extra_cols)


_family_Index_compute = _Index_compute.copy()
_family_Index_compute = _family_Index_compute[['sample', 'Cregion_simple', 'family_id', 'clone', 'Bcell_aggregate_label', 'Bagg_Anno_res', 'BaggArea' ]].drop_duplicates()
_family_Index_compute = _family_Index_compute.groupby(['sample', 'Cregion_simple', 'Bcell_aggregate_label', 'Bagg_Anno_res', 'BaggArea'])['family_id'].value_counts().reset_index(name='count')  
tmp_df = tl.compute_grouped_index(_family_Index_compute, index = 'gini_index', groups = ['sample', 'Cregion_simple', 'Bcell_aggregate_label', 'Bagg_Anno_res', 'BaggArea'], 
                                  column_name = 'count', exclude_values=None, exclude_name=None, check_column='Clonal_diversification')
tmp_df['tissue'] = tmp_df['sample'].str.split('-').str[1]


sns.scatterplot(data=tmp_df, x='BaggArea', y='Clonal_diversification', hue='Cregion_simple', palette={'IGH':'#B89AE3', 'IGK':'#f27979', 'IGL':'#79b9f2'}, edgecolor='black') 
plt.ylabel('Clonal family complexity index')
plt.xscale('log')
plt.show()
../_images/fcbd34422ad5f619827fabbc2b2ba8c7237de2d898a29441d4b72aa03e33245b.png
corrDf, corrmat, Pmat = tl.compute_correlation(cloneRich = tmp_df, groupby_cols = ['Cregion_simple','tissue'], 
                                               corr1 = 'BaggArea', corr2 = 'Clonal_diversification', 
                                               save=False, path=None, compute_corr_matrix=True)


fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(corrmat, annot=False, fmt=".2f",vmin=0,  square=True, linewidths=0.5, ax=ax, cmap='Blues')
for i in range(corrmat.shape[0]):
    for j in range(corrmat.shape[1]):
        x, y = j + 0.5, i + 0.5  # 每个方块的中心点
        if Pmat.iloc[i, j] < 0.001:  # FDR > 0.1
            ax.text(x, y, "***", ha="center", va="center", color="red", fontsize=15)
        elif Pmat.iloc[i, j] < 0.01: 
            ax.text(x, y, "**", ha="center", va="center", color="red", fontsize=15)
        elif Pmat.iloc[i, j] < 0.05: 
            ax.text(x, y, "*", ha="center", va="center", color="red", fontsize=15)
plt.title('Correlation between \nBCR clone family complexity and region area', fontsize=12)
plt.xlabel('')
plt.show()
../_images/3c0bd807ab71b189e64565fc4a5af602b8caac9ac958739eab5509169b2d73a8.png