【Python】Fisherの正確確率検定【scipy】

1. 理論

例として、全タンパクを「化合物 $i$ の標的であるか」という観点と「Pathway $j$ の構成タンパクであるか」という観点から分類し、以下のような分割表を作成することを考えます。

合計部分を固定した場合、上記のような表が得られる確率は超幾何分布に従います(詳しくはこちらの記事)。

具体的には、分割表の 1 行目を $(a, b)$、2 行目を $(c, d)$、総数を $n = a + b + c + d$ とすると、以下のようにして $p_{ij}$ を計算できます。

$$p_{ij} = \frac{{}_{a+c}C_{a} \times {}_{b+d}C_{b}}{{}_{n}C_{a+b}}$$

# library

import pandas as pd

# 関数

def make_contingency_table(
    col_positives,
    row_positives,
    N: int  # a+b+c+d
) -> pd.DataFrame:
    """Build a 2x2 contingency table from two collections of positive items.

    Cell definitions (duplicates in the inputs are collapsed via ``set``):

    * a -- items positive only in ``row_positives``
    * b -- items positive in both collections (double positives)
    * c -- items positive in neither (double negatives), derived from ``N``
    * d -- items positive only in ``col_positives``

    Args:
        col_positives: Iterable of hashable items that are column-positive.
        row_positives: Iterable of hashable items that are row-positive.
        N: Total number of items in the universe (a + b + c + d).

    Returns:
        A DataFrame indexed by ``['col_neg', 'col_pos']`` with columns
        ``['row_pos', 'row_neg']``, i.e. the values ``[[a, c], [b, d]]``.

    Raises:
        ValueError: If ``N`` is smaller than the number of distinct positive
            items, which would make the double-negative count negative.
    """
    col_set = set(col_positives)
    row_set = set(row_positives)
    both = col_set & row_set

    a = len(row_set - both)  # only row positives
    b = len(both)            # double positives
    d = len(col_set - both)  # only col positives

    # Use the N argument; the double negatives are whatever is left over.
    c = N - (a + b + d)
    if c < 0:
        raise ValueError(
            f"N={N} is smaller than the number of distinct positives "
            f"({a + b + d})"
        )

    # Same orientation as building [[a, b], [c, d]] with row_* index and
    # col_* columns and then transposing it.
    return pd.DataFrame(
        [[a, c], [b, d]],
        index=['col_neg', 'col_pos'],
        columns=['row_pos', 'row_neg'],
    )

 # Usage example (ink_syms, ptm_syms, ik, pt and n_proteins are not defined in this snippet and must already exist)

table = make_contingency_table(ink_syms[ik], ptm_syms[pt], n_proteins)

# library

from scipy.stats import fisher_exact

# Wrapper around Fisher's exact test (mind the `alternative` argument)
def calc_p(table):
    """Return the one-sided Fisher's exact test p-value for a 2x2 table.

    The test is run with ``alternative='less'``. With the layout returned by
    ``make_contingency_table`` the double positives sit off the main
    diagonal, so an odds ratio below 1 corresponds to over-representation
    of the overlap.

    Args:
        table: A 2x2 table of non-negative counts accepted by
            ``scipy.stats.fisher_exact``.

    Returns:
        The p-value; the odds ratio is discarded.
    """
    result = fisher_exact(table, alternative='less')
    # The result unpacks as (odds ratio, p-value); only the p-value is used.
    return result[1]

# Usage example (ink_syms, ptm_syms, ik, pt and n_proteins are not defined in this snippet and must already exist)

table = make_contingency_table(ink_syms[ik], ptm_syms[pt], n_proteins)

calc_p(table)

任意の組合せに対してFisherの正確確率検定を実施。

# make table & tests for all inputs

def enrichment_analysis(
    inkList, ptmList, n_proteins,
    ink_syms, ptm_syms
):
    """Run Fisher's exact test for every (inkList, ptmList) combination.

    Args:
        inkList: Keys used to look up entries of ``ink_syms``.
        ptmList: Keys used to look up entries of ``ptm_syms``.
        n_proteins: Total count passed to ``make_contingency_table`` as ``N``.
        ink_syms: Mapping (or other indexable) from an ``inkList`` key to the
            collection passed as ``col_positives``.
        ptm_syms: Mapping (or other indexable) from a ``ptmList`` key to the
            collection passed as ``row_positives``.

    Returns:
        A list of lists of p-values: one inner list per ``inkList`` entry,
        with one p-value per ``ptmList`` entry, in iteration order.
    """
    return [
        [
            calc_p(
                make_contingency_table(ink_syms[ik], ptm_syms[pt], n_proteins)
            )
            for pt in ptmList
        ]
        for ik in inkList
    ]