Source code for craft.abf

import numpy as np
import pandas as pd

from scipy.stats import norm

[docs]def calc_abf(pval, maf, n, n_controls, n_cases):
    """Calculate Approximate Bayes Factor.

        (Wakefield, 2009, Genet Epidemiol.)
        Based on Chris Wallace work
    Args:
        pval (float): GWAS p-value
        maf (float): Minor allele freq
        n (int): Sample size
        n_controls: Number of controls
        n_cases: Number of cases
    Returns:
        ABF

    **Usage**
    For a binary trait as no exact Bayes factor is calculable.
    For case-control studies only.

    **Limitations**
    Assumes causal variant is included in SNP population (i.e. requires relatively densely genotyped SNPs.)
    Assumes single causal variant only.
    """

    # Assert/set types
    pval = float(pval)
    maf = float(maf)
    n = int(n)
    n_controls = int(n_controls)
    n_cases = int(n_cases)

    # Variant of ABF calculation that uses p-values
    # Calculate Z-score
    z = np.absolute(norm.ppf(pval / 2))

    # Multiplicative model - assumes if 2 copies, 'twice as much' risk if have one copy.
    x0 = 0
    x1 = 1
    x2 = 2

    # No idea what this does
    d1 = (1-maf)**2 * x0 + 2*maf*(1-maf)*x1 + maf**2 * x2
    d2 = (1-maf)**2 * x0**2 + 2*maf*(1-maf)*x1 + maf**2 * x2**2
    V = (n_controls + n_cases) / (n_controls * n_cases * (d2-d1**2))

    # Compute W (no idea what this is either)
    # assumption: relative risk = 1.5
    # null hypothesis is that relative risk = 1
    # ppf = how many standard deviations is 99%
    W = (np.log(1.5) / norm.ppf(0.99))**2
    VW = V + W # simplification for ABF equation

    # Wakefield's approximate Bayes factor calculation (2009)
    ABF = np.sqrt(VW/V) * np.exp(- z**2 * W / (2 * VW))

    # Kass & Raftery (1995): 2 ln ABF allows comparison / rough interpretation of ABF meaning.
    # This version is not taken because it is not compatible
    #ln_2_ABF = -2 * np.log(ABF)

    return ABF

[docs]def calc_postprob(data):
    """ Calculate posterior probability for each SNP.

    Method taken from Maller et al. 2012 'Bayesian refinement of association signals for 14 loci in 3 common diseases.' (supplementary note).
    This assumes there is 1 causal SNP in the region, and that the prior probability is that any SNP in the region is equally likely to be that causal SNP.
    """
    sum_ABF = data['ABF'].sum()
    for index, row in data.iterrows():
        data['pp'] = data['ABF'] / sum_ABF
    return data

def abf(data_dfs, cred_threshold):
    data_list = []
    for data in data_dfs:
        data['ABF'] = data.apply(
            lambda row: calc_abf(pval=row['pvalue'],
                                maf=row['maf'],
                                n=row['all_total'],
                                n_controls=row['controls_total'],
                                n_cases=row['cases_total']), axis=1)
        data = calc_postprob(data)
        data = data.sort_values('pp', ascending=False)
        data['cpp'] = data.pp.cumsum()
    # Trim credible SNPs based on posterior probability threshold
        if cred_threshold == '95':
            count = sum(data.cpp < 0.95)
        if cred_threshold =='99':
            count = sum(data.cpp < 0.99)
        data = data.head(count+1)
        data_list.append(data)
    return data_list