# Source code for craft.annotate

import os
import re
import subprocess
import tempfile

import pandas as pd
import vcf as pyvcf

import craft.config as config
import craft.read as read

def prepare_df_annoVar(df):
    """Prepare internal dataframe as input to ANNOVAR.

    A ``position2`` column holding a copy of ``position`` is added, and the
    columns are re-ordered so that ``chromosome``, ``position``,
    ``position2``, ``allele1`` and ``allele2`` come first (those of them
    that are present), followed by every other column in its existing
    order.

    The input dataframe is left untouched; a new dataframe is returned.

    Args:
        df: dataframe with at least a ``position`` column.

    Returns:
        A new dataframe with the added ``position2`` column and the
        re-ordered columns.

    Raises:
        KeyError: if ``df`` has no ``position`` column.
    """
    # position repeats twice for input
    leading = ['chromosome', 'position', 'position2', 'allele1', 'allele2']

    # assign() builds a new frame, so neither the caller's dataframe nor a
    # slice of another dataframe is written to.
    prepared = df.assign(position2=df['position'])
    present = prepared.columns

    # leading columns first (when available), then everything else
    ordered = [col for col in leading if col in present]
    ordered += [col for col in present if col not in leading]

    return prepared[ordered]

def annotation_annoVar(df):
    """Use ANNOVAR to annotate prepared internal dataframe.

    The dataframe is written as a headerless, tab-separated file inside a
    temporary directory and handed to ``annotate_variation.pl -geneanno``
    (``-dbtype refGene -buildver hg19``) found in ``config.annovar_dir``,
    using ``config.annovar_dir``/humandb/ as the database directory.
    The ``.variant_function`` and ``.exonic_variant_function`` files that
    ANNOVAR writes next to the input are then read back with
    ``read.annovar``.

    Args:
        df: dataframe already arranged for ANNOVAR input (see
            ``prepare_df_annoVar``).

    Returns:
        The result of ``read.annovar`` for the two output files, read with
        the column names ``['var_effect', 'genes']`` followed by the
        columns of ``df``.

    Raises:
        subprocess.CalledProcessError: if ANNOVAR exits with a non-zero
            status.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        # make file in tempdir, write to file
        to_annovar = os.path.join(tempdir, "to_annovar")
        df.to_csv(to_annovar, sep='\t', index=False, header=False,
                  float_format='%g')

        # Run ANNOVAR with an argument list rather than a shell string, so
        # paths containing spaces or shell metacharacters are passed through
        # verbatim, and fail loudly if the annotation step fails.
        cmd = [
            f"{config.annovar_dir}/annotate_variation.pl",
            "-geneanno",
            "-dbtype", "refGene",
            "-buildver", "hg19",
            to_annovar,
            f"{config.annovar_dir}/humandb/",
        ]
        subprocess.run(cmd, check=True)

        # Output files written to -.variant_function, -.exonic_variant_function
        # new annotation columns come first, then the original column names
        colnames = ['var_effect', 'genes'] + list(df.columns)

        # read the temp output files back in before the directory is removed
        annotated = read.annovar(to_annovar + ".variant_function",
                                 to_annovar + ".exonic_variant_function",
                                 colnames)
    return annotated

def finemap_annotation_annoVar(cred_snps, locus_df):
    """Use ANNOVAR to annotate prepared .cred FINEMAP output.

    FINEMAP outputs a .cred space-delimited text file. It contains the
    95% credible sets for each causal signal conditional on other causal
    signals in the genomic region together with conditional posterior
    inclusion probabilities for each variant.

    This function:
    Filters the locus dataframe (containing all summary statistic
    information, including chromosome, position, allele 1, allele2),
    using a list of rsids obtained from the .cred file.

    Uses the prepare_df_annoVar function from craft.annotate to process
    the new dataframe as ANNOVAR input.

    Uses ANNOVAR to add gene-based annotation to the prepared input,
    using annotate_variation.pl.

    Reads the final ANNOVAR output back and returns it as an internal
    dataframe, removes unnecessary columns, and merges it using rsid as
    an index to add the posterior probability from the original .cred
    file.

    Args:
        cred_snps: dataframe read from the .cred file. Its first column is
            used as the list of rsids; it must also have a column named
            ``rsid`` and provide the ``pp`` column used for sorting.
        locus_df: dataframe of locus summary statistics with an ``rsid``
            column.

    Returns:
        The annotated credible SNPs merged with ``cred_snps`` on ``rsid``,
        sorted by ``pp`` in descending order.

    Raises:
        subprocess.CalledProcessError: if ANNOVAR exits with a non-zero
            status.
    """
    # make a list of rsids in credible SNP set (taken from the first column)
    rsid_list = list(cred_snps[cred_snps.columns[0]])
    # select locus DF information about rsids in credible SNP set; copy so
    # that adding the position2 column never writes into a slice of the
    # caller's dataframe
    locus_df = locus_df[locus_df['rsid'].isin(rsid_list)].copy()
    cred_snps_prepared = prepare_df_annoVar(locus_df)

    with tempfile.TemporaryDirectory() as tempdir:
        # make file in tempdir
        to_annovar = os.path.join(tempdir, "to_annovar")
        cred_snps_prepared.to_csv(to_annovar, sep='\t', index=False,
                                  header=False, float_format='%g')

        # Run ANNOVAR with an argument list rather than a shell string, so
        # paths containing spaces or shell metacharacters are passed through
        # verbatim, and fail loudly if the annotation step fails.
        cmd = [
            f"{config.annovar_dir}/annotate_variation.pl",
            "-geneanno",
            "-dbtype", "refGene",
            "-buildver", "hg19",
            to_annovar,
            f"{config.annovar_dir}/humandb/",
        ]
        subprocess.run(cmd, check=True)

        # read back in my temp output files as a dataframe with column names
        # NOTE(review): this fixed list assumes the prepared locus dataframe
        # has exactly these columns in this order -- confirm against caller.
        colnames = ['var_effect', 'genes', 'chromosome', 'position',
                    'position2', 'allele1', 'allele2', 'rsid', 'all_total',
                    'cases_total', 'controls_total', 'maf', 'pvalue', 'beta',
                    'se', 'index_rsid', 'ABF', 'pp']
        df = read.annovar(to_annovar + ".variant_function",
                          to_annovar + ".exonic_variant_function", colnames)

    # Drop unnecessary columns from locus SNPs dataframe before merge; 'pp'
    # is re-introduced from cred_snps by the merge below.
    df = df.drop(['position2', 'ABF', 'pp'], axis=1)
    cred_snps = cred_snps.set_index('rsid')
    df = pd.merge(df, cred_snps, how='left', on='rsid')
    df = df.sort_values('pp', ascending=False)
    return df