Source code for craft.annotate

import os
import re
import subprocess
import tempfile

import pandas as pd
import vcf as pyvcf

import craft.config as config
import craft.read as read

def prepare_df_annoVar(df):
    """Prepare internal dataframe as input to ANNOVAR.

    A ``position2`` column holding a copy of ``position`` is added, and the
    columns are re-ordered so that ``chromosome``, ``position``,
    ``position2``, ``allele1`` and ``allele2`` (whichever of them exist) come
    first, in that order. All remaining columns follow in their original
    order.

    The dataframe passed in is left untouched; a new dataframe is returned.

    Args:
        df: dataframe with at least a ``position`` column.

    Returns:
        A new dataframe with the added ``position2`` column and the column
        order described above.

    Raises:
        KeyError: if ``df`` has no ``position`` column.
    """
    # Position is repeated twice for the ANNOVAR input (presumably as the
    # start and end coordinate -- confirm against the ANNOVAR input format).
    # assign() builds a new frame, so the caller's dataframe is not mutated
    # and no SettingWithCopyWarning is raised when ``df`` is a slice.
    prepared = df.assign(position2=df['position'])

    leading = ['chromosome', 'position', 'position2', 'allele1', 'allele2']
    present = [col for col in leading if col in prepared.columns]
    trailing = [col for col in prepared.columns if col not in leading]

    # Leading ANNOVAR columns first, everything else after them.
    return prepared[present + trailing]
def annotation_annoVar(df):
    """Use ANNOVAR to annotate prepared internal dataframe.

    The dataframe is written as a tab-separated, header-less file into a
    temporary directory and passed to ANNOVAR's ``annotate_variation.pl``
    for gene-based annotation (``-geneanno -dbtype refGene -buildver hg19``)
    against ``<annovar_dir>/humandb/``. The resulting ``.variant_function``
    and ``.exonic_variant_function`` files are read back with
    ``read.annovar``.

    Args:
        df: dataframe already laid out as ANNOVAR input (see
            ``prepare_df_annoVar``).

    Returns:
        Whatever ``read.annovar`` returns for the two output files, using the
        column names ``['var_effect', 'genes']`` followed by the columns of
        ``df``.

    Raises:
        subprocess.CalledProcessError: if ANNOVAR exits with a non-zero
            status.
    """
    # The command used to go through a shell, which expanded ``~`` and
    # environment variables in the configured path; keep that working.
    annovar_dir = os.path.expanduser(
        os.path.expandvars(str(config.annovar_dir)))

    with tempfile.TemporaryDirectory() as tempdir:
        # make file in tempdir, write to file
        to_annovar = os.path.join(tempdir, "to_annovar")
        df.to_csv(to_annovar, sep='\t', index=False, header=False,
                  float_format='%g')

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact; check=True surfaces an ANNOVAR failure here
        # instead of as a confusing read error further down.
        cmd = [
            f"{annovar_dir}/annotate_variation.pl",
            "-geneanno",
            "-dbtype", "refGene",
            "-buildver", "hg19",
            to_annovar,
            f"{annovar_dir}/humandb/",
        ]
        subprocess.run(cmd, check=True)

        # ANNOVAR writes <input>.variant_function and
        # <input>.exonic_variant_function; the two annotation columns come
        # first, followed by the original input columns.
        colnames = ['var_effect', 'genes'] + list(df.columns)

        # Must be read before the temporary directory is removed.
        return read.annovar(to_annovar + ".variant_function",
                            to_annovar + ".exonic_variant_function",
                            colnames)
def finemap_annotation_annoVar(cred_snps, locus_df):
    """Use ANNOVAR to annotate prepared .cred FINEMAP output.

    FINEMAP outputs a .cred space-delimited text file. It contains the 95%
    credible sets for each causal signal conditional on other causal signals
    in the genomic region together with conditional posterior inclusion
    probabilities for each variant.

    This function:

    1. Filters the locus dataframe (containing all summary statistic
       information, including chromosome, position, allele 1, allele 2) using
       the rsids taken from the first column of ``cred_snps``.
    2. Uses ``prepare_df_annoVar`` to lay the filtered dataframe out as
       ANNOVAR input.
    3. Runs ANNOVAR's ``annotate_variation.pl`` to add gene-based annotation
       to the prepared input.
    4. Reads the ANNOVAR output back as a dataframe, drops the
       ``position2``, ``ABF`` and ``pp`` columns, left-merges ``cred_snps``
       on ``rsid`` and sorts by ``pp`` in descending order.

    Args:
        cred_snps: dataframe of the credible SNP set; its first column holds
            the rsids and it must have an ``rsid`` column (and, for the final
            sort, a ``pp`` column).
        locus_df: dataframe of summary statistics for the locus, with an
            ``rsid`` column.

    Returns:
        The annotated credible-set dataframe sorted by ``pp``, highest first.

    Raises:
        subprocess.CalledProcessError: if ANNOVAR exits with a non-zero
            status.
    """
    # The command used to go through a shell, which expanded ``~`` and
    # environment variables in the configured path; keep that working.
    annovar_dir = os.path.expanduser(
        os.path.expandvars(str(config.annovar_dir)))

    with tempfile.TemporaryDirectory() as tempdir:
        # make a list of rsids in credible SNP set
        rsid_list = list(cred_snps[cred_snps.columns[0]])

        # Select locus information about rsids in the credible SNP set.
        # Copy the filtered slice so later column additions never write
        # through to (or warn about) the caller's dataframe.
        locus_df = locus_df[locus_df['rsid'].isin(rsid_list)].copy()
        cred_snps_prepared = prepare_df_annoVar(locus_df)

        # make file in tempdir
        to_annovar = os.path.join(tempdir, "to_annovar")
        cred_snps_prepared.to_csv(to_annovar, sep='\t', index=False,
                                  header=False, float_format='%g')

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact; check=True surfaces an ANNOVAR failure here
        # instead of as a confusing read error further down.
        cmd = [
            f"{annovar_dir}/annotate_variation.pl",
            "-geneanno",
            "-dbtype", "refGene",
            "-buildver", "hg19",
            to_annovar,
            f"{annovar_dir}/humandb/",
        ]
        subprocess.run(cmd, check=True)

        # Column names for the ANNOVAR output: the two annotation columns
        # followed by the expected layout of the prepared locus dataframe.
        # NOTE(review): this list is hard-coded, so it assumes locus_df always
        # carries exactly these columns in this order -- confirm with callers.
        colnames = ['var_effect', 'genes', 'chromosome', 'position',
                    'position2', 'allele1', 'allele2', 'rsid', 'all_total',
                    'cases_total', 'controls_total', 'maf', 'pvalue', 'beta',
                    'se', 'index_rsid', 'ABF', 'pp']

        # Must be read before the temporary directory is removed.
        df = read.annovar(to_annovar + ".variant_function",
                          to_annovar + ".exonic_variant_function",
                          colnames)

        # Drop unnecessary columns from locus SNPs dataframe before merge;
        # 'pp' is dropped here and the final sort uses the 'pp' that the
        # merge brings in from cred_snps.
        df = df.drop(['position2', 'ABF', 'pp'], axis=1)
        cred_snps = cred_snps.set_index('rsid')
        df = pd.merge(df, cred_snps, how='left', on='rsid')
        df = df.sort_values('pp', ascending=False)
        return df