import glob
import os
import pandas as pd
import numpy as np
[docs]def snptest(file):
""" Read snptest data into an internal dataframe. """
cols = ['chromosome','alleleA','alleleB','rsid','position','all_total', 'cases_total','controls_total','all_maf','frequentist_add_pvalue',
'frequentist_add_beta_1', 'frequentist_add_se_1']
df = pd.read_csv(file, sep=' ', comment='#')[cols]
df.rename(columns={'all_maf':'maf','frequentist_add_pvalue':'pvalue', 'frequentist_add_beta_1':'beta', 'frequentist_add_se_1':'se','alleleA':'allele1','alleleB':'allele2'}, inplace=True)
return df
[docs]def plink(file, frq_file):
""" Read plink (.assoc.logistic) data into an internal dataframe. """
# read .assoc.logistic file
cols = ['CHR','A1','SNP','BP','P','SE','OR']
df = pd.read_csv(file, sep='\s+')[cols]
df.rename(columns={'CHR':'chromosome','SNP':'rsid','BP':'position','A1':'allele1','P':'pvalue','SE':'se'}, inplace=True)
# For finemap, we need the beta coefficient. For a binary logistic regression, ln(OR) = beta coefficient.
for index, row in df.iterrows():
df['beta'] = np.log(df['OR'])
df.drop(columns='OR')
# read .frq.cc file
cols = ['CHR','SNP','A2','MAF_A','MAF_U','NCHROBS_A', 'NCHROBS_U']
frq_df = pd.read_csv(frq_file, sep='\s+')[cols]
# takes MAF_U as reflects 'unaffected' population controls, unless MAF_U > 0.5, in which case it uses MAF_A
frq_df.rename(columns={'CHR':'chromosome','SNP':'rsid','A2':'allele2','MAF_U':'maf','NCHROBS_A':'cases_total','NCHROBS_U':'controls_total'}, inplace=True)
frq_df['maf'].values[frq_df['maf'] > 0.5] = 0.5
# if chromosome column has more than 1 number, read chromosome number from .assoc.logistic file and only include rows with that value.
chromosomes = df.chromosome.unique()
frq_df = frq_df[frq_df['chromosome'].isin(chromosomes)]
frq_df = frq_df.drop(columns={"chromosome", "MAF_A"}, axis=1)
# create an all_total column
for index, row in frq_df.iterrows():
frq_df['all_total'] = frq_df['cases_total'] + frq_df['controls_total']
# merge based on rsid
df = pd.merge(df, frq_df, how='inner',on='rsid')
# Rearrange column order after merge to match SNPtest format
order = ['chromosome','allele1','allele2','rsid','position','all_total', 'cases_total','controls_total','maf','pvalue', 'beta', 'se']
df = df[order]
return df
[docs]def csv(file):
"""Read csv data into an internal dataframe. """
cols = ['chromosome','allele1','allele2','rsid','position','all_total', 'cases_total','controls_total','maf','pvalue', 'beta', 'se']
df = pd.read_csv(file, sep='\t')[cols]
return df
[docs]def maps(source_dir):
""" Read genetic map data into a maps object. """
map_file_list = glob.glob(source_dir + '/*chr[0-9]*.txt')
maps = {}
for file in map_file_list:
map_file = pd.read_csv(file, sep='\t')
chromosome = map_file['Chromosome'].ix[0].strip('chr')
maps[chromosome] = map_file
return maps
[docs]def annovar(file, file_exonic, colnames):
""" Read ANNOVAR output files into an internal dataframe.
Gene annotation with ANNOVAR returns two different output files (variant_function and exonic_variant_function).
Where exonic SNPs exist, we merge the additional data of exonic variant function, and genes + transcript ID + protein-level change into the dataframe based on matching rsids.
"""
df = pd.DataFrame(columns=colnames)
if os.path.getsize(file) != 0:
df = df.append(pd.read_csv(file, sep='\t', names = colnames))
if os.path.getsize(file_exonic) != 0:
df2 = pd.read_csv(file_exonic, sep='\t', names = colnames, usecols=range(1,(len(colnames) + 1)))
df2 = df2.filter(items=['var_effect','genes','rsid'], axis=1)
df2.rename(columns={'var_effect':'exonic_variant_function', 'genes':'genes_transcriptID'}, inplace=True)
df2 = df2.set_index('rsid')
df = pd.merge(df, df2, how='left',on='rsid')
return df
[docs]def index(file):
""" Read CRAFT .index output file into a dataframe."""
index_df = pd.read_csv(file, sep='\t')
return index_df
[docs]def abf_cred(file):
"""Read CRAFT .abf.cred file into a dataframe."""
cred_snps = pd.read_csv(file, sep='\t')
return cred_snps
[docs]def finemap_cred(file):
"""Read FINEMAP .cred file into a dataframe."""
cred_snps = pd.read_csv(file, sep=' ')
no_cols = int((len(cred_snps.columns) - 2)/ 2)
cred_dfs = []
for i in range(no_cols):
cred_df = cred_snps[cred_snps.columns[2*i + 1: 2*i + 3]]
cred_df = cred_df.rename(columns={cred_df.columns[0]:"rsid", cred_df.columns[1]:"pp"})
cred_df = cred_df.dropna(axis=0)
cred_dfs.append(cred_df)
return cred_dfs
[docs]def cred_annotated(file):
"""Read CRAFT .cred.annotated file into a dataframe."""
cred_df = pd.read_csv(file, sep='\t')
return cred_df
[docs]def ld(file):
""" Read CRAFT .ld output file into a numpy array."""
ld_array = np.loadtxt(file)
return ld_array
[docs]def variant_file(file):
""" Read CRAFT variant_file rsids into a list."""
variant_df = pd.read_csv(file, sep=' ')
return variant_df
[docs]def snp(file):
"""Read FINEMAP .snp file into a dataframe."""
snp_df = pd.read_csv(file, sep=' ')
snp_df.rename(columns={'prob' : 'pp'}, inplace=True)
return snp_df