Source code for craft.paintor

import sys
import os
import tempfile

import pandas as pd
import numpy as np

import craft.config as config

def _run_command(args):
    """Run an external command without a shell and report failures on stderr.

    Args:
        args: The command as a list of strings (executable first).

    Returns:
        The process exit status, or ``None`` if the process could not be
        started at all.

    Failures are reported but deliberately not raised: the surrounding
    pipeline has always treated the external tools as best-effort steps.
    """
    # Local import so this block does not depend on a change to the
    # file-level import list.
    import subprocess

    try:
        completed = subprocess.run(args, check=False)
    except OSError as err:
        print(f"Could not run {args[0]}: {err}", file=sys.stderr)
        return None
    if completed.returncode != 0:
        print(
            f"Command exited with status {completed.returncode}: "
            + " ".join(args),
            file=sys.stderr,
        )
    return completed.returncode


def paintor(data_dfs, index_df, output_dir="output/paintor_input/"):
    """Run PAINTOR V3.0 on summary statistics.

    Usage information is available at the PAINTOR wiki:
    https://github.com/gkichaev/PAINTOR_V3.0/wiki/2.-Input-Files-and-Formats

    The CRAFT pipeline does not implement visualisation with CANVIS (as this
    requires Python 2.7, which is near end-of-life).

    For every locus this writes a locus (Z-score) file, a variant list, an
    LD matrix (built with ldstore) and a dummy annotation file into
    ``output_dir``, then runs PAINTOR once over all loci.

    Args:
        data_dfs: Iterable of DataFrames, one per locus, each containing the
            columns ``chromosome``, ``position``, ``rsid``, ``beta``, ``se``,
            ``allele1`` and ``allele2``.
        index_df: DataFrame with one row per locus, in the same order as
            ``data_dfs``, read by integer label ``0..n-1``. The columns
            ``chromosome``, ``rsid``, ``region_start_cm`` and
            ``region_end_cm`` are used.
        output_dir: Directory that receives the PAINTOR input and output
            files. It is created if it does not exist. Defaults to the
            location that was previously hard-coded.

    Returns:
        Always ``0``. Failures of the external tools are reported on stderr
        rather than through the return value.
    """
    # Keep a trailing separator on the directory handed to PAINTOR, as the
    # previously hard-coded default had one.
    work_dir = os.path.join(output_dir, "")
    os.makedirs(work_dir, exist_ok=True)

    # The commands no longer pass through a shell, so expand "~" ourselves in
    # case the configured tool locations rely on it.
    ld_store_executable = os.path.join(
        os.path.expanduser(config.ldstore_dir), "ldstore"
    )
    paintor_executable = os.path.expanduser(f"{config.paintor_dir}") + "/PAINTOR"
    plink_dir = os.path.expanduser(config.plink_basename_dir)

    input_file_loc = os.path.join(work_dir, "input_file")

    with open(input_file_loc, "w") as input_file:
        for locus_number, data in enumerate(data_dfs):
            # Region definition for this locus comes from the matching row of
            # index_df (matched by position, not by rsid).
            chromosome = index_df.at[locus_number, 'chromosome']
            index = index_df.at[locus_number, 'rsid']
            region_start_cm = index_df.at[locus_number, 'region_start_cm']
            region_end_cm = index_df.at[locus_number, 'region_end_cm']

            # The index SNP rsid is the unique identifier for all per-locus
            # input and output files.
            locus_file = os.path.join(work_dir, index)
            variant_file = os.path.join(work_dir, index + "_variant.txt")
            plink_basename = os.path.join(
                plink_dir, f"chr{chromosome}_ld_panel"
            )
            bcor_file = os.path.join(work_dir, index + ".bcor")
            ld_file = os.path.join(work_dir, index + ".ld")
            annotation_file = os.path.join(work_dir, index + ".annotations")

            # Locus file. Z-score = beta / se (the Wald statistic).
            # .assign builds a new frame, so the caller's DataFrame is not
            # modified and pandas raises no chained-assignment warning.
            locus = data[
                ['chromosome', 'position', 'rsid', 'allele1', 'allele2']
            ].assign(ZSCORE=data['beta'] / data['se'])
            locus.to_csv(
                locus_file,
                sep=' ',
                index=False,
                header=['CHR', 'POS', 'RSID', 'ALLELE1', 'ALLELE2', 'ZSCORE'],
                float_format='%g',
            )

            # Order of SNPs in the LD file must correspond to the order in
            # the Z file, so the variant list is taken from the same frame.
            variants = locus[
                ['rsid', 'position', 'chromosome', 'allele1', 'allele2']
            ]
            variants.to_csv(
                variant_file,
                sep=' ',
                index=False,
                header=['RSID', 'position', 'chromosome', 'A_allele', 'B_allele'],
                float_format='%g',
            )

            # Make an LD file (bcor) for the region.
            _run_command([
                ld_store_executable,
                "--bplink", plink_basename,
                "--bcor", bcor_file,
                "--incl-range", f"{region_start_cm}-{region_end_cm}",
                "--n-threads", "1",
            ])

            # Make an LD matrix restricted to the rsids in this locus. The
            # "_1" suffix is the name the previous step is expected to have
            # written when run with one thread.
            _run_command([
                ld_store_executable,
                "--bcor", f"{bcor_file}_1",
                "--matrix", ld_file,
                "--incl-variants", variant_file,
            ])

            # Dummy annotation file, one row per SNP.
            # NOTE(review): the original comment said "all rows 0 to show
            # 'no annotation'" but the code has always written 1; the value
            # is kept as-is here - confirm which is intended.
            # An annotation library (large, 6.7GB download) is available from
            # PAINTOR and may be implemented in future versions.
            annotation_df = pd.DataFrame(
                1,
                index=np.arange(len(locus.index)),
                columns=['dummy_annotation'],
            )
            annotation_df.to_csv(
                annotation_file,
                sep=' ',
                index=False,
                header=['dummy_annotation'],
                float_format='%g',
            )

            # One line per locus in the PAINTOR input file.
            input_file.write(f"{index}\n")

    # Run PAINTOR, reading from and writing to the same directory.
    # May wish to add an option for max causal and enumerate
    # [number of causals].
    _run_command([
        paintor_executable,
        "-input", input_file_loc,
        "-Zhead", "ZSCORE",
        "-LDname", "ld",
        "-in", work_dir,
        "-out", work_dir,
        "-max_causal", "2",
        "-enumerate", "2",
        "-annotations", "dummy_annotation",
    ])

    return 0