### OASIS - A Novel GWAS Analysis Method ### Copyright (C) GPL-3.0 (2016) Dr. Mohammad Saeed ## Citation: Mohammad Saeed (2017). Novel linkage disequilibrium clustering algorithm identifies new lupus genes on meta-analysis of GWAS datasets. Immunogenetics. ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## The GNU General Public License can be accessed at https://opensource.org/licenses/GPL-3.0. # Installing libraries # Import libraries # Download these libraries: numpy, matplotlib, six, dateutil, pyparsing, ptz # http://stackoverflow.com/users/5179477/mohammad-saeed # Then click on the link: Can't install Matplotlib for Python from __future__ import print_function from datetime import date import math import numpy as np # comment out next two lines if you wish to view the graphs instead of saving them import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt ## User Info print ('OASIS - A Novel GWAS Analysis Method') print ('Copyright (C) GPL-3.0 (2016) Dr. Mohammad Saeed') print ('Contact: rheumdocpk@gmail.com or saeed.khan@arkanalabs.com') print ('') print ('Please cite OASIS paper in published works as:') print ('Mohammad Saeed (2017). Novel linkage disequilibrium clustering algorithm identifies new lupus genes on meta-analysis of GWAS datasets. Immunogenetics.') print ('') print ('The GNU General Public License can be accessed at https://opensource.org/licenses/GPL-3.0.') print ('') print ('') print ('For OASIS you need these libraries: numpy, matplotlib, six, dateutil, pyparsing, ptz') print ('If you have any difficulty please refer to http://stackoverflow.com/users/5179477/mohammad-saeed') print ("For detailed instructions click on the link: Can't install Matplotlib for Python") print ('') print ('') print ('OASIS input file (.csv) has to be in the following format:') print ('The .csv Excel sheet has to sorted according to chromosome and position.') print ('Data has to be in the following 4 columns (without header)') print ('Chromosome, SNP, Position (bp), P-value') print ('') print ('') print ('OASIS Module 3 input file (.csv) has to be in the following format:') print ('This input file is prepared by merging QRD.txt files for two GWAS from OASIS Analysis (Modules 1 & 2)') print ('This is the reason for performing OASIS Analysis in separate folders by placing the OASIS program file in them') print ('The file has to have a single line header with the variables below:') print ('Serial, Dataset, Initial_chr, Initial_loc, Initial_SNP, End_loc, End_SNP, Max_SNP, Max_-log(p), OASIS, Oasis_percent, Quadrant') print ('') print ('') print ('Please be advised that data entry is generally case sensitive (capital vs simple letters)') print ('') print ('') ## Global variables # A Dictionary (var) that holds as keys the basepair locations of SNPs in a single chromosome # A List (chrom) that holds the numbers of chromosomes whose data is present in the input file chrom = [] var = {} # glop and goasis are lists for storing the entire gwas data of these two variables for determining the graph axes later glop = [] goasis = [] # These are chromosome specific lists for storing the relevant variables and are nullified at each cycle (reading of one chromosome) snp_pos = [] lop = [] oasis_scr = [] chrom_rng = range(1, 23) # These variables are for labeling chromosome graphs. They do not have any calculation function though bar* are generated by mean+sd calculations on glop/goasis c = 1 bar_lop = 0 bar_oasis = 0 ## OASIS Module 1 # Define a function 'ch_chk' which checks i.e reads the number of chromosomes in the PLINK / dbGAP GWAS file # It stores these numbers in the list called chrom def ch_chk(): global chrom fgwas = open (fname) for line_chk in fgwas: line_chk = line_chk.rstrip() # Splits the read line from the input file into 'variables' var_chk = line_chk.split(',') # Appends the chromosome number to the list 'chrom' as the input file is read if var_chk[0] not in chrom: chrom.append(var_chk[0]) fgwas.close() return chrom ## Define a 'read_file' function which: # Reads the input file line by line, identifies the variables and # Store variables in a dictionary called var with location as key # Thus all relevant data for each chromosome is stored in the dictionary 'var' - this process is iterated over the list chrom # Therefore the dictionary var is made for one chromosome at a time (and reset after 'oasis_process') # The function 'oasis_process' is called to process data in the dictionary 'var' one chromosome at a time def read_file(): # Imports the chromsome list 'chrom' and the dictionary 'var' global var, chrom # Iterates over the list of chromosomes i.e stores all relevant data for one chromosome at a time in the dictionary 'var' for chr_n in chrom: chint = int(chr_n) fgwas = open (fname) for line in fgwas: line = line.rstrip() variant = line.split(',') chno = int(variant[0]) # At its turn each chromosome's relevant data (chromosome number, snp, chi square, p-value, odd ratio) is stored agaist the key (pos) if chno == chint: pos = int(variant[2]) var[pos] = (variant[0], variant[1], variant[3]) # chromosome, snp, position, p-value fgwas.close() print (" Chromosome: ", chint) # Calls the function oasis_process to perform the required calculations oasis_process() # Resets the chromosome dictionary 'var' var = {} return ## Process the variables in the chromosome dictionary as oasis blocks # This is the main OASIS calculator def oasis_process(): # Imports the chromosome dictionary and sets local variables global var oasis = 0 epos = 0 total_snp = 0 last_pos = 0 block = {} # Main Loop for reading the chromosome dictionary sorted by the basepair location key (pos) for pos, (chm, snp, pval) in sorted(var.items()): loc = int(pos) p = float(pval) # Loop to calculate oasis within the window if epos > 0 and loc <= epos: if p <= 0.05: total_snp += 1 oasis += 1 block[p] = (snp) # Continue to add number of SNPs for final calculation of Oasis_percent elif p > 0.05: total_snp += 1 # Loop activated when window has ended # It prints the end of block stats to the file # It also resets all local variables elif epos > 0 and loc > epos: mx_p = min(block) mx_snp = block[mx_p] mx_lg_p = math.log(mx_p, 10)*-1 oasis_pc = (oasis *100 / total_snp) print (pos, snp, mx_snp, mx_lg_p, oasis, oasis_pc, file=fout) last_pos = loc total_snp = 0 oasis = 0 epos = 0 block = {} # Initiates OASIS block and determines window size # This happens only on a new read from 'var' ensured by the expression 'last_pos != loc' if epos == 0 and p <= 0.05 and last_pos != loc: bpos = loc epos = loc + win*10**3 print (chm, pos, snp, end=' ', file=fout) total_snp += 1 oasis += 1 block[p] = (snp) # Activated for the last oasis block in the chromosome dictionary when the block is less than the window size if oasis > 0: mx_p = min(block) mx_snp = block[mx_p] oasis_pc = (oasis *100 / total_snp) print (pos, snp, mx_snp, mx_lg_p, oasis, oasis_pc, file=fout) ## OASIS Module 2 ## Generate data from the GWAS dataset for the graph function def xy(): global snp_pos, oasis_scr, lop, c, bar_lop, bar_oasis, glop, goasis, f_result # Iterates over the list of chromosomes i.e stores all relevant data for one chromosome at a time in respective lists print ("Creating graphs for:") print ("") for ch_no in chrom_rng: c = ch_no foasis = open (f_result) head = 7 while head>0: foasis.readline() head = head -1 for line in foasis: line = line.rstrip() v = line.split() chno = int(v[0]) # At its turn each chromosome's relevant data (chromosome position, p-value, oasis) is stored against their respective lists l = int(v[1]) p = float(v[6]) b = int(v[7]) glop.append(p) goasis.append(b) if chno == ch_no: ## In case you want to limit the oasis_scr or lop data you can arbitrary set cut offs here ## if p >= 1 or b >= 1: snp_pos.append(l) lop.append(p) oasis_scr.append(b) foasis.close() # Calculates the 3-sigma rule cut off for -log[p] and oasis_scr mlop = np.mean(glop) sdlop = np.std(glop) moasis = np.mean(goasis) sdoasis = np.std(goasis) r3s_lop = mlop + (3*sdlop) r3s_oasis = moasis + (3*sdoasis) bar_lop = float("{0:.1f}".format(r3s_lop)) bar_oasis = float("{0:.1f}".format(r3s_oasis)) # Calls the graph function to generate the graphs for each chromosome print (" Chromosome: ", c) graph() plt.close() # Resets the chromosome lists snp_pos = [] lop = [] oasis_scr = [] return snp_pos, lop, oasis_scr, c, bar_lop, bar_oasis, glop, goasis ## Create graphs for OASIS and -log[p] data def graph(): global snp_pos, oasis_scr, lop, bar_lop, bar_oasis x = max(snp_pos)+10**7 y = max(glop)+1 plt.figure() plt.subplot(211) plt.plot(snp_pos, lop, 'bo') plt.axis([0, x, 0, y]) plt.plot((0, x), (bar_lop, bar_lop), 'm--') plt.title('Chromosome: %s' % c) plt.grid(True) plt.ylabel('- log [P]') plt.subplot(212) yy = max(goasis)+1 plt.plot(snp_pos, oasis_scr, 'ro') plt.axis([0, x, 0, yy]) plt.plot((0, x), (bar_oasis, bar_oasis), 'm--') plt.grid(True) plt.xlabel('Position (bp)') plt.ylabel('OASIS') plt.tight_layout() # comment out next line if you wish to view the graphs instead of saving them plt.savefig('chr%s' % c) plt.figure() plt.title('Chromosome: %s' % c) plt.scatter(oasis_scr, lop, c = u'r') plt.axis([0, yy, 0, y]) plt.plot((bar_oasis, bar_oasis), (0, y), 'm--') plt.plot((0, yy), (bar_lop, bar_lop), 'm--') plt.grid(True) plt.xlabel('OASIS') plt.ylabel('- log [P]') # comment out next line if you wish to view the graphs instead of saving them plt.savefig('chr%s_scatter' % c) # New code for full GWAS scatter plot plt.figure() plt.title(r'GWAS (3$\sigma$) scatter plot') plt.scatter(goasis, glop, c=u'b', marker='s') plt.axis([0, yy, 0, y]) plt.plot((bar_oasis, bar_oasis), (0, y), 'r--') plt.plot((0, yy), (bar_lop, bar_lop), 'r--') plt.grid(True) plt.xlabel('OASIS') plt.ylabel('- log [P]') # comment out next line if you wish to view the graphs instead of saving them plt.savefig('GWAS_scatter') plt.show() plt.close() ## Open OASIS results file and categorize Quadrants A, B, C def quads(): foasis = open (f_result) head = 7 while head>0: foasis.readline() head = head -1 fqrd= open ("QRD.txt", 'w') print ("OASIS outfile file:", f_result, file=fqrd) print ('Stats: 3-sigma(-log[p]):', bar_lop, ' 3-sigma(oasis):', bar_oasis, file=fqrd) print ('Axis_loc', 'Initial_chr', 'Initial_loc', 'Initial_SNP', 'End_loc', 'End_SNP', 'Max_SNP', 'Max_-log(p)', 'OASIS', 'Oasis_percent', 'Quadrant', file=fqrd) a_axis = [] b_axis = [] c_axis = [] a_qrd = [] b_qrd = [] c_qrd = [] for line in foasis: line = line.rstrip() var = line.split() cr = float(var[0]) iloc = float(var[1]) xloc = float(cr + (iloc / 10**9)) lp = float(var[6]) ob = int(var[7]) if lp >= bar_lop and ob >= bar_oasis: print (xloc, line, "A", file=fqrd) a_qrd.append(lp) a_axis.append(xloc) elif lp >= bar_lop: print (xloc, line, "B", file=fqrd) b_qrd.append(lp) b_axis.append(xloc) elif ob >= bar_oasis: print (xloc, line, "C", file=fqrd) c_qrd.append(lp) c_axis.append(xloc) foasis.close() fqrd.close() amax = max(a_qrd)+1 bmax = max(b_qrd)+1 cmax = max(c_qrd)+1 ymax = max(amax, bmax, cmax) plt.figure() plt.subplot(311) plt.title(r'Quadrant A: Oasis and p-value >> 3$\sigma$') plt.plot(a_axis, a_qrd, 'ro') plt.axis([0, 23, 0, ymax]) plt.grid(b=True, which='major', color='k', linestyle='-') plt.grid(b=True, which='minor', color='k', linestyle='-', alpha=0.2) plt.minorticks_on() plt.ylabel('-log[p]') plt.subplot(312) plt.title(r'Quadrant B: p-value >> 3$\sigma$') plt.plot(b_axis, b_qrd, 'go') plt.axis([0, 23, 0, ymax]) plt.grid(b=True, which='major', color='k', linestyle='-') plt.grid(b=True, which='minor', color='k', linestyle='-', alpha=0.2) plt.minorticks_on() plt.ylabel('-log[p]') plt.subplot(313) plt.title(r'Quadrant C: Oasis >> 3$\sigma$') plt.plot(c_axis, c_qrd, 'bo') plt.axis([0, 23, 0, ymax]) plt.grid(b=True, which='major', color='k', linestyle='-') plt.grid(b=True, which='minor', color='k', linestyle='-', alpha=0.2) plt.minorticks_on() plt.xlabel('Chromosome position') plt.ylabel('-log[p]') plt.tight_layout() plt.savefig('GWAS_Quadrants') plt.show() ## OASIS Module 3 # Module 3 is the program Maplink # Maplink allows users to group together OASIS results from two GWAS datasets into a single html table and highlighting (*) regions from different datasets within 200kb # Maplink has to be run directly after loading this OASIS program by calling this function in the Python shell thus: maplink() # The input file for Maplink is a .csv table combining QRD results from two GWAS datasets (this file has to be prepared manually by merging two QRD.txt files in Excel) # Two columns have to be added to this .csv file viz 'Serial' and 'Dataset' # The 'Dataset' symbols should be inserted after pasting the two QRD files into a single sheet (one dataset QRD results stacked over the other) # The file should be sorted according to chromosome and position and then numbered serially in the 'Serial column' def maplink(): # Inserting Mapview hyperlinks to OASIS results (Composite QRD Excel (.csv) sheet) ## Open file and read content line by line print ("") fn_qrd = raw_input('Please enter name of file (.csv) with composite OASIS (QRD) GWAS results: ') qrd_name = str(fn_qrd+'.csv') fmap = open (qrd_name, 'r') fmap.readline() ## Store values in variables (dictionary) oamap = {} title = ['Serial', 'Dataset', 'Initial_chr', 'Initial_loc', 'Initial_SNP', 'End_loc', 'End_SNP', 'Max_SNP', 'Max_-log(P)', 'OASIS', 'Oasis%', 'Quadrant', 'Select', 'Mapview_link'] db1 = 0 db2 = 0 last_pos1 = 0 last_pos2 = 0 select = "" slt = 0 how_far=0 chrm = 1 for hit in fmap: hit = hit.rstrip() var = hit.split(',') serial = float(var[0]) oamap[serial] = (var[1], var[2], var[3], var[4], var[5], var[6],var[7], var[8], var[9], var[10], var[11]) fmap.close() ## Links fn_html = raw_input('Please enter name of HTML output file (with Mapview links): ') html_name = str(fn_html+'.html') fhtml = open (html_name, 'w') datcode = raw_input('Please enter the code for the dataset to be highlighted in the HTML table: ') total_hit = (len(oamap)) # prints the total number of oasis hits / records series = oamap.keys() series.sort() part1 ="http://www.ncbi.nlm.nih.gov/projects/mapview/maps.cgi?TAXID=9606&CHR=" part2="&MAPS=genes-r%2Csnp-r&QUERY=" part3="%2BOR%2B" part4="&BEG=" part5="M&END=" part6="M&oview=default" # Preparing HTML file headers print ('', file=fhtml) print ('
', file=fhtml) print ('', file=fhtml) print ('', file=fhtml) print ('', file=fhtml) # Printing the data to the HTML file as a Table serial = 1 for ser in series: region = oamap[ser] beg=(float(region[2])/10**6)-1 begr = round(beg, 1) begn = str(begr) enz=(float(region[4])/10**6)+1 enzr=round(enz, 1) endz = str(enzr) maplnk = part1+region[1]+part2+region[3]+part3+region[5]+part3+region[6]+part4+begn+part5+endz+part6 snp_pos = int(region[4]) if serial == 1: db1 = region[0] last_pos1 = snp_pos how_far = 5*10**6 elif serial == 2: db2 = region[0] last_pos2 = snp_pos how_far = last_pos2 - last_pos1 else: db2 = region[0] last_pos2 = snp_pos how_far = last_pos2 - last_pos1 if chrm == region[1]: if db1 != db2 and how_far < 2*10**6: select = "**" slt += 1 else: chrm = region[1] how_far = 5*10**6 if region[0]==datcode: print ('', file=fhtml) else: print ('', file=fhtml) serial += 1 select = "" if serial > 2: db1 = db2 last_pos1 = last_pos2 total_select = str(slt) total_hits = str(total_hit) print ('

OASIS

A Novel GWAS Meta-Analysis Algorithm by Dr. Mohammad Saeed

'+title[0]+''+title[1]+''+title[2]+''+title[3]+''+title[4]+''+title[5]+''+title[6]+''+title[7]+''+title[8]+''+title[9]+''+title[10]+''+title[11]+''+title[12]+''+title[13]+'
'+str(serial)+''+region[0]+''+region[1]+''+region[2]+''+region[3]+''+region[4]+''+region[5]+''+region[6]+''+region[7]+''+region[8]+''+region[9]+''+region[10]+''+select+''+'map'+'
'+str(serial)+''+region[0]+''+region[1]+''+region[2]+''+region[3]+''+region[4]+''+region[5]+''+region[6]+''+region[7]+''+region[8]+''+region[9]+''+region[10]+''+select+''+'map'+'
', file=fhtml) print ('Regions replicated: '+total_select+'', file=fhtml) print ('Total Hits: '+total_hits+'', file=fhtml) fhtml.close() raise SystemExit() ## Command and control function # It does not contain any significant code but calls the above functions in an ordered fashion # Check if User wishes to use Module 3 only module3 = raw_input('Do you wish to use Module 3 (Maplink) [M3] or proceed to OASIS Analysis [M1] (M1 OR M3): ') if module3 == "M3" or module3 == "m3": maplink() print ("") # Open the PLINK / dbGAP GWAS association results file using User input fn = raw_input('Please enter name of file (.csv) with PLINK / dbGAP GWAS association results: ') fname = str(fn+'.csv') print ("") # Open the OASIS results output file using User input fo = raw_input('Please enter name of OASIS results output file:') f_result = str(fo+'.txt') fout = open (f_result, 'w') print ("") # User to determine the OASIS window size win = int(raw_input('Please enter window size in kb (default 200kb):')) print ("") def Main(): print ('Intializing OASIS') print ("") print ('OASIS - Novel Method of GWAS Analysis. Dr. Mohammad Saeed. Copyright (C) GNU GPL-3.0 (2016)', file=fout) print ("", file=fout) print ('OASIS Analysis dated:', date.today(), file=fout) print ("", file=fout) print ('Window Size:', win, 'kb', file=fout) print ("", file=fout) print ('Initial_chr', 'Initial_loc', 'Initial_SNP', 'End_loc', 'End_SNP', 'Max_SNP', 'Max_-log(p)', 'OASIS', 'Oasis_percent', file=fout) ch_chk() read_file() fout.close() print ("") print ("") xy() quads() ## Starts the program Main()