Box 2: Example PERL Scripts for Merging Analysis File p-values and CHPB Files - SNP FAQ Archive

create_chpa.pl
#!/usr/bin/perl -w
# Program to combine GWAS chip dumps with analysis files
# Author: Melvin Quintos
# Date: 2008-03-27
# History: -
#

use strict;
use warnings;
use Getopt::Std;

####################################################################
# Usage
####################################################################
# Print the command-line help text to STDERR and end the program.
# Takes no arguments; never returns (exit is called with its default status).
sub usage()
{
    # $0 inside the text expands to the name this script was invoked as.
    my $help_text = << "EOF";

usage: $0 -o output_file [-r column] [-p column] chpb_file analysis_file

This program takes a GWAS chip batch file and an analysis file and creates a
combined GWAS analysis file (chpa file).

Options:
-h : this help message
-o : sets the name of the output file (recommend extension of .chpa)
-r : rsid column number in analysis file (1-based)
-p : pvalue column number in analysis file (1-based)

example: $0 -o AREDS_1.chr1.chpa ILMN_Human-1_chr1.chpb pha000001.v1.p1.chr1.txt

EOF

    print STDERR $help_text;
    exit;
}

####################################################################
# Global Variables
####################################################################
my %opts;                        # command-line switches filled in by getopts
my %mapPvals;                    # rsid => p-value; -1 means no p-value recorded
my ($col_rsid, $col_pvalue);     # 1-based column numbers in the analysis file
my $outputfn;                    # name of the chpa file to write
my ($ifn_chpb, $ifn_analysis);   # input file names taken from @ARGV
my @header;                      # header lines collected from the input files

####################################################################
# Functions
####################################################################

# Parse command-line options
# Fills %opts from @ARGV. The usage text is shown (and the program ends)
# when getopts reports a problem or when -h was given.
sub init()
{
    my $parsed_ok = getopts( 'ho:v:r:p:', \%opts );

    if (!$parsed_ok or $opts{h}) {
        usage();
    }
}

# Resolve option defaults and validate the command line.
# Sets $col_rsid (default 1), $col_pvalue (default 3) and $outputfn from
# %opts, and on success takes $ifn_chpb / $ifn_analysis from the two
# positional arguments. Any invalid or missing argument shows the usage text.
sub verify_args()
{
    # Test with defined() rather than truthiness: an explicit "-r 0" or
    # "-p 0" must reach the validation below and be rejected, not be silently
    # replaced by the default column.
    $col_rsid   = defined($opts{r}) ? $opts{r} : 1;
    $col_pvalue = defined($opts{p}) ? $opts{p} : 3;
    $outputfn   = $opts{o};

    # validate variables
    if (   scalar(@ARGV) != 2                # correct number of args
        or !defined($outputfn)               # name of output file is given
        or $col_rsid   !~ /^[1-9]\d*$/       # col is a positive number
        or $col_pvalue !~ /^[1-9]\d*$/       # col is a positive number
       )
    {
        usage();
    }
    else {
        $ifn_chpb     = $ARGV[0];
        $ifn_analysis = $ARGV[1];
    }
}

# Pick selected fields out of one tab-separated line.
# Columns are numbered from 1 (because it's easier to count that way).
#   $line     - the tab-delimited record
#   $col_list - array reference holding the 1-based column numbers wanted
# Returns the requested fields in the order they were asked for; a column
# beyond the end of the record is returned as undef.
sub get_cols # params: $line, $col_array
{
    my ($line, $col_list) = @_;

    my @fields  = split(/\t/, $line);
    my @indexes = map { $_ - 1 } @{$col_list};   # 1-based -> 0-based
    my @picked  = @fields[@indexes];

    return @picked;
}

# Read the chpb file (format version 1) named by $ifn_chpb.
# Every line that comes before the 'ss#' column-title row is appended to
# @header. Every data row (a line starting with 'ss' followed by digits)
# registers its rsid (2nd column) in %mapPvals with the placeholder value -1.
# Dies if the file cannot be opened or if a data row has no rsid column.
sub read_chpb_file_version1()
{
    # Three-argument open with a lexical handle, and fail loudly: without the
    # check an unreadable file would silently produce an empty map.
    open(my $chpb_fh, '<', $ifn_chpb)
        or die "Cannot open chpb file '$ifn_chpb': $!\n";

    my @cols_of_interest = (2); # rsid is the 2nd column
    my $reading_header   = 1;
    my $line_num         = 0;

    # Read one line at a time so a large chip dump is never held in memory.
    while (my $line = <$chpb_fh>) {
        $line_num++;

        if ($reading_header) {
            if ($line =~ /^ss#/) {
                # Header information stops at row with 'ss#'
                $reading_header = 0;
            }
            else {
                # stored before chomp, so the line keeps its newline
                push @header, $line;
            }
        }

        # skip all lines that don't have a proper 'ss' number at start of row
        next unless $line =~ /^ss\d+/;
        chomp $line;

        # grab the rsid column
        my ($rsid) = get_cols( $line, \@cols_of_interest );

        if (!defined($rsid)) {
            die "Unknown value in chpb file at line: $line_num\n";
        }

        # trim string, then place in map with a default value of -1
        $rsid =~ s/^\s+|\s+$//g;
        $mapPvals{$rsid} = -1;
    }

    close($chpb_fh);
}

# Read the analysis file named by $ifn_analysis.
# Every line that comes before the '# Marker accession' row is appended to
# @header. For every data line (not a comment, not blank) the rsid and
# p-value are taken from columns $col_rsid / $col_pvalue and stored in
# %mapPvals as rsid => p-value.
# Dies if the file cannot be opened or if a data line has no rsid column.
sub read_analysis_file()
{
    # Three-argument open with a lexical handle, and fail loudly: without the
    # check an unreadable file would silently leave every p-value at -1.
    open(my $analysis_fh, '<', $ifn_analysis)
        or die "Cannot open analysis file '$ifn_analysis': $!\n";

    my @cols_of_interest = ($col_rsid, $col_pvalue);
    my $reading_header   = 1;
    my $line_num         = 0;

    # Read one line at a time so a large analysis file is never held in memory.
    while (my $line = <$analysis_fh>) {
        $line_num++;

        if ($reading_header) {
            if ($line =~ /^# Marker accession/) {
                # Header information stops at row with '# Marker accession'
                $reading_header = 0;
            }
            else {
                # stored before chomp, so the line keeps its newline
                push @header, $line;
            }
        }

        # skip all lines that are comments or blank lines
        next if $line =~ /^\s*#|^\s*$/;
        chomp $line;

        # for every valid line (e.g. not a comment or blank space)
        my ($rsid, $pval) = get_cols( $line, \@cols_of_interest );

        if (!defined($rsid)) {
            die "Unknown value in file at line: $line_num\n";
        }

        # trim the rsid so it matches the keys built from the chpb file
        $rsid =~ s/^\s+|\s+$//g;

        # trim the p-value as well: when it is the last column of a CRLF
        # file, chomp leaves a carriage return that would otherwise be
        # copied into the output
        $pval =~ s/^\s+|\s+$//g if defined($pval);

        # record the p-value for this rsid (undef when the column is absent)
        $mapPvals{$rsid} = $pval;
    }

    close($analysis_fh);
}

# TODO: this is specific to version 1 of chpb file format
# Write the combined chpa file named by $outputfn.
# Output is: the collected @header lines, one column-title row, then every
# data row of the chpb file ($ifn_chpb is read a second time) whose rsid has
# a p-value in %mapPvals, with that p-value appended as an extra column.
# Rows whose p-value is undefined or still the -1 placeholder are left out.
# Dies if either file cannot be opened or the output cannot be completed.
sub write_chpa_file()
{
    # open up new file; without the check a bad output path would only
    # produce warnings and the program would still report success
    open(my $out_fh, '>', $outputfn)
        or die "Cannot create output file '$outputfn': $!\n";
    my @cols_of_interest = (2); # rsid is 2nd column in VERSION=1

    # write header information
    print {$out_fh} @header;
    print {$out_fh} "ss#\trs#\tloc_snp_id\tchrom\tchrom_pos\trsToss_orient\tssTochrom_orient\tweight\tbitfield\tpvalue\n";

    # read the chpb file again
    open(my $chpb_fh, '<', $ifn_chpb)
        or die "Cannot open chpb file '$ifn_chpb': $!\n";

    while (my $line = <$chpb_fh>) {
        chomp $line;

        # only data lines get a pvalue lookup; everything else is ignored
        next unless $line =~ /^ss\d+/;

        my ($rsid) = get_cols($line, \@cols_of_interest);
        next unless defined($rsid);

        # the map keys were trimmed when the map was built, so the lookup
        # key has to be trimmed the same way or padded rsids never match
        $rsid =~ s/^\s+|\s+$//g;

        my $pval = $mapPvals{$rsid};
        if (defined($pval) and $pval != -1) {
            print {$out_fh} "$line\t$pval\n";
        }
    }
    close($chpb_fh);

    # buffered write errors (e.g. disk full) only surface at close
    close($out_fh)
        or die "Error writing output file '$outputfn': $!\n";
}

####################################################################
# Program flow
####################################################################

# Run the stages in order: parse the options, validate them, load the rsids
# from the chpb file, load the p-values from the analysis file, and finally
# write the merged chpa file.
# TODO: determine which chpb file version we're using
my @stages = (
    \&init,
    \&verify_args,
    \&read_chpb_file_version1,
    \&read_analysis_file,
    \&write_chpa_file,
);

foreach my $stage (@stages) {
    $stage->();
}

print "Done!\n";

------------------------Example 2------------------------------------------------------------------------
batch_chpa.pl
#!/usr/bin/perl -w
#
# Batch processing of CHPB and analysis files
# Author: Melvin Quintos
# Date: 2008-03-27
# History: -
#

use strict;
use warnings;
use Getopt::Std;
use File::Find;
use Cwd;

####################################################################
# Usage
####################################################################
# Print the command-line help text to STDERR and end the program.
# Takes no arguments; never returns (exit is called with its default status).
sub usage()
{
    # The synopsis lists a single positional argument because the argument
    # check accepts exactly one (the chpb file name prefix); advertising a
    # second one made every invocation that followed the synopsis fail.
    print STDERR << "EOF";

usage: $0 [-d directory] prefix_chpb

This program goes through and creates chpa data for all analysis and chpb
files in specified directory.

The chpb files must be in the format of <name>chr[1-22,x,y].chpb
The analysis files must be in the format of <name>chr[1-22,x,y].txt

Options:
-h : this help message
-d : sets the path to chpb and analysis files (must be in same folder)
defaults to current directory

example 1: $0 ILMN_Human-1_
example 2: $0 -d path_to_files ILMN_Human-1_

EOF
    exit;
}

####################################################################
# Global Variables
####################################################################
my %opts;          # command-line switches filled in by getopts
my $chpb_prefix;   # leading part of the chpb file names to pick up
my $workdir;       # directory searched for chpb and analysis files
my $initdir;       # working directory at program start
my %mapChrChpb;    # map of chromosome to chpb file

####################################################################
# Functions
####################################################################

# Parse command-line options
# Fills %opts from @ARGV. The usage text is shown (and the program ends)
# when getopts reports a problem or when -h was given.
sub init()
{
    my $parsed_ok = getopts( 'hd:', \%opts );

    if (!$parsed_ok or $opts{h}) {
        usage();
    }
}

# setup initial program conditions
# Sets $workdir from -d (default '.'), records the starting directory in
# $initdir, and takes $chpb_prefix from the single positional argument.
# Shows the usage text when the number of positional arguments is not one.
sub verify_args()
{
    # Test for a non-empty value rather than truthiness, so that a directory
    # literally named "0" is honoured instead of falling back to '.'.
    $workdir = (defined($opts{d}) && length($opts{d})) ? $opts{d} : '.';
    $initdir = cwd();

    if (scalar(@ARGV) != 1) {
        usage();
    }
    else {
        $chpb_prefix = $ARGV[0];
    }
}

# Keep only the analysis files from a list of file names.
# An analysis file is any name ending in ".txt" (case-insensitive).
# Returns the matching names in their original order.
sub filter_analysis
{
    my @analysis_files = grep { /\.txt$/i } @_;
    return @analysis_files;
}

# Keep only the chpb files from a list of file names.
# A name is kept when it ends in ".chpb" (case-insensitive) and begins with
# $chpb_prefix. Returns the matching names in their original order.
sub filter_chpb
{
    my @chpb_files = grep {
        /\.chpb$/i                                               # check extension
            && substr($_, 0, length($chpb_prefix)) eq $chpb_prefix   # check prefix of filename
    } @_;
    return @chpb_files;
}

# Store the chpb file by the chromosome it represents
# Reads the current file name from $_. When it is a plain file whose name
# contains "chr..." directly before the ".chpb" extension, that "chr..." part
# becomes the key under which the file name is stored in %mapChrChpb.
sub process_chpb
{
    my $candidate = $_;

    # must be a file, and the chromosome name must be extractable from it
    if (-f $candidate and $candidate =~ /(chr.+)\.chpb$/) {
        my $chromosome = $1;
        $mapChrChpb{$chromosome} = $candidate;   # store in map
    }
}

# Process each analysis file against the proper chromosome
# Reads the current file name from $_. When it is a plain file whose name
# contains a "chr..." part that has a chpb file registered in %mapChrChpb,
# create_chpa.pl (looked up in $initdir) is run to build the matching .chpa
# file next to it. A failure of that run is reported with a warning and the
# remaining files are still processed.
sub process_analysis
{
    my $fn = $_;
    return unless -f $fn;                 # only plain files

    # extract chromosome name; $1 is consumed immediately below
    return unless $fn =~ /(chr.+)\./;

    # only continue when a chpb file is known for that chromosome
    my $chpb_file = $mapChrChpb{$1};
    return unless defined($chpb_file);

    my $new_file = $fn;
    $new_file =~ s/\.txt$/\.chpa/i;
    print "Processing $new_file ... ";

    # List-form system bypasses the shell, so file names containing spaces
    # or shell metacharacters are passed through exactly as they are.
    my @cmd = ("$initdir/create_chpa.pl", '-o', $new_file, $chpb_file, $fn);
    my $status = system(@cmd);

    if ($status == -1) {
        warn "Could not run $cmd[0]: $!\n";
    }
    elsif ($status != 0) {
        warn "create_chpa.pl failed for $fn (exit status " . ($status >> 8) . ")\n";
    }
}

####################################################################
# Program flow
####################################################################

init();
verify_args();

# First pass: record the chpb file found for each chromosome
my %chpb_pass = (
    wanted     => \&process_chpb,
    preprocess => \&filter_chpb,
);
find(\%chpb_pass, $workdir);

# Second pass: build a chpa file for every analysis file with a known chpb
my %analysis_pass = (
    wanted     => \&process_analysis,
    preprocess => \&filter_analysis,
);
find(\%analysis_pass, $workdir);

print "Done!\n";

From: Using GST to Study Genome Wide Association (GWAS) Data

SNP FAQ Archive [Internet].

Bethesda (MD): National Center for Biotechnology Information (US); 2005-.

NCBI Bookshelf. A service of the National Library of Medicine, National Institutes of Health.