Box 2: Example PERL Scripts for Merging Analysis File p-values and CHPB Files
#!/usr/bin/perl -w
# Program to combine GWAS chip dumps with analysis files
# Author: Melvin Quintos
# Date: 2008-03-27
# History: -

use strict;
use warnings;
use Getopt::Std;

# Usage
#
# Print the command-line help text to STDERR and terminate the program.
# Takes no arguments and never returns to the caller.
sub usage() {
    print STDERR <<"EOF";

usage: $0 -o output_file [-r column] [-p column] chpb_file analysis_file

This program takes a GWAS chip batch file and an analysis file and creates a
combined GWAS analysis file (chpa file).

-h : this help message
-o : sets the name of the output file (recommend extension of .chpa)
-r : rsid column number in analysis file (1-based)
-p : pvalue column number in analysis file (1-based)

example: $0 -o AREDS_1.chr1.chpa ILMN_Human-1_chr1.chpb pha000001.v1.p1.chr1.txt

EOF

    # NOTE(review): the original exit statement is not visible in the source.
    # A non-zero status is used because usage() is also the bad-arguments path.
    exit 1;
}

# Global Variables
my %opts;        # command-line switches filled in by getopts()
my %mapPvals;    # rsid => p-value (-1 means "no p-value seen yet")
my ( $col_rsid, $col_pvalue );      # 1-based column numbers in the analysis file
my $outputfn;                       # name of the chpa file to create
my ( $ifn_chpb, $ifn_analysis );    # input file names
my @header;      # header lines collected from both input files

# Functions

# Parse command-line options
#
# Fills %opts from @ARGV via getopts(). Calls usage() when getopts() reports
# a failure or when -h was given.
sub init() {
    getopts( 'ho:v:r:p:', \%opts ) or usage();
    usage() if $opts{h};
    return;
}

# Validate the parsed options and positional arguments.
#
# Sets $col_rsid (default 1), $col_pvalue (default 3) and $outputfn from
# %opts, then either calls usage() on invalid input or stores the two
# positional arguments in $ifn_chpb and $ifn_analysis.
sub verify_args() {

    # Test with defined() rather than truthiness so that an explicit "-r 0"
    # reaches the validation below instead of silently becoming the default.
    $col_rsid   = defined( $opts{r} ) ? $opts{r} : 1;
    $col_pvalue = defined( $opts{p} ) ? $opts{p} : 3;
    $outputfn   = $opts{o};

    # validate variables
    if (   ( scalar(@ARGV) != 2 )              # correct number of args
        or !( defined($outputfn) )             # name of output file is given
        or !( $col_rsid   =~ /\A[1-9]\d*\z/ )  # col is a positive number
        or !( $col_pvalue =~ /\A[1-9]\d*\z/ )  # col is a positive number
      )
    {
        usage();
    }
    else {
        $ifn_chpb     = $ARGV[0];
        $ifn_analysis = $ARGV[1];
    }
    return;
}

# use 1-based column numbering (because it's easier to count that way)
#
# params:  $line     - one tab-separated record
#          $col_list - array reference of 1-based column numbers
# returns: list of the requested fields, in the order requested; a column
#          beyond the end of the record yields undef in that position
sub get_cols {
    my ( $line, $col_list ) = @_;

    my @items = split( /\t/, $line );

    # shift each requested column down by one to get the 0-based index
    return map { $items[ $_ - 1 ] } @{$col_list};
}

# Read the chpb file named by $ifn_chpb.
#
# Lines seen before the row starting with 'ss#' are appended to @header.
# Every row starting with 'ss' followed by digits contributes its 2nd column
# (the rsid) as a key of %mapPvals with the placeholder value -1.
# Dies if the file cannot be opened or a data row has no 2nd column.
sub read_chpb_file_version1() {
    open( my $chpb_fh, '<', $ifn_chpb )
        or die "Cannot open chpb file '$ifn_chpb': $!\n";
    my @cols_of_interest = (2);    # rsid is the 2nd column

    my $reading_header = 1;
    my $line_num       = 0;

    # while() reads one line at a time instead of loading the whole file
    while ( my $line = <$chpb_fh> ) {
        $line_num++;    # keeps the error message below pointing at a real line

        if ($reading_header) {
            if ( $line =~ /^ss#/ ) {

                # Header information stops at row with 'ss#'
                $reading_header = 0;
            }
            else {
                push @header, $line;
            }
        }

        # skip all lines that don't have a proper 'ss' number at start of row
        next unless $line =~ /^ss\d+/;
        chomp $line;

        # grab the rsid column
        my ($rsid) = get_cols( $line, \@cols_of_interest );

        if ( !defined($rsid) ) {
            die "Unknown value in chpb file at line: $line_num\n";
        }

        # trim string, then place in map with a default value of -1
        $rsid =~ s/^\s+|\s+$//g;
        $mapPvals{$rsid} = -1;
    }

    close $chpb_fh;
    return;
}

# Read the analysis file named by $ifn_analysis.
#
# Lines seen before the row starting with '# Marker accession' are appended
# to @header. For every remaining line that is neither a comment nor blank,
# the columns $col_rsid and $col_pvalue are extracted and stored as
# $mapPvals{rsid} = pvalue, replacing any placeholder already in the map.
# Dies if the file cannot be opened or a data row has no rsid column.
sub read_analysis_file() {
    open( my $analysis_fh, '<', $ifn_analysis )
        or die "Cannot open analysis file '$ifn_analysis': $!\n";
    my @cols_of_interest = ( $col_rsid, $col_pvalue );

    my $reading_header = 1;
    my $line_num       = 0;

    # while() reads one line at a time instead of loading the whole file
    while ( my $line = <$analysis_fh> ) {
        $line_num++;    # keeps the error message below pointing at a real line

        if ($reading_header) {
            if ( $line =~ /^# Marker accession/ ) {

                # Header information stops at row with '# Marker accession'
                $reading_header = 0;
            }
            else {
                push @header, $line;
            }
        }

        # skip all lines that are comments or blank lines
        next if $line =~ /^\s*#|^\s*$/;
        chomp $line;

        # for every valid line (e.g. not a comment or blank space)
        my ( $rsid, $pval ) = get_cols( $line, \@cols_of_interest );

        if ( !defined($rsid) ) {
            die "Unknown value in file at line: $line_num\n";
        }

        # trim string, then record the p-value for this rsid
        $rsid =~ s/^\s+|\s+$//g;
        $mapPvals{$rsid} = $pval;
    }

    close $analysis_fh;
    return;
}

# TODO: this is specific to version 1 of chpb file format
#
# Write the combined chpa file named by $outputfn.
#
# Output is: the collected @header lines, a fixed column-title row ending in
# 'pvalue', then each data row of the chpb file (re-read from $ifn_chpb).
# A row gets its p-value from %mapPvals appended when one is recorded.
# Dies if either file cannot be opened or the output cannot be closed.
sub write_chpa_file() {

    # open up new file
    open( my $out_fh, '>', $outputfn )
        or die "Cannot create output file '$outputfn': $!\n";
    my @cols_of_interest = (2);    # rsid is 2nd column in VERSION=1

    # write header information
    print {$out_fh} @header;
    print {$out_fh} "ss#\trs#\tloc_snp_id\tchrom\tchrom_pos\trsToss_orient\tssTochrom_orient\tweight\tbitfield\tpvalue\n";

    # read the chpb file again
    open( my $chpb_fh, '<', $ifn_chpb )
        or die "Cannot open chpb file '$ifn_chpb': $!\n";
    while ( my $line = <$chpb_fh> ) {
        chomp $line;

        # if the line is a data line, then do a pvalue lookup.
        # otherwise ignore
        # NOTE(review): the brace structure was lost in the source. This
        # reading writes every data row (with or without a p-value) and does
        # not re-emit non-data rows, since @header was already written.
        # Confirm against the original script.
        next unless $line =~ /^ss\d+/;

        my ($rsid) = get_cols( $line, \@cols_of_interest );
        if ( defined($rsid) ) {

            # keys were trimmed when stored, so trim before looking up
            $rsid =~ s/^\s+|\s+$//g;
            my $pval = $mapPvals{$rsid};

            # string comparison against the -1 placeholder avoids a numeric
            # warning when the analysis file holds a non-numeric p-value
            if ( defined($pval) and $pval ne '-1' ) {
                $line = "$line\t$pval";
            }
        }
        print {$out_fh} $line, "\n";
    }
    close $chpb_fh;

    # buffered write errors only surface when the handle is closed
    close $out_fh
        or die "Error while closing output file '$outputfn': $!\n";
    return;
}


# Program flow

# TODO: determine which chpb file version we're using

# Parse and validate the command line first; the readers below depend on the
# file names and column numbers that verify_args() sets.
init();
verify_args();

# The chpb file is read before the analysis file so that the -1 placeholders
# it stores are then replaced by the real p-values, not the other way round.
read_chpb_file_version1();
read_analysis_file();
write_chpa_file();

print "Done!\n";

------------------------Example 2------------------------------------------------------------------------
#!/usr/bin/perl -w
# Batch processing of CHPB and analysis files
# Author: Melvin Quintos
# Date: 2008-03-27
# History: -

use strict;
use warnings;
use Getopt::Std;
use File::Find;
use Cwd;

# Usage
#
# Print the command-line help text to STDERR and terminate the program.
# Takes no arguments and never returns to the caller.
sub usage() {

    # The usage line lists a single positional argument: verify_args()
    # accepts exactly one, and both examples below pass exactly one.
    print STDERR <<"EOF";

usage: $0 [-d directory] prefix_chpb

This program goes through and creates chpa data for all analysis and chpb
files in specified directory.

The chpb files must be in the format of <name>chr[1-22,x,y].chpb
The analysis files must be in the format of <name>chr[1-22,x,y].txt

-h : this help message
-d : sets the path to chpb and analysis files (must be in same folder)
defaults to current directory

example 1: $0 ILMN_Human-1_
example 2: $0 -d path_to_files ILMN_Human-1_

EOF

    # NOTE(review): the original exit statement is not visible in the source.
    # A non-zero status is used because usage() is also the bad-arguments path.
    exit 1;
}

# Global Variables
my %opts;          # command-line switches filled in by getopts()
my %mapChrChpb;    # map of chromosome to chpb file
my ( $workdir, $initdir );    # directory to scan / directory at start-up
my $chpb_prefix;              # required leading part of chpb file names

# Functions

# Parse command-line options
#
# Fills %opts from @ARGV via getopts(). Calls usage() when getopts() reports
# a failure or when -h was given.
sub init() {
    getopts( 'hd:', \%opts ) or usage();
    usage() if $opts{h};
    return;
}

# setup initial program conditions
#
# Sets $workdir from -d (default '.'), records the current directory in
# $initdir, and stores the single positional argument in $chpb_prefix.
# Calls usage() unless exactly one positional argument was given.
sub verify_args() {

    # Test for a non-empty value rather than truthiness so that a directory
    # literally named "0" is still honoured.
    $workdir = ( defined( $opts{d} ) and length( $opts{d} ) ) ? $opts{d} : '.';
    $initdir = cwd();

    if ( scalar(@ARGV) != 1 ) {
        usage();
    }
    else {
        $chpb_prefix = $ARGV[0];
    }
    return;
}

# given an array of files, return only an array of analysis files
#
# params:  list of file names
# returns: the names ending in ".txt" (case-insensitive), in input order
sub filter_analysis {
    return grep { /\.txt$/i } @_;
}

# given an array of files, return only an array of chpb files
#
# params:  list of file names
# returns: the names that end in ".chpb" (case-insensitive) and begin with
#          $chpb_prefix, in input order
sub filter_chpb {
    my $prefix_len = length($chpb_prefix);

    return grep {
        /\.chpb$/i                                         # check extension
            and substr( $_, 0, $prefix_len ) eq $chpb_prefix   # check prefix of filename
    } @_;
}

# Store the chpb file by the chromosome it represents
#
# Reads the current file name from $_ (used as the File::Find "wanted"
# callback). For a plain file whose name contains "chr<something>.chpb",
# records $mapChrChpb{"chr<something>"} = file name.
sub process_chpb {
    my $fn = $_;
    return unless -f $fn;    # only plain files

    # extract the chromosome name from file; the extension is matched
    # case-insensitively so this agrees with what filter_chpb lets through
    if ( $fn =~ /(chr.+)\.chpb$/i ) {
        $mapChrChpb{$1} = $fn;    # store in map
    }
    return;
}

# Process each analysis file against the proper chromosome
#
# Reads the current file name from $_ (used as the File::Find "wanted"
# callback). For a plain file whose name contains "chr<something>.", looks up
# the chpb file stored for that chromosome in %mapChrChpb and, if there is
# one, derives the .chpa output name and builds the merge command line.
sub process_analysis {
    my $fn = $_;
    return unless -f $fn;    # only plain files

    # extract chromosome name
    if ( $fn =~ /(chr.+)\./ ) {

        # if found in map, then process the files
        my $chpb_file = $mapChrChpb{$1};
        if ( defined($chpb_file) ) {
            my $new_file = $fn;
            $new_file =~ s/\.txt$/\.chpa/i;
            print "Processing $new_file ... ";

            # NOTE(review): the script name after "$initdir/" is missing from
            # the source, and no statement that runs $exeCmd is visible.
            # Both appear to have been lost; restore them from the original
            # script before relying on this sub. Nothing is executed here.
            my $exeCmd = qq{"$initdir/" -o $new_file $chpb_file $fn};
        }
    }
    return;
}

# Program flow

# Parse and validate the command line first; the find() calls below use
# $workdir, and filter_chpb uses $chpb_prefix, both set by verify_args().
init();
verify_args();

# Process the names of the CHPB files
find({wanted=>\&process_chpb, preprocess=>\&filter_chpb}, $workdir);

# Process the names of analysis files
find({wanted=>\&process_analysis, preprocess=>\&filter_analysis}, $workdir);

print "Done!\n";

From: Using GST to Study Genome Wide Association (GWAS) Data

Cover of SNP FAQ Archive
SNP FAQ Archive [Internet].

NCBI Bookshelf. A service of the National Library of Medicine, National Institutes of Health.