#!/usr/bin/perl
#Wei Wu 
use LWP;
# Directory URL that the ontology flat-file names in @GoFiles are appended to.
my $GoBase = 'http://www.geneontology.org/ontology/';

# One flat file per ontology; each is downloaded and parsed further down.
my @GoFiles = qw(function.ontology process.ontology component.ontology);

# Directory URL of the gene-association files; the loop below requests
# "<urlbase><file>.gz" for every entry of %files.
my $urlbase = 'http://www.geneontology.org/gene-associations/';

# Species label => association file name (without the ".gz" suffix).
# The label becomes part of the output name "GO_<label>.txt"; the 'Human'
# entry is required because the last stage of the script reads GO_Human.txt.
my %files = (
    Arabidopsis  => 'gene_association.tair',
    C_elegans    => 'gene_association.wb',
    Drosophila   => 'gene_association.fb',
    Human        => 'gene_association.goa_human',
    Mouse        => 'gene_association.mgi',
    Rat          => 'gene_association.rgd',
    S_cerevisiae => 'gene_association.sgd',
    S_pombe      => 'gene_association.GeneDB_Spombe',
    Zebrafish    => 'gene_association.zfin',
);

sub do_GET {
    # Performs a GET through the package-level $browser object.
    #
    # Arguments are handed to $browser->get unchanged: the URL first, then,
    # optionally, header pairs (key, value, key, value, ...).
    #
    # Returns
    #   list context   : (content, status line, success flag, response
    #                    object), whether or not the request succeeded;
    #   scalar context : the content on success, undef on failure.
    my $response = $browser->get(@_);

    if (wantarray) {
        return (
            $response->content,
            $response->status_line,
            $response->is_success,
            $response,
        );
    }

    return $response->content if $response->is_success;
    return;
}


# User agent shared with do_GET().  It has to stay a package variable (no
# `my`): do_GET is compiled before this line and refers to the global.
$browser = LWP::UserAgent->new;

# GO accession ("GO:" followed by seven digits) => term name, filled from the
# ontology flat files and read by the association loop below.
my %GoHash=();
foreach my $file (@GoFiles){
    # Fetch the flat file and stop on an HTTP failure instead of parsing
    # whatever error page came back.
    my ($doc, $status, $success) = do_GET($GoBase.$file);
    die "Cannot download $GoBase$file: $status\n" unless $success;

    # Spool the download to a scratch file in the current directory; it is
    # removed again once it has been parsed.
    open my $out, '>', $file or die "Cannot write $file: $!\n";
    binmode($out);
    print {$out} $doc;
    close $out or die "Cannot close $file: $!\n";

    open my $in, '<', $file or die "Cannot read $file: $!\n";
    while (my $line = <$in>) {
        next if $line =~ /^!/;          # lines starting with '!' are skipped
        chomp($line);

        # Field 0 holds the term name, field 1 the accession.
        my ($name, $accession) = split /;/, $line;

        # Only record the term when an accession was really found.  The old
        # code read $& unconditionally, so a line without one reused the text
        # of an earlier, unrelated match as the hash key.
        next unless defined $accession && $accession =~ /(GO:\d{7})/;
        my $go_id = $1;

        $name =~ s/^\s+//;
        $name =~ s/\s+$//;
        # Drop the leading '$', '<' or '%' marker.  The previous pattern
        # /[$\<%]/ interpolated Perl's $\ variable, leaving just [<%], so a
        # leading '$' was never removed.
        $name =~ s/^[\$<%]//;

        $GoHash{$go_id} = $name;
    }
    close $in;
    unlink $file;
}
 


# For every species in %files: download the gzipped association file, unpack
# it with the external `gunzip` program and write "GO_<species>.txt", a
# two-column table (names, GO term names joined by "| ").
foreach my $file (keys %files){
    my $assoc = $files{$file};

    my ($doc, $status, $success) = do_GET($urlbase.$assoc.'.gz');
    die "Cannot download $urlbase$assoc.gz: $status\n" unless $success;

    open my $gz, '>', "$assoc.gz" or die "Cannot write $assoc.gz: $!\n";
    binmode($gz);
    print {$gz} $doc;
    close $gz or die "Cannot close $assoc.gz: $!\n";

    # List form of system() bypasses the shell; a failed decompression is
    # reported here rather than as a confusing open() error afterwards.
    system('gunzip', "$assoc.gz") == 0
        or die "gunzip $assoc.gz failed (wait status $?)\n";

    open my $in,  '<', $assoc           or die "Cannot read $assoc: $!\n";
    open my $out, '>', "GO_${file}.txt" or die "Cannot write GO_${file}.txt: $!\n";
    print {$out} "Names\tDescription\n";

    # State of the gene currently being collected.  A row is flushed as soon
    # as the id in column 1 changes, so rows of one gene are expected to be
    # adjacent in the input.
    my $last_id;            # id of the gene being collected
    my $last_goid;          # GO id of the previous row (all lexical now)
    my $names;              # first output column for that gene
    my $description = '';   # second output column for that gene

    while (my $line = <$in>) {
        next if $line =~ /^!/;              # comment / header line
        my @rowdata = split /\t/, $line;
        next if @rowdata < 5;               # no GO id column: blank or malformed row

        my ($id, $symbol, $goid, $synonyms) = @rowdata[1, 2, 4, 10];
        $symbol   = '' unless defined $symbol;
        $synonyms = '' unless defined $synonyms;

        if (defined $last_id && $last_id ne $id) {
            print {$out} "$names\t$description\n";
            # Start from scratch for the next id.  An id that shows up again
            # later is therefore written as its own row instead of being
            # printed under the names of whatever gene preceded it.
            $description = '';
        }
        $last_id = $id;

        if ($description) {
            # Further row of the current gene: append the term name unless the
            # GO id merely repeats the one on the previous row.
            $description .= "| $GoHash{$goid}" if $last_goid ne $goid;
            $last_goid = $goid;
            next;
        }

        # First row (or first row with a known term name) of this gene.
        $last_goid    = $goid;
        $description .= $GoHash{$goid};

        # Names column: id, then the symbol unless it already occurs in the
        # synonym field, then the '|'-separated synonyms re-joined with ", ".
        # index() is a literal substring test; the former /$rowdata[2]/ match
        # treated the symbol as a regular expression, which could die on
        # metacharacters and, for an empty symbol, reused the last successful
        # pattern.
        $names  = $id;
        $names .= ", $symbol" unless index($synonyms, $symbol) >= 0;
        $names .= ', ' . join(', ', split /\|/, $synonyms);
    }

    # Flush the final gene; nothing is written when the file had no data rows.
    print {$out} "$names\t$description\n" if defined $last_id;

    close $out or die "Cannot close GO_${file}.txt: $!\n";
    close $in;
    unlink $assoc;
}


# Final stage: enrich GO_Human.txt with fields from the nomenclature table
# "searchdata.txt" and write the result to HGNC_HumanAnnotation.txt.
my ($doc, $status, $success) = do_GET("http://www.gene.ucl.ac.uk/public-files/nomen/searchdata.txt");
die "Cannot download searchdata.txt: $status\n" unless $success;

open my $table_out, '>', 'searchdata.txt' or die "Cannot write searchdata.txt: $!\n";
binmode($table_out);
print {$table_out} $doc;
close $table_out or die "Cannot close searchdata.txt: $!\n";

# Index the table rows by the value in column 17 (0-based); the whole
# tab-separated row is kept as the value.
open my $table_in, '<', 'searchdata.txt' or die "Cannot read searchdata.txt: $!\n";
my %geneHash=();
my $table_header = <$table_in>;     # first line is discarded
while (my $line = <$table_in>) {
    chomp($line);
    my @rowdata = split /\t/, $line;
    # Rows without a value in column 17 used to be stored under the empty
    # key, where an empty id in GO_Human.txt could then match them.
    next unless defined $rowdata[17] && length $rowdata[17];
    $geneHash{$rowdata[17]} = $line;
}
close $table_in;
unlink "searchdata.txt";

open my $go_in, '<', 'GO_Human.txt' or die "Cannot read GO_Human.txt: $!\n";
open my $annot_out, '>', 'HGNC_HumanAnnotation.txt'
    or die "Cannot write HGNC_HumanAnnotation.txt: $!\n";

# Copy the header line through unchanged.
my $header = <$go_in>;
print {$annot_out} $header if defined $header;

while (my $line = <$go_in>) {
    # Lines are deliberately not chomped: the newline stays attached to the
    # last field and so survives the join below.
    my @rowdata = split /\t/, $line;

    # The lookup key is the first comma-separated entry of the names column.
    my ($key) = split /,/, $rowdata[0];
    $key = '' unless defined $key;
    $key =~ s/\s+$//;

    if (length $key && defined $geneHash{$key}) {
        my @moreinfo = split /\t/, $geneHash{$key};
        # Append the non-empty table columns 3, 7, 11 and 16 to the names
        # and prefix the description with column 2.
        foreach my $column (3, 7, 11, 16) {
            $rowdata[0] .= ", $moreinfo[$column]" if $moreinfo[$column];
        }
        $rowdata[1] = "${moreinfo[2]}:" . $rowdata[1];
        $line = join "\t", @rowdata;
    }
    print {$annot_out} $line;
}
close $go_in;
close $annot_out or die "Cannot close HGNC_HumanAnnotation.txt: $!\n";
