File Coverage

blib/lib/OBO/Parser/GoaParser.pm
Criterion Covered Total %
statement 127 133 95.4
branch 41 60 68.3
condition 3 9 33.3
subroutine 11 11 100.0
pod 2 4 50.0
total 184 217 84.7


line stmt bran cond sub pod time code
1             # $Id: GoaParser.pm 2159 2013-02-20 Erick Antezana $
2             #
3             # Module : GoaParser.pm
4             # Purpose : Parse GOA files
5             # License : Copyright (c) 2006-2014 by Vladimir Mironov. All rights reserved.
6             # This program is free software; you can redistribute it and/or
7             # modify it under the same terms as Perl itself.
8             # Contact : erick.antezana@gmail.com
9             #
10              
11             package OBO::Parser::GoaParser;
12              
13 2     2   27273 use strict;
  2         4  
  2         66  
14 2     2   7 use warnings;
  2         4  
  2         44  
15 2     2   7 use Carp;
  2         2  
  2         130  
16              
17 2     2   961 use open qw(:std :utf8); # Make All I/O Default to UTF-8
  2         2037  
  2         8  
18              
19 2     2   1129 use OBO::Core::Term;
  2         6  
  2         63  
20 2     2   799 use OBO::APO::GoaAssociation;
  2         5  
  2         68  
21 2     2   894 use OBO::APO::GoaAssociationSet;
  2         5  
  2         2292  
22              
23             # use Data::Dumper;
24              
25             $Carp::Verbose = 0;
26             my $verbose = 0;
27              
28             sub new {
29 2     2 0 14 my $class = shift;
30 2         4 my $self = {};
31            
32 2         6 bless ( $self, $class );
33 2         6 return $self;
34             }
35              
36             =head2 parse
37              
38             Usage - $GoaParser->parse ( $FH, $map )
39             Returns - OBO::APO::GoaAssociationSet object
40             Args -
41             1. indirect filehandle to a GOA associations file
42             2. ref to a hash { UniProtAC => UniProtID } or { GO ID => GO term name } to filter by, optional
43             Function - converts a GOA associations file into a OBO::APO::GoaAssociationSet object
44            
45             =cut
46              
47             sub parse {
48             # TODO remove the option of filtering by GO map
49             # TODO accomodate annotations including multiple taxa like: 'taxon:10090|taxon:10360', taxon:9606|taxon:44130
50             my (
51 4     4 1 1405 $self,
52             $in_file_path,
53             $map # hash ref, either UP map or GO map, optional
54             ) = @_;
55            
56 4 50       183 open my $FH, '<', $in_file_path or croak "Can't open file '$in_file_path': $! ";
57 4         41 my $goaAssocSet = OBO::APO::GoaAssociationSet->new ( );
58             # assumption: the map is not empty and no blank lines
59 4         6 my $map_source;
60 4 100       12 if ( $map ) {
61             # identify the type of the map
62 2         3 my @ids = keys %{ $map };
  2         7  
63 2 100       14 if ( $ids[0] =~ /\AGO:\d{7}\z/xms ) {
    50          
64 1         3 $map_source = 'GO';
65             }
66             elsif ( $ids[0] =~ /\A\w{6}\z/xms ){
67 1         3 $map_source = 'UniProtKB';
68             }
69             else {
70 0         0 carp "An illegal ID in the map! ";
71             }
72 2 50       6 print "Filtering by $map_source map\n" if $verbose;
73             }
74             # Populate the OBO::APO::GoaAssociationSet class with objects
75 4         6 my $count_added =0;
76 4         6 my $count_rejected =0;
77 4         59 while ( <$FH> ){
78 29         41 chomp;
79 29         237 my @fields = split ( /\t/ );
80 29 50       63 next if ( @fields != 15 );
81 29         60 my $goaAssoc = load_data ( \@fields, $map, $map_source );
82 29 50       44 $goaAssoc ? $count_added++ : $count_rejected++;
83 29 50       103 $goaAssocSet->add_unique ( $goaAssoc ) if $goaAssoc;
84             }# end of while $FH
85 4         85 close $FH;
86 4 50       15 print "accepted associations: $count_added, rejected associations: $count_rejected\n" if $verbose;
87 4 50       30 ! $goaAssocSet->is_empty ( ) ? return $goaAssocSet : carp "The set is empty ! $! ";
88             }
89              
90             =head2 work
91              
92             Usage - $GoaParser->work ( $ontology, $data, $up_map, $parent_protein_name ) # the last arg is optional
93             Returns - a data structure with added proteins ( { NCBI_ID => {UP_AC => OBO::Core::Term object}} )
94             Args -
95             1. OBO::Core::Ontology object,
96             2. OBO::APO::GoaAssociationSet object,
97             3. parent term name for proteins ( string ), # to link new proteins to, e.g. 'gene regulation protein'
98             Function - adds GO associations ( and optionally protein terms ) to ontology
99            
100             =cut
101              
102             sub work {
103             my (
104 2     2 1 783 $self,
105             $onto,
106             $data,
107             $parent_protein_name,
108             ) = @_ ;
109 2 50 33     16 croak "Not enough arguments! " if ! ( $onto and $data );
110            
111 2         3 my $parent_protein;
112             my $taxon;
113 2 100       5 if ( $parent_protein_name ) {
114 1   33     7 $parent_protein = $onto->get_term_by_name ( $parent_protein_name ) || croak "No term for $parent_protein_name in ontology: $! ";
115             }
116            
117 2         6 my $is_a = 'is_a';
118 2         3 my $participates_in = 'participates_in';
119 2         6 my $located_in = 'located_in';
120 2         3 my $has_function = 'has_function';
121 2         4 my $has_source = 'has_source';
122 2         6 my @rel_types = ( $is_a, $participates_in, $located_in, $has_function, $has_source );
123 2         5 foreach ( @rel_types ){
124 10 50       34 croak "'$_' is not in ontology" unless ( $onto->{RELATIONSHIP_TYPES}->{$_} );
125             }
126            
127 2         3 my %proteins; # { NCBI_ID => {UP_AC => OBO::Core::Term object}}
128             my %go_terms; # { GO_id => OBO::Core::Term object }
129 0         0 my %taxa; # { NCBI_id => OBO::Core::Term object }
130            
131 2         5 foreach my $goaAssoc ( @{$data->{SET}} ){
  2         6  
132             # get GO term
133 16         32 my $go_id = $goaAssoc->go_id ( );
134 16 50       32 next if ( $go_id eq 'GO:0008150' ); # remove associations with 'biological_process'
135 16         17 my $go_term = $go_terms{$go_id};
136 16 100       24 if ( ! $go_term ) { # $go_term is not yet in the hash
137 10         22 $go_term = $onto->get_term_by_id ( $go_id );
138 10 50       17 next if ( ! $go_term );
139 10         14 $go_terms{$go_id} = $go_term;
140             }
141            
142             # get taxon
143             # for multiple taxa in a single association: taxon:9606|taxon:44130
144 16         31 $goaAssoc->taxon ( ) =~ /\Ataxon:(\d+)/xms;
145 16 50       40 my $ncbi_id = $1 or carp "No NCBI id: $!";
146 16         20 my $taxon_id = "NCBI:$ncbi_id";
147 16         18 my $taxon = $taxa{$ncbi_id};
148 16 100       25 if ( ! $taxon ) {
149 2   33     6 $taxon = $onto->get_term_by_id ( $taxon_id ) || carp "No taxon term for $taxon_id in ontology: $! ";
150 2 50       6 next if ! $taxon;
151 2         5 $taxa{$ncbi_id} = $taxon;
152             }
153            
154             # get protein term
155 16         31 my $obj_id = $goaAssoc->obj_id ( );
156 16         27 my $db = $goaAssoc->obj_src ( );
157 16         23 my $prot_id = "$db:$obj_id";
158 16         25 my $protein = $proteins{$ncbi_id}{$obj_id};
159 16 100       26 if ( ! $protein ) { # $protein is not yet in the hash
160 2         5 $protein = $onto->get_term_by_id ( $prot_id );
161 2 100       6 if ( ! $protein ) { # $protein is not yet in the ontolgy
162 1 50       2 if ( $parent_protein_name ) { # testing whether new protein terms should be created
163 1         11 $protein = OBO::Core::Term->new ( );
164 1         3 $protein->id ( $prot_id );
165 1         6 $onto->add_term ( $protein );
166 1         5 $onto->create_rel ( $protein, $has_source, $taxon );
167 1         4 $onto->create_rel ( $protein, $is_a, $parent_protein );
168             }
169             else {
170             # the protein not in the ontology and no new terms should be created
171             # normally should not happen
172 0         0 carp "No protein term in the ontology for $prot_id";
173 0         0 next;
174             }
175             }
176 2         4 $proteins{$ncbi_id}{$obj_id} = $protein;
177             }
178            
179            
180             # create relations
181 16         25 my $aspect = $goaAssoc->aspect ( );
182 16 50       50 if ( $aspect eq 'F' ) {
    100          
    50          
183 0         0 $onto->create_rel ( $protein, $has_function, $go_term );
184             }
185             elsif ( $aspect eq 'C' ) {
186 14         27 $onto->create_rel ( $protein, $located_in, $go_term );
187             # $onto->create_rel ( $go_term, $location_of, $protein ); # inverse of 'located_in'
188             }
189 0         0 elsif ( $aspect eq 'P' ) {
190 2         9 $onto->create_rel ( $protein, $participates_in, $go_term );
191             # $onto->create_rel ( $go_term, $has_participant, $protein ); # inverse of 'participates_in'
192             }
193             else {carp "An illegal GO aspect '$aspect'\n"}
194             }
195 2         11 return \%proteins;
196             }
197              
198             sub load_data {
199 29     29 0 28 my ( $fields, $map, $map_source ) = @_;
200 29         25 my @fields = @{$fields};
  29         82  
201 29         41 foreach ( @fields ) {
202 435         425 $_ =~ s/^\s+//;
203 435         506 $_ =~ s/\s+$//;
204             }
205 29 100       58 if ( $map ) {
206 16 100       23 if ( $map_source eq 'GO' ) {
207 8 50       23 return 0 unless $map->{$fields[4]};
208             }
209 16 100       31 if ( $map_source eq 'UniProtKB' ) {
210 8 50       17 return 0 unless $map->{$fields[1]};
211             }
212             }
213 29         167 my $goaAssoc = OBO::APO::GoaAssociation->new ( );
214 29         66 $goaAssoc->assc_id ( $. );
215 29         62 $goaAssoc->obj_src ( $fields[0] );
216 29         59 $goaAssoc->obj_id ( $fields[1] );
217 29         67 $goaAssoc->obj_symb ( $fields[2] );
218 29         56 $goaAssoc->qualifier ( $fields[3] );
219 29         58 $goaAssoc->go_id ( $fields[4] );
220 29         50 $goaAssoc->refer ( $fields[5] );
221 29         58 $goaAssoc->evid_code ( $fields[6] );
222 29         64 $goaAssoc->sup_ref ( $fields[7] );
223 29         62 $goaAssoc->aspect ( $fields[8] );
224 29         55 $goaAssoc->description ( $fields[9] );
225 29         57 $goaAssoc->synonym ( $fields[10] );
226 29         53 $goaAssoc->type ( $fields[11] );
227 29         43 $goaAssoc->taxon ( $fields[12] );
228 29         57 $goaAssoc->date ( $fields[13] );
229 29         57 $goaAssoc->annot_src ( $fields[14] );
230 29         67 return $goaAssoc;
231             }
232              
233             1;
234              
235             __END__