File Coverage

blib/lib/OBO/Parser/GoaParser.pm
Criterion Covered Total %
statement 127 133 95.4
branch 41 60 68.3
condition 3 9 33.3
subroutine 11 11 100.0
pod 2 4 50.0
total 184 217 84.7


line stmt bran cond sub pod time code
1             # $Id: GoaParser.pm 2159 2013-02-20 Erick Antezana $
2             #
3             # Module : GoaParser.pm
4             # Purpose : Parse GOA files
5             # License : Copyright (c) 2006-2015 by Vladimir Mironov. All rights reserved.
6             # This program is free software; you can redistribute it and/or
7             # modify it under the same terms as Perl itself.
8             # Contact : erick.antezana@gmail.com
9             #
10              
11             package OBO::Parser::GoaParser;
12              
13 2     2   32036 use strict;
  2         3  
  2         49  
14 2     2   10 use warnings;
  2         4  
  2         45  
15 2     2   9 use Carp;
  2         3  
  2         115  
16              
17 2     2   1465 use open qw(:std :utf8); # Make All I/O Default to UTF-8
  2         2328  
  2         11  
18              
19 2     2   1583 use OBO::Core::Term;
  2         7  
  2         66  
20 2     2   1205 use OBO::APO::GoaAssociation;
  2         5  
  2         56  
21 2     2   1074 use OBO::APO::GoaAssociationSet;
  2         5  
  2         2737  
22              
23             # use Data::Dumper;
24              
25             $Carp::Verbose = 0;
26             my $verbose = 0;
27              
28             sub new {
29 2     2 0 13 my $class = shift;
30 2         5 my $self = {};
31            
32 2         5 bless ( $self, $class );
33 2         6 return $self;
34             }
35              
36             =head2 parse
37              
38             Usage - $GoaParser->parse ( $FH, $map )
39             Returns - OBO::APO::GoaAssociationSet object
40             Args -
41             1. indirect filehandle to a GOA associations file
42             2. ref to a hash { UniProtAC => UniProtID } or { GO ID => GO term name } to filter by, optional
43             Function - converts a GOA associations file into a OBO::APO::GoaAssociationSet object
44            
45             =cut
46              
47             sub parse {
48             # TODO remove the option of filtering by GO map
49             # TODO accomodate annotations including multiple taxa like: 'taxon:10090|taxon:10360', taxon:9606|taxon:44130
50             my (
51 4     4 1 994 $self,
52             $in_file_path,
53             $map # hash ref, either UP map or GO map, optional
54             ) = @_;
55            
56 4 50       140 open my $FH, '<', $in_file_path or croak "Can't open file '$in_file_path': $! ";
57 4         42 my $goaAssocSet = OBO::APO::GoaAssociationSet->new ( );
58             # assumption: the map is not empty and no blank lines
59 4         7 my $map_source;
60 4 100       13 if ( $map ) {
61             # identify the type of the map
62 2         3 my @ids = keys %{ $map };
  2         8  
63 2 100       15 if ( $ids[0] =~ /\AGO:\d{7}\z/xms ) {
    50          
64 1         2 $map_source = 'GO';
65             }
66             elsif ( $ids[0] =~ /\A\w{6}\z/xms ){
67 1         2 $map_source = 'UniProtKB';
68             }
69             else {
70 0         0 carp "An illegal ID in the map! ";
71             }
72 2 50       8 print "Filtering by $map_source map\n" if $verbose;
73             }
74             # Populate the OBO::APO::GoaAssociationSet class with objects
75 4         5 my $count_added =0;
76 4         7 my $count_rejected =0;
77 4         55 while ( <$FH> ){
78 29         42 chomp;
79 29         226 my @fields = split ( /\t/ );
80 29 50       76 next if ( @fields != 15 );
81 29         64 my $goaAssoc = load_data ( \@fields, $map, $map_source );
82 29 50       59 $goaAssoc ? $count_added++ : $count_rejected++;
83 29 50       111 $goaAssocSet->add_unique ( $goaAssoc ) if $goaAssoc;
84             }# end of while $FH
85 4         62 close $FH;
86 4 50       13 print "accepted associations: $count_added, rejected associations: $count_rejected\n" if $verbose;
87 4 50       21 ! $goaAssocSet->is_empty ( ) ? return $goaAssocSet : carp "The set is empty ! $! ";
88             }
89              
90             =head2 work
91              
92             Usage - $GoaParser->work ( $ontology, $data, $up_map, $parent_protein_name ) # the last arg is optional
93             Returns - a data structure with added proteins ( { NCBI_ID => {UP_AC => OBO::Core::Term object}} )
94             Args -
95             1. OBO::Core::Ontology object,
96             2. OBO::APO::GoaAssociationSet object,
97             3. parent term name for proteins ( string ), # to link new proteins to, e.g. 'gene regulation protein'
98             Function - adds GO associations ( and optionally protein terms ) to ontology
99            
100             =cut
101              
102             sub work {
103             my (
104 2     2 1 335 $self,
105             $onto,
106             $data,
107             $parent_protein_name,
108             ) = @_ ;
109 2 50 33     20 croak "Not enough arguments! " if ! ( $onto and $data );
110            
111 2         4 my $parent_protein;
112             my $taxon;
113 2 100       6 if ( $parent_protein_name ) {
114 1   33     7 $parent_protein = $onto->get_term_by_name ( $parent_protein_name ) || croak "No term for $parent_protein_name in ontology: $! ";
115             }
116            
117 2         5 my $is_a = 'is_a';
118 2         4 my $participates_in = 'participates_in';
119 2         4 my $located_in = 'located_in';
120 2         3 my $has_function = 'has_function';
121 2         4 my $has_source = 'has_source';
122 2         6 my @rel_types = ( $is_a, $participates_in, $located_in, $has_function, $has_source );
123 2         59 foreach ( @rel_types ){
124 10 50       34 croak "'$_' is not in ontology" unless ( $onto->{RELATIONSHIP_TYPES}->{$_} );
125             }
126            
127 2         3 my %proteins; # { NCBI_ID => {UP_AC => OBO::Core::Term object}}
128             my %go_terms; # { GO_id => OBO::Core::Term object }
129 0         0 my %taxa; # { NCBI_id => OBO::Core::Term object }
130            
131 2         4 foreach my $goaAssoc ( @{$data->{SET}} ){
  2         5  
132             # get GO term
133 16         48 my $go_id = $goaAssoc->go_id ( );
134 16 50       40 next if ( $go_id eq 'GO:0008150' ); # remove associations with 'biological_process'
135 16         27 my $go_term = $go_terms{$go_id};
136 16 100       34 if ( ! $go_term ) { # $go_term is not yet in the hash
137 10         28 $go_term = $onto->get_term_by_id ( $go_id );
138 10 50       25 next if ( ! $go_term );
139 10         22 $go_terms{$go_id} = $go_term;
140             }
141            
142             # get taxon
143             # for multiple taxa in a single association: taxon:9606|taxon:44130
144 16         44 $goaAssoc->taxon ( ) =~ /\Ataxon:(\d+)/xms;
145 16 50       49 my $ncbi_id = $1 or carp "No NCBI id: $!";
146 16         27 my $taxon_id = "NCBI:$ncbi_id";
147 16         28 my $taxon = $taxa{$ncbi_id};
148 16 100       33 if ( ! $taxon ) {
149 2   33     7 $taxon = $onto->get_term_by_id ( $taxon_id ) || carp "No taxon term for $taxon_id in ontology: $! ";
150 2 50       6 next if ! $taxon;
151 2         5 $taxa{$ncbi_id} = $taxon;
152             }
153            
154             # get protein term
155 16         42 my $obj_id = $goaAssoc->obj_id ( );
156 16         45 my $db = $goaAssoc->obj_src ( );
157 16         35 my $prot_id = "$db:$obj_id";
158 16         39 my $protein = $proteins{$ncbi_id}{$obj_id};
159 16 100       31 if ( ! $protein ) { # $protein is not yet in the hash
160 2         7 $protein = $onto->get_term_by_id ( $prot_id );
161 2 100       8 if ( ! $protein ) { # $protein is not yet in the ontolgy
162 1 50       2 if ( $parent_protein_name ) { # testing whether new protein terms should be created
163 1         9 $protein = OBO::Core::Term->new ( );
164 1         4 $protein->id ( $prot_id );
165 1         5 $onto->add_term ( $protein );
166 1         9 $onto->create_rel ( $protein, $has_source, $taxon );
167 1         3 $onto->create_rel ( $protein, $is_a, $parent_protein );
168             }
169             else {
170             # the protein not in the ontology and no new terms should be created
171             # normally should not happen
172 0         0 carp "No protein term in the ontology for $prot_id";
173 0         0 next;
174             }
175             }
176 2         5 $proteins{$ncbi_id}{$obj_id} = $protein;
177             }
178            
179            
180             # create relations
181 16         42 my $aspect = $goaAssoc->aspect ( );
182 16 50       58 if ( $aspect eq 'F' ) {
    100          
    50          
183 0         0 $onto->create_rel ( $protein, $has_function, $go_term );
184             }
185             elsif ( $aspect eq 'C' ) {
186 14         35 $onto->create_rel ( $protein, $located_in, $go_term );
187             # $onto->create_rel ( $go_term, $location_of, $protein ); # inverse of 'located_in'
188             }
189             elsif ( $aspect eq 'P' ) {
190 2         7 $onto->create_rel ( $protein, $participates_in, $go_term );
191             # $onto->create_rel ( $go_term, $has_participant, $protein ); # inverse of 'participates_in'
192             }
193 0         0 else {carp "An illegal GO aspect '$aspect'\n"}
194             }
195 2         11 return \%proteins;
196             }
197              
198             sub load_data {
199 29     29 0 50 my ( $fields, $map, $map_source ) = @_;
200 29         33 my @fields = @{$fields};
  29         113  
201 29         59 foreach ( @fields ) {
202 435         624 $_ =~ s/^\s+//;
203 435         737 $_ =~ s/\s+$//;
204             }
205 29 100       62 if ( $map ) {
206 16 100       35 if ( $map_source eq 'GO' ) {
207 8 50       23 return 0 unless $map->{$fields[4]};
208             }
209 16 100       35 if ( $map_source eq 'UniProtKB' ) {
210 8 50       26 return 0 unless $map->{$fields[1]};
211             }
212             }
213 29         99 my $goaAssoc = OBO::APO::GoaAssociation->new ( );
214 29         89 $goaAssoc->assc_id ( $. );
215 29         77 $goaAssoc->obj_src ( $fields[0] );
216 29         80 $goaAssoc->obj_id ( $fields[1] );
217 29         73 $goaAssoc->obj_symb ( $fields[2] );
218 29         73 $goaAssoc->qualifier ( $fields[3] );
219 29         71 $goaAssoc->go_id ( $fields[4] );
220 29         78 $goaAssoc->refer ( $fields[5] );
221 29         75 $goaAssoc->evid_code ( $fields[6] );
222 29         103 $goaAssoc->sup_ref ( $fields[7] );
223 29         73 $goaAssoc->aspect ( $fields[8] );
224 29         75 $goaAssoc->description ( $fields[9] );
225 29         76 $goaAssoc->synonym ( $fields[10] );
226 29         75 $goaAssoc->type ( $fields[11] );
227 29         74 $goaAssoc->taxon ( $fields[12] );
228 29         76 $goaAssoc->date ( $fields[13] );
229 29         75 $goaAssoc->annot_src ( $fields[14] );
230 29         84 return $goaAssoc;
231             }
232              
233             1;
234              
235             __END__