File Coverage

blib/lib/OBO/APO/GoaToRDF.pm
Criterion Covered Total %
statement 91 91 100.0
branch 7 8 87.5
condition 2 3 66.6
subroutine 8 8 100.0
pod 2 3 66.6
total 110 113 97.3


line stmt bran cond sub pod time code
1             # $Id: GoaToRDF.pm 2194 2008-08-07 12:46:25Z Erick Antezana $
2             #
3             # Module : GoaToRDF.pm
4             # Purpose : A GOA associations to RDF converter
5             # License : Copyright (c) 2006-2015 by ONTO-perl. All rights reserved.
6             # This program is free software; you can redistribute it and/or
7             # modify it under the same terms as Perl itself.
8             # Contact : CCO
9             #
10             package OBO::APO::GoaToRDF;
11              
12             =head1 NAME
13              
14             OBO::APO::GoaToRDF - A GOA associations to RDF converter.
15              
16             =head1 DESCRIPTION
17              
18             Converts a GOA association file to a RDF graph. The RDF graph is very simple,
19             containing a node for each line from the association file (called GOA_ASSOC_n),
20             and several triples for the fields (e.g. obj_symb).
21              
22             GOA association files can be obtained from http://www.ebi.ac.uk/GOA/proteomes.html
23              
24             The method 'work' takes the path to an association file and a file handle to which the RDF graph is written.
25              
26             =head1 AUTHOR
27              
28             Mikel Egana Aranguren
29             mikel.egana.aranguren@gmail.com
30              
31             =head1 COPYRIGHT AND LICENSE
32              
33             Copyright (c) 2008 by Mikel Egana Aranguren
34              
35             This library is free software; you can redistribute it and/or modify
36             it under the same terms as Perl itself, either Perl version 5.8.7 or,
37             at your option, any later version of Perl 5 you may have available.
38              
39             =cut
40              
41 1     1   6264 use OBO::Parser::GoaParser;
  1         3  
  1         1002  
42 1     1   5 use strict;
  1         2  
  1         19  
43 1     1   4 use warnings;
  1         2  
  1         21  
44 1     1   5 use Carp;
  1         1  
  1         1242  
45              
46             sub new {
47 1     1 0 11 my $class = shift;
48 1         2 my $self = {};
49            
50 1         41 bless ($self, $class);
51 1         3 return $self;
52             }
53              
54             =head2 work
55              
56             Usage - $GoaToRDF->work($input_file, $file_handle, $base, $ns);
57             Returns - the file handle that was passed in, after the RDF graph has been written to it
58             Args - 1. Full path to the GOA file
59             2. File handle for writing RDF
60             3. base URI (e.g. 'http://www.semantic-systems-biology.org/')
61             4. name space (e.g. 'SSB')
62             Function - converts an assoc. file to an RDF graph
63            
64             =cut
65              
66             sub work {
67 1     1 1 192 my $self = shift;
68              
69             # Get the arguments
70             # my ($file_handle, $path_to_assoc_file) = @_;
71 1         3 my ( $path_to_assoc_file, $file_handle, $base, $namespace ) = @_; #vlmir
72             #
73             # Hard-coded evidence codes
74             #
75             #TODO the list is not complete anymore #vlmir
76 1         13 my %evidence_code_by_id = (
77             'IEA' => 'ECO_0000203',
78             'ND' => 'ECO_0000035',
79             'IDA' => 'ECO_0000002',
80             'IPI' => 'ECO_0000021',
81             'TAS' => 'ECO_0000033',
82             'NAS' => 'ECO_0000034',
83             'ISS' => 'ECO_0000041',
84             'IMP' => 'ECO_0000015',
85             'IC' => 'ECO_0000001',
86             'IGI' => 'ECO_0000011',
87             'IEP' => 'ECO_0000008',
88             'RCA' => 'ECO_0000053',
89             'IGC' => 'ECO_0000177',
90             'EXP' => 'ECO_0000006',
91             'IBA' => 'ECO_0000318',
92             'IRD' => 'ECO_0000321',
93             'IKR' => 'ECO_0000320',
94             'ISO' => 'ECO_0000201'
95             );
96            
97             #
98             # Aspects
99             #
100 1         11 my %aspect = (
101             'P' => 'participates_in',
102             'C' => 'located_in',
103             'F' => 'has_function'
104             );
105            
106             # For the ID
107 1         5 $path_to_assoc_file =~ /.*\/(.*)/; # get what is after the slash in the path...
108 1         3 my $f_name = $1;
109 1         3 (my $prefix_id = $f_name) =~ s/\.goa//;
110 1         3 $prefix_id =~ s/\./_/g;
111              
112             # TODO: set all the NS and URI via arguments
113             # my $default_URL = "http://www.semantic-systems-biology.org/";
114 1         3 my $default_URL = $base; #vlmir
115 1         1 my $NS = $namespace;#vlmir
116 1         3 my $ns = lc ($NS);
117 1         2 my $rdf_subnamespace = "assoc";
118              
119             # Preamble of RDF file
120 1         10 print $file_handle "\n";
121 1         2 print $file_handle "
122 1         2 print $file_handle "\txmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
123 1         2 print $file_handle "\txmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\"\n";
124 1         3 print $file_handle "\txmlns:".$ns."=\"".$default_URL.$NS."#\">\n";
125            
126 1         8 my $GoaParser = OBO::Parser::GoaParser->new();
127 1         4 my $goaAssocSet = $GoaParser->parse($path_to_assoc_file);
128            
129 1         2 my %prot_duplicated; # to add only one copy of the protein
130             my $buffer;
131            
132 1         2 my $prot_space = "protein";
133 1         2 my $previous_protein = "";
134             # Chunk of RDF file
135 1         7 foreach ($goaAssocSet->get_set()) {
136 5         6 my %assoc = %{$_};
  5         41  
137 5         13 my $current_protein = $assoc{OBJ_ID};
138            
139 5 100 66     52 if ($previous_protein && $current_protein ne $previous_protein) { # flush the buffer
140 4         10 $buffer .= "\t\n";
141 4         24 print $file_handle $buffer;
142 4         6 $buffer = ""; # init
143             }
144            
145             #
146             # the protein: (this should come from uniprot.rdf)
147             #
148 5         13 my $triple_prefix_id_assoc_id = "triple_".$prefix_id."_".$assoc{ASSC_ID};
149 5 100       15 if (!$prot_duplicated{$current_protein}) {
150 4         11 $buffer .= "\t<".$ns.":".$prot_space." rdf:about=\"#".$current_protein."\">\n";
151 4         15 $buffer .= "\t\t".&char_hex_http($assoc{OBJ_SYMB})."\n";
152 4         11 $buffer .= "\t\t<".$ns.":name xml:lang=\"en\">".&char_hex_http($assoc{OBJ_SYMB})."\n";
153 4         12 $buffer .= "\t\t<".$ns.":annot_src>".&char_hex_http($assoc{ANNOT_SRC})."\n";
154 4         8 my $t = $assoc{TAXON};
155 4         11 $t =~ s/taxon:/NCBI_/; # clean it
156 4         13 $buffer .= "\t\t<".$ns.":taxon>".$t."\n";
157 4         10 $buffer .= "\t\t<".$ns.":has_source rdf:resource=\"#".$t."\"/>\n";
158 4         10 $buffer .= "\t\t<".$ns.":type>".&char_hex_http($assoc{TYPE})."\n";
159 4         13 $buffer .= "\t\t<".$ns.":description>".&char_hex_http($assoc{DESCRIPTION})."\n";
160 4         11 $buffer .= "\t\t<".$ns.":obj_src>".&char_hex_http($assoc{OBJ_SRC})."\n\n";
161            
162 4         11 $prot_duplicated{$current_protein} = 1;
163 4         7 $previous_protein = $current_protein;
164             }
165            
166 5         11 my $goa_ns_prefix_id_assoc_id = "#GOA_".$prefix_id."_".$assoc{ASSC_ID};
167             #
168             # ASSOC:
169             #
170 5         11 print $file_handle "\t<".$ns.":".$rdf_subnamespace." rdf:about=\"".$goa_ns_prefix_id_assoc_id."\">\n";
171 5         17 print $file_handle "\t\t<".$ns.":date>".$assoc{DATE}."\n";
172 5         13 print $file_handle "\t\t<".$ns.":refer>".&char_hex_http($assoc{REFER})."\n";
173 5         13 print $file_handle "\t\t<".$ns.":sup_ref>".&char_hex_http($assoc{SUP_REF})."\n";
174 5         17 print $file_handle "\t\t<".$ns.":has_evidence rdf:resource=\"#".$evidence_code_by_id{$assoc{EVID_CODE}}."\"/>\n";
175 5         9 print $file_handle "\t\n";
176            
177             #
178             # TRIPLE (version 1):
179             #
180             # print $file_handle "\t\n";
181             # print $file_handle "\t\t\n";
182             # print $file_handle "\t\t\n";
183             # print $file_handle "\t\t\n\n";
184             #
185             # print $file_handle "\t\t<".$ns.":supported_by rdf:resource=\"".$goa_ns_prefix_id_assoc_id."\"/>\n";
186             # print $file_handle "\t\n";
187            
188             #
189             # TRIPLE (version 2):
190             #
191 5         8 print $file_handle "\t\n";
192 5         11 print $file_handle "\t\t<".$ns.":supported_by rdf:resource=\"".$goa_ns_prefix_id_assoc_id."\"/>\n";
193 5         6 print $file_handle "\t\n";
194            
195             #
196             # flushing?
197             #
198 5 100       12 if ($current_protein eq $previous_protein) {
199 4         16 $buffer .= "\t\t<".$ns.":".$aspect{$assoc{ASPECT}}." rdf:ID=\"".$triple_prefix_id_assoc_id."\" rdf:resource=\"#".&char_hex_http($assoc{"GO_ID"})."\"/>\n";
200             }
201 5         21 $previous_protein = $current_protein;
202             }
203            
204             #
205             # LAST FLUSH
206             #
207 1 50       5 if ($previous_protein) {
208 1         3 $buffer .= "\t\n";
209 1         2 print $file_handle $buffer;
210             }
211              
212 1         2 print $file_handle "\n\n";
213 1         5 print $file_handle "";
214              
215 1         32 return $file_handle;
216             }
217              
218             sub __date {
219 1     1   31 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
220 1         9 my $result = sprintf "%02d:%02d:%4d %02d:%02d", $mday,$mon+1,$year+1900,$hour,$min; # e.g. 11:05:2008 12:52
221             }
222              
223             =head2 char_hex_http
224              
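225             Usage - char_hex_http($seq)
226             Returns - the sequence with the http special characters replaced by their hexadecimal representation
227             Args - the sequence of characters (note: the argument itself is modified in place)
228             Function - Replaces each http special character with its hexadecimal escape. E.g. ; -> %3B. The colon is the exception: it is replaced by an underscore (: -> _) so that GO IDs read GO_0000001.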
229            
230             =cut
231              
232              
233             sub char_hex_http {
234 38     38 1 83 $_[0] =~ s/:/_/g; # originally: $_[0] =~ s/:/%3A/g; but changed to get the GO IDs properly: GO_0000001
235 38         58 $_[0] =~ s/;/%3B/g;
236 38         54 $_[0] =~ s/
237 38         50 $_[0] =~ s/=/%3D/g;
238 38         49 $_[0] =~ s/>/%3E/g;
239 38         54 $_[0] =~ s/\?/%3F/g;
240            
241             #number sign # 23 # --> # # --> #
242             #dollar sign $ 24 $ --> $ $ --> $
243             #percent sign % 25 % --> % % --> %
244              
245 38         50 $_[0] =~ s/\//%2F/g;
246 38         49 $_[0] =~ s/&/%26/g;
247              
248 38         111 return $_[0];
249             }
250             1;