File Coverage

blib/lib/OBO/APO/GoaToRDF.pm

Criterion	Covered	Total	%
statement	91	91	100.0
branch	7	8	87.5
condition	2	3	66.6
subroutine	8	8	100.0
pod	2	3	66.6
total	110	113	97.3

line	stmt	bran	cond	sub	pod	time	code
1							# $Id: GoaToRDF.pm 2194 2008-08-07 12:46:25Z Erick Antezana $
2							#
3							# Module : GoaToRDF.pm
4							# Purpose : A GOA associations to RDF converter
5							# License : Copyright (c) 2006-2015 by ONTO-perl. All rights reserved.
6							# This program is free software; you can redistribute it and/or
7							# modify it under the same terms as Perl itself.
8							# Contact : CCO
9							#
10							package OBO::APO::GoaToRDF;
11
12							=head1 NAME
13
14							OBO::APO::GoaToRDF - A GOA associations to RDF converter.
15
16							=head1 DESCRIPTION
17
18							Converts a GOA association file to an RDF graph. The RDF graph is very simple,
19							containing a node for each line from the association file (called GOA_ASSOC_n),
20							and several triples for the fields (e.g. obj_symb).
21
22							GOA associations files can be obtained from http://www.ebi.ac.uk/GOA/proteomes.html
23
24							The method 'work' takes the path to an association file and a file handle to which the RDF graph is written.
25
26							=head1 AUTHOR
27
28							Mikel Egana Aranguren
29							mikel.egana.aranguren@gmail.com
30
31							=head1 COPYRIGHT AND LICENSE
32
33							Copyright (c) 2008 by Mikel Egana Aranguren
34
35							This library is free software; you can redistribute it and/or modify
36							it under the same terms as Perl itself, either Perl version 5.8.7 or,
37							at your option, any later version of Perl 5 you may have available.
38
39							=cut
40
41	1			1		6264	use OBO::Parser::GoaParser;
	1					3
	1					1002
42	1			1		5	use strict;
	1					2
	1					19
43	1			1		4	use warnings;
	1					2
	1					21
44	1			1		5	use Carp;
	1					1
	1					1242
45
46							sub new {
47	1			1	0	11	my $class = shift;
48	1					2	my $self = {};
49
50	1					41	bless ($self, $class);
51	1					3	return $self;
52							}
53
54							=head2 work
55
56							Usage - $GoaToRDF->work($input_file, $file_handle, $base, $ns);
57							Returns - RDF file handle
58							Args - 1. Full path to the GOA file
59							2. File handle for writing RDF
60							3. base URI (e.g. 'http://www.semantic-systems-biology.org/')
61							4. name space (e.g. 'SSB')
62							Function - converts an assoc. file to an RDF graph
63
64							=cut
65
66							sub work {
67	1			1	1	192	my $self = shift;
68
69							# Get the arguments
70							# my ($file_handle, $path_to_assoc_file) = @_;
71	1					3	my ( $path_to_assoc_file, $file_handle, $base, $namespace ) = @_; #vlmir
72							#
73							# Hard-coded evidence codes
74							#
75							#TODO the list is not complete anymore #vlmir
76	1					13	my %evidence_code_by_id = (
77							'IEA' => 'ECO_0000203',
78							'ND' => 'ECO_0000035',
79							'IDA' => 'ECO_0000002',
80							'IPI' => 'ECO_0000021',
81							'TAS' => 'ECO_0000033',
82							'NAS' => 'ECO_0000034',
83							'ISS' => 'ECO_0000041',
84							'IMP' => 'ECO_0000015',
85							'IC' => 'ECO_0000001',
86							'IGI' => 'ECO_0000011',
87							'IEP' => 'ECO_0000008',
88							'RCA' => 'ECO_0000053',
89							'IGC' => 'ECO_0000177',
90							'EXP' => 'ECO_0000006',
91							'IBA' => 'ECO_0000318',
92							'IRD' => 'ECO_0000321',
93							'IKR' => 'ECO_0000320',
94							'ISO' => 'ECO_0000201'
95							);
96
97							#
98							# Aspects
99							#
100	1					11	my %aspect = (
101							'P' => 'participates_in',
102							'C' => 'located_in',
103							'F' => 'has_function'
104							);
105
106							# For the ID
107	1					5	$path_to_assoc_file =~ /.\/(.)/; # get what is after the slash in the path...
108	1					3	my $f_name = $1;
109	1					3	(my $prefix_id = $f_name) =~ s/\.goa//;
110	1					3	$prefix_id =~ s/\./_/g;
111
112							# TODO: set all the NS and URI via arguments
113							# my $default_URL = "http://www.semantic-systems-biology.org/";
114	1					3	my $default_URL = $base; #vlmir
115	1					1	my $NS = $namespace;#vlmir
116	1					3	my $ns = lc ($NS);
117	1					2	my $rdf_subnamespace = "assoc";
118
119							# Preamble of RDF file
120	1					10	print $file_handle "\n";
121	1					2	print $file_handle "
122	1					2	print $file_handle "\txmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
123	1					2	print $file_handle "\txmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\"\n";
124	1					3	print $file_handle "\txmlns:".$ns."=\"".$default_URL.$NS."#\">\n";
125
126	1					8	my $GoaParser = OBO::Parser::GoaParser->new();
127	1					4	my $goaAssocSet = $GoaParser->parse($path_to_assoc_file);
128
129	1					2	my %prot_duplicated; # to add only one copy of the protein
130							my $buffer;
131
132	1					2	my $prot_space = "protein";
133	1					2	my $previous_protein = "";
134							# Chunk of RDF file
135	1					7	foreach ($goaAssocSet->get_set()) {
136	5					6	my %assoc = %{$_};
	5					41
137	5					13	my $current_protein = $assoc{OBJ_ID};
138
139	5	100	66			52	if ($previous_protein && $current_protein ne $previous_protein) { # flush the buffer
140	4					10	$buffer .= "\t\n";
141	4					24	print $file_handle $buffer;
142	4					6	$buffer = ""; # init
143							}
144
145							#
146							# the protein: (this should come from uniprot.rdf)
147							#
148	5					13	my $triple_prefix_id_assoc_id = "triple_".$prefix_id."_".$assoc{ASSC_ID};
149	5	100				15	if (!$prot_duplicated{$current_protein}) {
150	4					11	$buffer .= "\t<".$ns.":".$prot_space." rdf:about=\"#".$current_protein."\">\n";
151	4					15	$buffer .= "\t\t".&char_hex_http($assoc{OBJ_SYMB})."\n";
152	4					11	$buffer .= "\t\t<".$ns.":name xml:lang=\"en\">".&char_hex_http($assoc{OBJ_SYMB})."\n";
153	4					12	$buffer .= "\t\t<".$ns.":annot_src>".&char_hex_http($assoc{ANNOT_SRC})."\n";
154	4					8	my $t = $assoc{TAXON};
155	4					11	$t =~ s/taxon:/NCBI_/; # clean it
156	4					13	$buffer .= "\t\t<".$ns.":taxon>".$t."\n";
157	4					10	$buffer .= "\t\t<".$ns.":has_source rdf:resource=\"#".$t."\"/>\n";
158	4					10	$buffer .= "\t\t<".$ns.":type>".&char_hex_http($assoc{TYPE})."\n";
159	4					13	$buffer .= "\t\t<".$ns.":description>".&char_hex_http($assoc{DESCRIPTION})."\n";
160	4					11	$buffer .= "\t\t<".$ns.":obj_src>".&char_hex_http($assoc{OBJ_SRC})."\n\n";
161
162	4					11	$prot_duplicated{$current_protein} = 1;
163	4					7	$previous_protein = $current_protein;
164							}
165
166	5					11	my $goa_ns_prefix_id_assoc_id = "#GOA_".$prefix_id."_".$assoc{ASSC_ID};
167							#
168							# ASSOC:
169							#
170	5					11	print $file_handle "\t<".$ns.":".$rdf_subnamespace." rdf:about=\"".$goa_ns_prefix_id_assoc_id."\">\n";
171	5					17	print $file_handle "\t\t<".$ns.":date>".$assoc{DATE}."\n";
172	5					13	print $file_handle "\t\t<".$ns.":refer>".&char_hex_http($assoc{REFER})."\n";
173	5					13	print $file_handle "\t\t<".$ns.":sup_ref>".&char_hex_http($assoc{SUP_REF})."\n";
174	5					17	print $file_handle "\t\t<".$ns.":has_evidence rdf:resource=\"#".$evidence_code_by_id{$assoc{EVID_CODE}}."\"/>\n";
175	5					9	print $file_handle "\t\n";
176
177							#
178							# TRIPLE (version 1):
179							#
180							# print $file_handle "\t\n";
181							# print $file_handle "\t\t\n";
182							# print $file_handle "\t\t\n";
183							# print $file_handle "\t\t\n\n";
184							#
185							# print $file_handle "\t\t<".$ns.":supported_by rdf:resource=\"".$goa_ns_prefix_id_assoc_id."\"/>\n";
186							# print $file_handle "\t\n";
187
188							#
189							# TRIPLE (version 2):
190							#
191	5					8	print $file_handle "\t\n";
192	5					11	print $file_handle "\t\t<".$ns.":supported_by rdf:resource=\"".$goa_ns_prefix_id_assoc_id."\"/>\n";
193	5					6	print $file_handle "\t\n";
194
195							#
196							# flushing?
197							#
198	5	100				12	if ($current_protein eq $previous_protein) {
199	4					16	$buffer .= "\t\t<".$ns.":".$aspect{$assoc{ASPECT}}." rdf:ID=\"".$triple_prefix_id_assoc_id."\" rdf:resource=\"#".&char_hex_http($assoc{"GO_ID"})."\"/>\n";
200							}
201	5					21	$previous_protein = $current_protein;
202							}
203
204							#
205							# LAST FLUSH
206							#
207	1	50				5	if ($previous_protein) {
208	1					3	$buffer .= "\t\n";
209	1					2	print $file_handle $buffer;
210							}
211
212	1					2	print $file_handle "\n\n";
213	1					5	print $file_handle "";
214
215	1					32	return $file_handle;
216							}
217
218							sub __date {
219	1			1		31	my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
220	1					9	my $result = sprintf "%02d:%02d:%4d %02d:%02d", $mday,$mon+1,$year+1900,$hour,$min; # e.g. 11:05:2008 12:52
221							}
222
223							=head2 char_hex_http
224
225							Usage - char_hex_http($seq)
226							Returns - the sequence with the hexadecimal representation for the http special characters
227							Args - the sequence of characters (note: the argument is modified in place)
228							Function - Transforms a http special character to its equivalent one in hexadecimal. E.g. ; -> %3B (exception: ':' is replaced by '_' so that GO IDs read as GO_0000001)
229
230							=cut
231
232
233							sub char_hex_http {
234	38			38	1	83	$_[0] =~ s/:/_/g; # originally: $_[0] =~ s/:/%3A/g; but changed to get the GO IDs properly: GO_0000001
235	38					58	$_[0] =~ s/;/%3B/g;
236	38					54	$_[0] =~ s/
237	38					50	$_[0] =~ s/=/%3D/g;
238	38					49	$_[0] =~ s/>/%3E/g;
239	38					54	$_[0] =~ s/\?/%3F/g;
240
241							#number sign # 23 # --> # # --> #
242							#dollar sign $ 24 $ --> $ $ --> $
243							#percent sign % 25 % --> % % --> %
244
245	38					50	$_[0] =~ s/\//%2F/g;
246	38					49	$_[0] =~ s/&/%26/g;
247
248	38					111	return $_[0];
249							}
250							1;