File Coverage

GO/Parsers/go_ont_parser.pm

Criterion	Covered	Total	%
statement	170	186	91.4
branch	64	86	74.4
condition	18	21	85.7
subroutine	12	13	92.3
pod	0	7	0.0
total	264	313	84.3

line	stmt	bran	cond	sub	pod	time	code
1							# $Id: go_ont_parser.pm,v 1.17 2005/08/19 01:48:09 cmungall Exp $
2							#
3							#
4							# see also - http://www.geneontology.org
5							# - http://www.godatabase.org/dev
6							#
7							# You may distribute this module under the same terms as perl itself
8
9							package GO::Parsers::go_ont_parser;
10
11							=head1 NAME
12
13							GO::Parsers::go_ont_parser - syntax parsing of GO .ontology flat files
14
15							=head1 SYNOPSIS
16
17							do not use this class directly; use GO::Parser
18
19							=cut
20
21							=head1 DESCRIPTION
22
23							This generates Stag event streams from one of the various GO flat file
24							formats (ontology, defs, xref, associations). See GO::Parser for details
25
26							Examples of these files can be found at http://www.geneontology.org
27
28							A description of the event streams generated follows; Stag or an XML
29							handler can be used to catch these events
30
31							=head1 GO ONTOLOGY FILES
32
33							These files have the .ontology suffix. The stag-schema for the event
34							streams generated look like this:
35
36							(ontology
37							(source
38							(source_type "s")
39							(source_path "s")
40							(source_mtime "i"))
41							(term+
42							(id "s")
43							(name "s")
44							(is_root? "i")
45							(relationship+
46							(type "s")
47							(to "s"))
48							(dbxref*
49							(dbname "s")
50							(acc "s"))
51							(synonym* "s")
52							(secondaryid* "s")
53							(is_obsolete? "i")))
54
55
56							=head1 AUTHOR
57
58							=cut
59
60	5			5		31	use Exporter;
	5					9
	5					273
61	5			5		30	use base qw(GO::Parsers::base_parser);
	5					10
	5					2337
62	5			5		2526	use GO::Parsers::ParserEventNames;
	5					16
	5					1416
63
64	5			5		35	use Carp;
	5					8
	5					310
65	5			5		26	use FileHandle;
	5					9
	5					43
66	5			5		2515	use strict qw(vars refs);
	5					25
	5					15549
67
68							sub dtd {
69	0			0	0	0	'go_ont-parser-events.dtd';
70							}
71
72							sub regesc {
73	66			66	0	77	my $code = shift;
74	66	100				164	if ($code eq '\|') {
75	3					5	$code = '\\|';
76							}
77	66	100				125	if ($code eq '*') {
78	2					3	$code = '\*';
79							}
80	66	50				125	if ($code eq '?') {
81	0					0	$code = '\?';
82							}
83	66					204	$code;
84							}
85
86							sub reln_regexp {
87	14	50		14	0	26	join("\|", map {" ".regesc($_)." "} keys %{shift \|\| {}});
	66					121
	14					82
88							}
89
90							sub parse_fh {
91	6			6	0	1135	my ($self, $fh) = @_;
92
93	6					28	my $file = $self->file;
94
95	6					37	my $is_go;
96	6					43	my %typemap =
97							('$'=>'is_a',
98							'%'=>'is_a',
99							'<'=>'part_of',
100							'~'=>'derives_from',
101							);
102	6					36	my $reln_regexp = reln_regexp(\%typemap);
103	6					20	my $lnum = 0;
104	6					14	my @stack = ();
105	6					12	my $obs_depth;
106
107	6					54	my $usc = $self->replace_underscore;
108	6					59	$self->start_event(OBO);
109	6					399	my %rtypenameh = ();
110	6					54	$self->fire_source_event($file);
111	6					49	$self->handler->{ontology_type} =
112							$self->force_namespace;
113	6					46	my $root_id;
114
115							PARSELINE:
116	6					214	while (<$fh>) {
117							# UNICODE causes problems for XML and DB
118							# delete 8th bit
119	344					614	tr [\200-\377]
120							[\000-\177]; # see 'man perlop', section on tr/
121							# weird ascii characters should be excluded
122	344					518	tr/\0-\10//d; # remove weird characters; ascii 0-8
123							# preserve \11 (9 - tab) and \12 (10-linefeed)
124	344					500	tr/\13\14//d; # remove weird characters; 11,12
125							# preserve \15 (13 - carriage return)
126	344					434	tr/\16-\37//d; # remove 14-31 (all rest before space)
127	344					471	tr/\177//d; # remove DEL character
128	344					571	my $line = $_;
129	344					665	$line =~ s/\r//g;
130	344					589	chomp $line;
131	344					1604	$line =~ s/\s+$//;
132	344					369	++$lnum;
133	344					1111	$self->line($line);
134	344					28401	$self->line_no($lnum);
135	344	100				2197	if ($line =~ /^\!type:\s*(\S+)\s+(\S+)/) {
136	8					24	my ($code, $name) = ($1, $2);
137	8					15	$name =~ s/\s+/_/g;
138	8					13	$name = lc($name);
139	8	50				22	if ($name eq 'isa') {
140	0					0	$name = 'is_a';
141							}
142	8	50				17	if ($name eq 'partof') {
143	0					0	$name = 'part_of';
144							}
145	8	100				57	$typemap{$code} = $name
146							unless $code eq '%';
147							#$code = '\\'.$code;
148	8					18	$reln_regexp = reln_regexp(\%typemap);
149	8					35	next;
150							}
151	336	100				875	next if $line =~ /^\s*\!/; # comment
152	305	50				688	next if $line eq '\$'; #
153	305	50				670	next if $line eq ''; #
154	305	50				706	last if $line =~ /^\s\$\s$/; # end of file
155
156							# get rid of SGML directives, e.g. FADH2, as these confuse the relationship syntax
157	305					460	$line =~ s/<\/?[A-Za-z]+>//g;
158							# $line = &spellGreek ($line);
159	305					599	$line =~ s/&([a-z]+);/$1/g;
160
161	305					1101	$line =~ /( )(.)/;
162	305					832	my $body = $2;
163	305					542	my $indent = length($1);
164
165	305					375	my $is_obs = 0;
166	305		100			1606	while ((scalar @stack) &&
167							$stack[$#stack]->[0] >= $indent) {
168	273					418	pop @stack;
169	273	100	66			1880	if (defined($obs_depth) &&
170							$obs_depth >= $indent) {
171							# no longer under obsolete node
172	2					12	$obs_depth = undef;
173							}
174							}
175
176	305					385	my $rchar;
177	305	50				705	if ($body =~ /^(\@\w+\@)(.*)/) {
178	0					0	$rchar = $self->typemap($1,\%typemap);
179	0					0	$body = $2;
180	0					0	$reln_regexp = ' \@\w+\@ ';
181							}
182							else {
183	305					1937	$rchar = $self->typemap(substr($body, 0, 1),\%typemap);
184	305					851	$body = substr($body, 1);
185							}
186							# +++++++++++++++++++++++++++++++++
187							# parse body / main content of line
188							# +++++++++++++++++++++++++++++++++
189	305					382	my $currxref;
190	305					3143	my @parts = split(/($reln_regexp)/, $body);
191	305					958	for (my $i=0; $i < @parts; $i+=2) {
192	327					3116	my $part = $parts[$i];
193	327					2339	my ($name, @xrefs) =
194							split(/\s*;\s+/, $part);
195	327					1139	$name = $self->unescapego($name);
196	327	50				811	if ($usc) {
197	0					0	$name =~ s/_/$usc/g;
198							}
199	327	100	66			1040	if ($name =~ /^obsolete/i && $i==0) {
200	2					3	$obs_depth = $indent;
201							}
202	327	100				707	if ($name eq "Gene_Ontology") {
203	4					7	$is_go =1;
204							}
205	327	100				629	if (defined($obs_depth)) {
206							# set obsolete flag if we
207							# are anywhere under the obsolete node
208	6					7	$is_obs = 1;
209							}
210	327	100	100			1846	if ($indent < 2 && $is_go) {
		100
		50
211	10	50				48	$self->handler->{ontology_type} = $name
212							unless $self->force_namespace;
213							}
214							elsif ($indent < 1) {
215	4	50				14	$self->handler->{ontology_type} = $name
216							unless $self->force_namespace;
217							}
218							elsif (!$self->handler->{ontology_type}) {
219	0					0	$self->handler->{ontology_type} = $name;
220							}
221							else {
222							}
223
224	327					2602	my $pxrefstr = shift @xrefs;
225	327	50				718	if (!$pxrefstr) {
226	0					0	$pxrefstr = '';
227	0					0	$self->parse_err("no ID");
228	0					0	next PARSELINE;
229							}
230							# get the GO id for this line
231	327					879	my ($pxref, @secondaryids) =
232							split(/,\s+/, $pxrefstr);
233	327	100				746	if ($i==0) {
234	305					381	$currxref = $pxref;
235	305	100				771	if ($currxref =~ /\s/) {
236	2					7	my $msg = "\"$pxref\" doesn't look valid";
237	2					15	$self->parse_err($msg);
238							}
239	305					12326	my $a2t = $self->acc2name_h;
240	305					627	my $prevname = $a2t->{$currxref};
241	305	100	100			747	if ($prevname &&
242							$prevname ne $name) {
243	2					9	my $msg = "clash on $pxref; was '$prevname' now '$name'";
244	2					10	$self->parse_err($msg);
245	2					1040	next PARSELINE;
246							}
247	303	100	100			726	if ($prevname && $indent) {
248							# seen before - no new data, skip to avoid repeats
249	8					66	next PARSELINE;
250							}
251	295					900	$a2t->{$currxref} = $name;
252	295	100				597	$root_id = $currxref if !$indent;
253	295					456	$a2t->{$currxref} = $name;
254	295					950	$self->start_event(TERM);
255	295					9208	$self->event(ID, $currxref);
256	295					27205	$self->event(NAME, $name);
257	295	100				24424	$self->event(IS_OBSOLETE, $is_obs) if $is_obs;
258	295	100				937	$self->event(IS_ROOT, 1) if !$indent;
259	295	50				1389	$self->event(NAMESPACE, $self->handler->{ontology_type})
260							if $self->handler->{ontology_type};
261	15					551	map {
262	295					27442	$self->event(ALT_ID, $_);
263							} @secondaryids;
264							}
265							# map {
266							# $self->start_event("secondaryid");
267							# $self->event("id", $_);
268							# $self->end_event("secondaryid");
269							# } @secondaryids;
270	317	100				1385	if ($i == 0) {
271							# first part on line has main
272							# info for this term
273	295					1193	foreach my $xref (@xrefs) {
274	64					2407	my ($db,@rest) =
275							split(/:/,$xref);
276	64					279	my $dbacc = $self->unescapego(join(":", @rest));
277	64	100				191	if ($db eq "synonym") {
278	26					181	$self->event(SYNONYM, [[synonym_text=>$dbacc],
279							[type=>'related']]);
280							}
281
282							# elsif ($dbacc =~ /\s/) {
283							# # db accessions should not have
284							# # spaces in them - this
285							# # indicates that there is a problem;
286							# # eg synonym spelled wrongly
287							# # [MetaCyc accessions have spaces!]
288							# my $msg =
289							# "ignoring $db:$dbacc - doesn't look like accession";
290							# $self->parse_err({msg=>$msg,
291							# line_no=>$lnum,
292							# line=>$line,
293							# file=>$file});
294							# }
295							else {
296	38					228	$self->event(XREF_ANALOG, [[dbname => $db], [acc => $dbacc]]);
297							}
298							}
299							} else {
300							# other parts on line
301							# have redundant info,
302							# but the relationship
303							# part is useful
304	22					75	my $rchar = $self->typemap($parts[$i-1],\%typemap);
305	22	50				60	if (!$pxref) {
306	0					0	$self->parse_err("problem with $name $currxref: rel $rchar has no parent/object");
307							} else {
308	22					54	$self->relationship_event($rchar, $pxref);
309							}
310							}
311							}
312							#$line =~ s/\\//g;
313							# end of parse body
314	295	100				13644	if (@stack) {
315	288					553	my $up = $stack[$#stack];
316	288					467	my $obj = $up->[1];
317	288	50				497	if (!$obj) {
318	0					0	$self->parse_err("problem with $currxref: rel $rchar has no parent/object [top of stack is @$up]");
319							} else {
320	288					780	$self->relationship_event($rchar, $up->[1]);
321	288					30155	$rtypenameh{$rchar} = 1;
322							}
323							} else {
324							# $self->event("rel", "isa", "TOP");
325							}
326	295					955	$self->end_event(TERM);
327	295					10608	push(@stack, [$indent, $currxref]);
328							}
329	6					112	$self->pop_stack_to_depth(1);
330	6					175	foreach my $rtypename (keys %rtypenameh) {
331	11	100				1778	next if $rtypename eq 'is_a';
332	6					41	$self->event(TYPEDEF, [
333							[id=>$rtypename],
334							[name=>$rtypename],
335							[domain=>$root_id],
336							[range=>$root_id],
337							]);
338							}
339	6					40	$self->pop_stack_to_depth(0);
340	6					431	$self->parsed_ontology(1);
341							}
342
343							sub relationship_event {
344	310			310	0	366	my $self = shift;
345	310					431	my $rchar = shift;
346	310					434	my $to = shift;
347
348	310	100	66			1129	if ($rchar eq 'is_a' \|\|
349							$rchar eq 'isa') {
350	273					926	$self->event(IS_A, $to);
351							}
352							else {
353	37					238	$self->event(RELATIONSHIP,
354							[[type => $rchar],
355							[to=>$to]]);
356							}
357							}
358
359
360							sub typemap {
361	327			327	0	501	my $self = shift;
362	327					650	my $ch = shift;
363	327	50				420	my %typemap = %{shift \|\| {}};
	327					2266
364	327					1530	$ch =~ s/^ *//g;
365	327					1343	$ch =~ s/ *$//g;
366	327	50				839	if ($typemap{$ch}) {
		0
		0
367	327					563	$ch = $typemap{$ch};
368							}
369							elsif ($typemap{'\\'.$ch}) {
370	0					0	$ch = $typemap{$ch};
371							}
372							elsif ($ch =~ /^\@(\w+)\@/) {
373	0					0	$ch = lc($1);
374							}
375							else {
376							}
377	327					542	$ch =~ s/isa/is_a/;
378	327					434	$ch =~ s/partof/part_of/;
379	327					1033	$ch;
380							}
381
382							sub unescapego {
383	391			391	0	534	my $self = shift;
384	391					453	my $ch = shift;
385	391					562	$ch =~ s/\\//g;
386	391					775	$ch;
387
388							}
389
390							1;