File Coverage

blib/lib/OWL/Simple/Parser.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             =head1 NAME
4              
5             OWL::Simple::Parser
6              
7             =head1 SYNOPSIS
8              
9             use OWL::Simple::Parser;
10            
11             # load Experimental Factor Ontology (http://www.ebi.ac.uk/efo/efo.owl)
12             my $parser = OWL::Simple::Parser->new( owlfile => 'efo.owl',
13             synonym_tag => 'efo:alternative_term',
14             definition_tag => 'efo:definition' );
15            
16             # parse file
17             $parser->parse();
18            
19             # iterate through all the classes
20             for my $id (keys %{ $parser->class }){
21             my $OWLClass = $parser->class->{$id};
22             print $id . ' ' . $OWLClass->label . "\n";
23            
24             # list synonyms
25             for my $syn (@{ $OWLClass->synonyms }){
26             print "\tsynonym - $syn\n";
27             }
28            
29             # list definitions
30             for my $def (@{ $OWLClass->definitions }){
31             print "\tdef - $def\n";
32             }
33            
34             # list parents
35             for my $parent (@{ $OWLClass->subClassOf }){
36             print "\tsubClassOf - $parent\n";
37             }
38             }
39              
40             =head1 DESCRIPTION
41              
42             A simple OWL parser that loads accessions, labels and synonyms and exposes them
43             as a collection of OWL::Simple::Class objects.
44              
45             This module wraps XML::Parser, which is a sequential event-driven XML parser that
46             can potentially handle very large XML documents. The whole XML structure
47             is never loaded into memory completely, only the bits of interest.
48              
49             In the constructor specify the owlfile to be loaded and two optional tags -
50             synonym_tag and definition_tag - that define custom annotations in the ontology for
51             synonyms and definitions respectively. Note both tags have to be fully
52             specified exactly as in the OWL XML to be loaded, e.g. FULL_SYN for NCI Thesaurus
53             or efo:alternative_term for EFO.
54              
55             =head2 METHODS
56              
57             =over
58              
59             =item class_count()
60              
61             Number of classes loaded by the parser.
62              
63             =item synonyms_count()
64              
65             Number of synonyms loaded by the parser.
66              
67             =item version()
68              
69             Version of the ontology extracted from the owl:versionInfo.
70              
71             =item class
72              
73             Hash collection of all the OWL::Simple::Class objects
74              
75             =back
76              
77             =head1 AUTHOR
78              
79             Tomasz Adamusiak <tomasz@cpan.org>
80              
81             =head1 COPYRIGHT AND LICENSE
82              
83             Copyright (c) 2010-2011 European Bioinformatics Institute. All Rights Reserved.
84              
85             This module is free software; you can redistribute it and/or modify it
86             under lGPLv3.
87              
88             This software is provided "as is" without warranty of any kind.
89              
90             =cut
91              
92             package OWL::Simple::Parser;
93              
94 1     1   26412 use Moose 0.89;
  0            
  0            
95             use OWL::Simple::Class;
96             use XML::Parser 2.34;
97             use Data::Dumper;
98             use Log::Log4perl qw(:easy);
99             Log::Log4perl->easy_init( { level => $INFO, layout => '%-5p - %m%n' } );
100              
our $VERSION = 1.01;

# ---------------------------------------------------------------------------
# Public attributes
# ---------------------------------------------------------------------------

# Path of the OWL file handed to XML::Parser; the only mandatory argument.
has owlfile => (
    is       => 'rw',
    isa      => 'Str',
    required => 1,
);

# Hash of parsed OWL::Simple::Class objects, keyed by class id.
has class => (
    is      => 'ro',
    isa     => 'HashRef',
    default => sub { return {} },
);

# Running totals maintained by incr_classes() / incr_synonyms().
has class_count => (
    is      => 'rw',
    isa     => 'Int',
    default => 0,
);
has synonyms_count => (
    is      => 'rw',
    isa     => 'Int',
    default => 0,
);

# Text collected from /rdf:RDF/owl:Ontology/owl:versionInfo.
has version => (
    is      => 'rw',
    isa     => 'Str',
    default => '',
);

# Fully qualified element names of the custom synonym and definition
# annotations, compared against the element path during parsing.
has synonym_tag => (
    is      => 'rw',
    isa     => 'Str',
    default => 'efo:alternative_term',
);
has definition_tag => (
    is      => 'rw',
    isa     => 'Str',
    default => 'efo:definition',
);

# ---------------------------------------------------------------------------
# Parsing state
#
# These are file-scoped lexicals, not attributes, so they are shared by
# every OWL::Simple::Parser object living in the same process.
# ---------------------------------------------------------------------------

# XML::Parser instance, created in BUILD.
my $parser;

# Slash-separated path of the elements currently open, e.g.
# '/rdf:RDF/owl:Class/rdfs:label'.
my $path = '';

# Class object currently being filled in by the handlers.
my $class = OWL::Simple::Class->new();

# Property type and target classes of the owl:Restriction being read.
my %restriction;
118              
# Moose construction hook. Creates the XML::Parser instance and routes its
# Start, End and Char events to this object's handler methods.
#
# NOTE: the parser is stored in the file-scoped $parser, so each newly built
# object replaces the parser installed by the previous one.

sub BUILD {
    my ($self) = @_;

    # Plain class-method call; the indirect-object form ("new XML::Parser")
    # is ambiguous to the Perl parser and is best avoided.
    $parser = XML::Parser->new;

    $parser->setHandlers(
        Start => sub { $self->startElement(@_) },
        End   => sub { $self->endElement(@_) },
        Char  => sub { $self->characterData(@_) },
    );
}
130              
# Counter helpers: incr_classes() and incr_synonyms() bump the running totals
# of classes and synonyms seen by the parser.

# Adds one to class_count and returns whatever the accessor returns.
# Declared without a prototype: it is a method and always receives $self.
sub incr_classes {
    my ($self) = @_;
    return $self->class_count( $self->class_count + 1 );
}
137              
# Adds one to synonyms_count and returns whatever the accessor returns.
# Declared without a prototype: it is a method and always receives $self.
sub incr_synonyms {
    my ($self) = @_;
    return $self->synonyms_count( $self->synonyms_count + 1 );
}
142              
# Main entry point. Parses the file named by the owlfile attribute with
# XML::Parser and logs a one-line summary of what was loaded.
#
# Returns 1. Errors raised by XML::Parser while reading the file propagate
# to the caller.

sub parse {
    my ($self) = @_;

    # The XML::Parser lives in a file-scoped variable shared by all objects,
    # and its handlers were bound to whichever object was built last. Bind
    # them to the invoking object so its classes and counters are the ones
    # that get filled.
    $parser->setHandlers(
        Start => sub { $self->startElement(@_) },
        End   => sub { $self->endElement(@_) },
        Char  => sub { $self->characterData(@_) },
    );

    # Start from an empty element path. A previous parse that died half-way
    # would otherwise leave a stale path behind and no later path comparison
    # could match.
    $path = '';

    $parser->parsefile( $self->owlfile );

    INFO(   'LOADED '
          . $self->class_count
          . ' CLASSES AND '
          . $self->synonyms_count
          . ' SYNONYMS from '
          . $self->owlfile );

    return 1;
}
157              
# Start-tag handler executed by XML::Parser.
#
# Appends the element name to $path; characterData() and endElement() use
# that path to decide what to do with the text and the closing tag.
#
# On a top-level owl:Class a new OWL::Simple::Class object is created and
# stored in $class, to be populated by the other handlers. Parents and
# restriction details are read from the attributes of nested elements.
#
# Arguments: the XML::Parser instance, the element name, and the element's
# attributes as a flat name/value list.

sub startElement {
    my ( $self, $parseinst, $element, %attr ) = @_;
    DEBUG("->startElement $self, $parseinst, $element");

    $path .= '/' . $element;    # add element to path

    if ( $path eq '/rdf:RDF/owl:Class' ) {
        $self->incr_classes();

        # Progress message every 1000 classes
        INFO( "Loaded " . $self->class_count . " classes from "
              . $self->owlfile )
          if $self->class_count % 1000 == 0;

        # rdf:ID takes precedence over rdf:about when both are present
        $class = OWL::Simple::Class->new();
        $class->id( $attr{'rdf:about'} ) if defined $attr{'rdf:about'};
        $class->id( $attr{'rdf:ID'} )    if defined $attr{'rdf:ID'};

        # Attribute names are case-sensitive: test 'rdf:ID', the same key
        # the id is read from. Testing 'rdf:id' never matched, so this
        # warning could never fire.
        WARN( 'DUPLICATE RDF:ID & RDF:ABOUT IN ' . $attr{'rdf:about'} )
          if defined $attr{'rdf:ID'} && defined $attr{'rdf:about'};
    }

    # Two ways to match parents, either as rdf:resource attribute
    # on rdfs:subClassOf or rdf:about on nested rdfs:subClassOf/owl:Class
    elsif ( $path eq '/rdf:RDF/owl:Class/rdfs:subClassOf' ) {
        push @{ $class->subClassOf }, $attr{'rdf:resource'}
          if defined $attr{'rdf:resource'};
    }
    elsif ( $path eq '/rdf:RDF/owl:Class/rdfs:subClassOf/owl:Class' ) {
        push @{ $class->subClassOf }, $attr{'rdf:about'}
          if defined $attr{'rdf:about'};
    }

    # Here we try to match relations, e.g. part_of, derives_from, etc.
    # A new restriction starts with no type and no target classes.
    elsif ( $element eq 'owl:Restriction' ) {
        $restriction{type}  = undef;
        $restriction{class} = [];
    }
    elsif ( $element eq 'owl:someValuesFrom' ) {
        push @{ $restriction{class} }, $attr{'rdf:resource'}
          if defined $attr{'rdf:resource'};
        push @{ $restriction{class} }, $attr{'rdf:about'}
          if defined $attr{'rdf:about'};
    }

    # Regex as properties can be transitive, etc.
    # rdf:resource wins over rdf:about when both are present.
    elsif ( $element =~ /owl:\w+Property$/ ) {
        $restriction{type} = $attr{'rdf:about'}
          if defined $attr{'rdf:about'};
        $restriction{type} = $attr{'rdf:resource'}
          if defined $attr{'rdf:resource'};
    }
}
211              
# Text handler executed by XML::Parser.
#
# XML::Parser may deliver the text of one element in several pieces, so
# every piece is appended to what has been collected so far:
#   - rdfs:label text accumulates in $class->label;
#   - definition, definition citation and synonym text accumulates in
#     $class->annotation, which endElement() files into the right list
#     once the closing tag arrives;
#   - owl:versionInfo text accumulates in the parser's version attribute.
#
# Arguments: the XML::Parser instance and the chunk of character data.

sub characterData {
    my ( $self, $parseinst, $data ) = @_;
    DEBUG("->characterData $self, $parseinst, $data");

    # Class label
    if ( $path eq '/rdf:RDF/owl:Class/rdfs:label' ) {
        my $label_so_far = $class->label();
        $label_so_far = '' unless defined $label_so_far;
        $class->label( $label_so_far . $data );
    }

    # Definition or definition citation, matched by name or by the
    # custom tag given to the constructor
    elsif (
        $path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*(definition|definition_citation)\w*!
        || $path eq '/rdf:RDF/owl:Class/' . $self->definition_tag
      )
    {
        my $text_so_far = $class->annotation();
        $text_so_far = '' unless defined $text_so_far;
        $class->annotation( $text_so_far . $data );
    }

    # Synonym: any element with "synonym" or "alternative_term" in its
    # name, or the custom tag given to the constructor
    elsif (
        $path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*(synonym|alternative_term)\w*!
        || $path eq '/rdf:RDF/owl:Class/' . $self->synonym_tag
      )
    {
        my $text_so_far = $class->annotation();
        $text_so_far = '' unless defined $text_so_far;
        $class->annotation( $text_so_far . $data );

        WARN( "Unparsable synonym detected for " . $class->id )
          unless defined $data;

        # An earlier NCIt work-around that cut the text at an embedded
        # closing tag is disabled. FIXME: it is probably unnecessary now
        # that the pieces are concatenated, but this has not been checked.
    }

    # Ontology version
    elsif ( $path eq '/rdf:RDF/owl:Ontology/owl:versionInfo' ) {
        my $version_so_far = $self->version();
        $self->version( $version_so_far . $data );
    }
}
268              
# End-tag handler executed by XML::Parser.
#
# Files what the other handlers collected for the element being closed:
#   - a finished top-level owl:Class is stored in the class hash under its
#     id (owl:Thing is skipped);
#   - a finished owl:Restriction of type part_of adds its target classes
#     to the current class;
#   - text accumulated by characterData() is pushed onto the xrefs,
#     definitions or synonyms list of the current class.
# Afterwards the temporary annotation is cleared and the element is removed
# from $path.
#
# Arguments: the XML::Parser instance and the name of the closed element.

sub endElement {
    my ( $self, $parseinst, $element ) = @_;
    DEBUG("->endElement $self, $parseinst, $element");

    # Text collected for this element. It may arrive in several pieces, so
    # it is concatenated in characterData() and saved here. Checked for
    # definedness once, instead of comparing a possibly undefined value.
    my $annotation = $class->annotation;
    my $has_text   = defined $annotation && $annotation ne '';

    # Reached end of class, add the class to hash.
    # NOTE(review): the id is presumably undefined for a top-level owl:Class
    # carrying neither rdf:about nor rdf:ID; such a class is still stored,
    # under the empty-string key - confirm this is intended.
    if ( $path eq '/rdf:RDF/owl:Class'
        && $class->id ne "http://www.w3.org/2002/07/owl#Thing" )
    {
        WARN( 'Class ' . $class->id . ' possibly duplicated' )
          if defined $self->class->{ $class->id };
        $self->class->{ $class->id } = $class;
    }

    # Reached end of the relationship tag, add to appropriate array.
    # Currently supports only part_of, and even that poorly.
    # FIXME circular references
    elsif ( $element eq 'owl:Restriction' ) {
        if ( !defined $restriction{type} ) {

            # Warn and stop here; matching a regex against an undefined
            # type would only add an "uninitialized value" warning.
            WARN( "UNDEFINED RESTRICTION " . $class->id );
        }
        elsif ( $restriction{type} =~ m!/part_of$! ) {
            push @{ $class->part_of }, @{ $restriction{class} || [] };
        }
    }

    # The citation pattern is tested before the plain definition pattern.
    elsif ( $path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*definition_citation$! ) {
        push @{ $class->xrefs }, $annotation if $has_text;
    }
    elsif ($path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*definition$!
        || $path eq '/rdf:RDF/owl:Class/' . $self->definition_tag )
    {
        push @{ $class->definitions }, $annotation if $has_text;
    }
    elsif (
        $path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*(synonym|alternative_term)\w*!
        || $path eq '/rdf:RDF/owl:Class/' . $self->synonym_tag )
    {

        # Count a synonym only when one is actually stored, so that
        # synonyms_count matches the number of synonyms loaded.
        if ($has_text) {
            $self->incr_synonyms();
            push @{ $class->synonyms }, $annotation;
        }
    }

    # Diagnostic dump for a class whose annotation is undefined. Sent to the
    # debug log instead of being printed on STDOUT by library code.
    DEBUG( Dumper($class) ) unless defined $annotation;

    # clear temp annotation
    $class->annotation('');

    # Remove end element from path. \Q...\E makes the element name match
    # literally even if it contains regex metacharacters such as '.'.
    $path =~ s!/\Q$element\E$!!;
}
325              
326             1;