File Coverage

blib/lib/Text/Corpus/Inspec/Document.pm
Criterion Covered Total %
statement 23 92 25.0
branch 0 18 0.0
condition 0 12 0.0
subroutine 7 15 46.6
pod 6 6 100.0
total 36 143 25.1


line stmt bran cond sub pod time code
1             package Text::Corpus::Inspec::Document;
2              
3 1     1   7 use strict;
  1         2  
  1         46  
4 1     1   6 use warnings;
  1         2  
  1         34  
5 1     1   1481 use File::Slurp;
  1         20968  
  1         114  
6 1     1   1180 use Lingua::EN::Sentence qw(get_sentences);
  1         3904  
  1         98  
7              
# Compile-time package setup: the version string, Exporter inheritance and
# the export lists. All export lists are empty, so nothing is exported.
use Exporter ();
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

BEGIN
{
    $VERSION     = '1.00';
    @ISA         = ('Exporter');
    @EXPORT      = ();
    @EXPORT_OK   = ();
    %EXPORT_TAGS = ();
}
17              
18             #12345678901234567890123456789012345678901234
19             #Parse Inspec abstract for research.
20              
21             =head1 NAME
22              
23             C<Text::Corpus::Inspec::Document> - Parse Inspec abstract for research.
24              
25             =head1 SYNOPSIS
26              
27             use Text::Corpus::Inspec;
28             use Text::Corpus::Inspec::Document;
29             use Data::Dump qw(dump);
30             my $corpus = Text::Corpus::Inspec->new (corpusDirectory => $corpusDirectory);
31             my $document = $corpus->getDocument (index => 0);
32             dump $document->getBody;
33             dump $document->getCategories;
34             dump $document->getContent;
35             dump $document->getTitle;
36             dump $document->getUri;
37              
38             =head1 DESCRIPTION
39              
40             C<Text::Corpus::Inspec::Document> provides methods for accessing specific
41             portions of Inspec abstracts for researching and testing of information
42             processing methods.
43              
44             =head1 CONSTRUCTOR
45              
46             =head2 C<new>
47              
48             The method C<new> creates an instance of the C<Text::Corpus::Inspec::Document> class with the following
49             parameters:
50              
51             =over
52              
53             =item C<filename> or C<uri>
54              
55             filename => '...' or uri => '...'
56              
57             C<filename> or C<uri> must be the path name to the corpus document to be parsed. If the file
58             does not exist, C<undef> is returned. The path provided is returned by L</getUri>.
59              
60             =back
61              
62             =cut
63              
# Constructor. Parses one Inspec abstract file into title, body and
# category lists.
#
# Parameters (named):
#   filename or uri - path of the abstract file to parse. 'filename' is
#                     used when both are defined.
#
# Returns the new object, or undef if no path was given or the path is not
# an existing plain file.
sub new
{
    # create the class object.
    my ($Class, %Parameters) = @_;
    my $Self = bless {}, ref($Class) || $Class;

    # the path may arrive under either name; take the first defined one.
    my ($filename) = grep { defined $_ } @Parameters{qw(filename uri)};

    # make sure the file exists; undef is the documented failure value.
    return undef unless defined $filename;
    return undef unless -f $filename;

    # read in the file contents as a list of lines.
    my @lines = read_file ($filename);

    # get the title: the first line plus every directly following line that
    # starts with whitespace.
    my $title = shift @lines;
    $title = '' unless defined $title;    # the file was empty.
    my $line = shift @lines;
    while (defined ($line) && ($line =~ m/^\s+/))
    {
        $title .= $line;
        $line = shift @lines;
    }
    $title =~ s/[\x00-\x20]+/ /g;
    $title =~ s/^\s+//;
    $title =~ s/\s+$//;

    # fall back to an empty list so the dereference below cannot fail.
    $title = get_sentences ($title) || [];
    $Self->{title} = $title;

    # join the remaining lines into the content; $line is undef when the
    # file held nothing after the title, so filter it out.
    my $content = join ('', grep { defined $_ } ($line, @lines));
    $content =~ s/[\x00-\x20]+/ /g;

    # parse out the sentences.
    my $sentences = get_sentences ($content) || [];
    $Self->{body} = $sentences;

    # store the list of all sentences, title first.
    $Self->{content} = [@$title, @$sentences];

    # read in the uncontrolled and controlled categories. The hash keys are
    # spelled 'contolled' because getCategories reads them under that name.
    $Self->{categories_uncontolled} = _readCategories ($filename, 'uncontr');
    $Self->{categories_contolled} = _readCategories ($filename, 'contr');

    # build the list of all categories, merging names that differ only in case.
    my %allCategories = map {(lc $_, $_)} (@{$Self->{categories_uncontolled}}, @{$Self->{categories_contolled}});
    $Self->{categories_all} = [sort values %allCategories];

    # store the uri of the document.
    $Self->{uri} = $filename;

    return $Self;
}

# reads the category file that belongs to an abstract file.
# $AbstractFile is the path of the abstract; its last five characters (the
# length of 'abstr') are replaced with $Extension to name the category file.
# returns the categories as an array reference, which is empty when the
# category file cannot be read.
sub _readCategories
{
    my ($AbstractFile, $Extension) = @_;

    # create the name of the category file.
    my $categoryFile = $AbstractFile;
    substr ($categoryFile, -length ('abstr')) = $Extension;

    # read_file croaks on failure unless told otherwise; 'quiet' makes it
    # return undef instead, so a missing file gives an empty list.
    my $text = read_file ($categoryFile, err_mode => 'quiet');
    return [] unless defined $text;

    return _normalizeCategories ($text);
}
146              
# parses a string of semicolon separated categories into an array of strings.
# control characters and whitespace runs are collapsed to one blank, each
# category is trimmed, empty entries are dropped, and categories that differ
# only in case are merged.
# returns the sorted list of categories as an array reference; the list is
# empty when the string is undefined or holds no categories.
sub _normalizeCategories
{
    my $Categories = shift;

    # nothing to parse.
    return [] unless defined $Categories;

    $Categories =~ s/[\x00-\x20]+/ /g;

    # trim each piece and skip the empty ones left behind by blank input or
    # by leading and doubled semicolons.
    my @categories;
    foreach my $piece (split (/;/, $Categories))
    {
        my $category = $piece;
        $category =~ s/^\s+//;
        $category =~ s/\s+$//;
        push @categories, $category if length $category;
    }

    # remove duplicate categories, ignoring case. the list is sorted first so
    # the spelling kept for a group of duplicates is always the one that
    # sorts last.
    my %uniqueCategories = map {(lc $_, $_)} sort @categories;

    return [sort values %uniqueCategories];
}
167              
168             =head1 METHODS
169              
170             =head2 C<getBody>
171              
172             getBody ()
173              
174             C<getBody> returns an array reference of strings of sentences that are the body of the article.
175              
176             =cut
177              
# returns the array reference of body sentences held under the 'body' key.
sub getBody
{
    my ($Self) = @_;

    my $body = $Self->{body};
    return $body;
}
183              
184             =head2 C<getCategories>
185              
186             getCategories (type => 'all')
187              
188             The method C<getCategories> returns an array reference of strings that are the
189             categories assigned to the document. The C<type> must be either
190             C<'all'>, C<'controlled'>, or C<'uncontrolled'>, which specify the set of
191             categories to be returned. C<'uncontrolled'> categories are those assigned to the
192             document by an editor without machine assistance; whereas C<'controlled'> categories
193             were assigned with machine assistance. The option C<'all'> returns the union of the
194             categories under C<'controlled'> and C<'uncontrolled'>. The default is C<'all'>.
195              
196             =cut
197              
# returns an array reference of the categories assigned to the document.
#
# Parameters (named):
#   type - 'all', 'controlled' or 'uncontrolled'; only the first letter is
#          examined, ignoring case. any other value, or no value, selects
#          'all'.
sub getCategories
{
    # unpack all of @_; taking only the first element would leave
    # %Parameters empty and the requested type would be ignored.
    my ($Self, %Parameters) = @_;

    # set the category type from the first letter of the requested type.
    my $type = 'a';
    if (defined ($Parameters{type}) && length ($Parameters{type}))
    {
        $type = lc substr ($Parameters{type}, 0, 1);
    }

    # return the list of categories. the keys are spelled 'contolled'
    # because that is the spelling the constructor stores them under.
    return $Self->{categories_contolled} if $type eq 'c';
    return $Self->{categories_uncontolled} if $type eq 'u';
    return $Self->{categories_all};
}
220              
221             =head2 C<getContent>
222              
223             getContent ()
224              
225             C<getContent> returns an array reference of strings of sentences that form the
226             content of the article, the title and body of the article.
227              
228             =cut
229              
# returns the array reference held under the 'content' key: the title
# sentences followed by the body sentences.
sub getContent
{
    my ($Self) = @_;

    my $content = $Self->{content};
    return $content;
}
235              
236             =head2 C<getTitle>
237              
238             getTitle ()
239              
240             C<getTitle> returns an array reference of strings, usually one, of the title of the article.
241              
242             =cut
243              
# returns the array reference of title sentences held under the 'title' key.
sub getTitle
{
    my ($Self) = @_;

    my $title = $Self->{title};
    return $title;
}
249              
250             =head2 C<getUri>
251              
252             getUri ()
253              
254             C<getUri> returns the URI of the document.
255              
256             =cut
257              
# returns the value held under the 'uri' key.
sub getUri
{
    my ($Self) = @_;

    my $uri = $Self->{uri};
    return $uri;
}
263              
# destructor. the object holds only plain Perl data, so there is nothing to
# release by hand; the sub is kept so a DESTROY method is always defined.
sub DESTROY
{
    return;
}
269              
270             =head1 INSTALLATION
271              
272             For installation instructions see L<Text::Corpus::Inspec>.
273              
274             =head1 AUTHOR
275              
276             Jeff Kubina
277              
278             =head1 COPYRIGHT
279              
280             Copyright (c) 2009 Jeff Kubina. All rights reserved.
281             This program is free software; you can redistribute
282             it and/or modify it under the same terms as Perl itself.
283              
284             The full text of the license can be found in the
285             LICENSE file included with this module.
286              
287             =head1 KEYWORDS
288              
289             inspec, english corpus, information processing
290              
291             =head1 SEE ALSO
292              
293             L<File::Slurp>, L<Lingua::EN::Sentence>, L<Text::Corpus::Inspec>
294              
295             =cut
296              
297             1;
298             # The preceding line will help the module return a true value
299