File Coverage

blib/lib/Gwybodaeth/Tokenize.pm
Criterion Covered Total %
statement 41 41 100.0
branch 6 6 100.0
condition 8 9 88.8
subroutine 5 5 100.0
pod 2 2 100.0
total 62 63 98.4


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3 5     5   22653 use warnings;
  5         10  
  5         151  
4 5     5   28 use strict;
  5         10  
  5         2570  
5              
6             package Gwybodaeth::Tokenize;
7              
8             =head1 NAME
9              
10             Tokenize - Split up data on whitespace into tokens.
11              
12             =head1 SYNOPSIS
13              
14             use Tokenize;
15              
16             my $t = Tokenize->new();
17              
18             $t->tokenize($data);
19              
20             =head1 DESCRIPTION
21              
22             This module tokenizes data, where a token is delimited by whitespace.
23              
24             =over
25              
26             =item new()
27              
28             Reterns an instance of the class.
29              
30             =cut
31              
32             sub new {
33 5     5 1 78 my $class = shift;
34 5         12 my $self = {};
35 5         14 bless $self, $class;
36 5         18 return $self;
37             }
38              
39             =item tokenize($data)
40              
41             Tokenizes the data supplied in the array reference $data.
42              
43             =cut
44              
45             # Takes a reference to the input data as a parameter.
46             sub tokenize {
47 10     10 1 732 my($self, $data) = @_;
48              
49 10         13 my @tokenized;
50              
51 10         15 for (@{ $data }) {
  10         24  
52 34         85 for (split /\s+/x) {
53 93 100       258 next if /
54             # string is entirly whitespace or empty
55             ^\s*$/x;
56 83         181 push @tokenized, $_;
57             }
58             }
59              
60 10         35 return $self->_tokenize_clean(\@tokenized);
61             }
62              
63             # Takes a reference to the data which needs to be cleaned
64             sub _tokenize_clean {
65 10     10   92 my($self, $data) = @_;
66              
67 10         14 for my $i (0..$#{ $data }) {
  10         31  
68            
69 85 100       73 next if (not defined ${ $data }[$i]);
  85         180  
70            
71             # If a token begins with '<' but doesn't end with '>'
72             # then the token has been split up.
73 83 100 100     80 if ((${$data}[$i] =~ /^\< # line begins with a opening square bracket/x
  83   100     261  
  4   66     98  
74             &&
75 82         266 ${$data}[$i] =~ /[^\>]$ # line doesn't end with a closing square
76             # bracket
77             /x)
78             ||
79             # If the token begins but doesn't end with " the token may
80             # have been split up
81 8         47 (${$data}[$i] =~ /^\" # line begins with a double quote/x
82             &&
83             ${$data}[$i] =~ /
84             [^\"]$ # line doesn't end with a double quote
85             /x))
86             {
87             # Concatinate the next line to the current
88             # partial token. We add a space inbetween to repair from
89             # the split operation.
90 2         4 ${ $data }[$i] .= " ${ $data }[$i+1]";
  2         4  
  2         5  
91              
92             # Re-index the token list to take into account the last
93             # concatination.
94 2         5 for my $j (($i+1)..($#{ $data }-1)) {
  2         7  
95 4         6 ${ $data }[$j] = ${ $data }[$j + 1];
  4         10  
  4         6  
96             }
97            
98             # The last data element should now be deleted
99             # as the data has been shifted up one in the
100             # list.
101 2         4 delete ${ $data }[$#{ $data }];
  2         5  
  2         3  
102              
103 2         4 redo; # try again in case the token is split onto more than 2 lines
104             }
105             }
106 10         50 return $data;
107             }
108             1;
109             __END__