File Coverage

blib/lib/Lingua/LO/NLP/Syllabify.pm
Criterion Covered Total %
statement 48 48 100.0
branch 6 6 100.0
condition n/a
subroutine 14 14 100.0
pod 3 3 100.0
total 71 71 100.0


line stmt bran cond sub pod time code
1             package Lingua::LO::NLP::Syllabify;
2 6     6   111519 use strict;
  6         15  
  6         149  
3 6     6   28 use warnings;
  6         10  
  6         131  
4 6     6   84 use 5.012000;
  6         21  
5 6     6   30 use utf8;
  6         11  
  6         45  
6 6     6   175 use feature 'unicode_strings';
  6         11  
  6         501  
7 6     6   220 use version 0.77; our $VERSION = version->declare('v1.0.1');
  6         1341  
  6         33  
8 6     6   488 use charnames qw/ :full lao /;
  6         10  
  6         40  
9 6     6   2761 use Carp;
  6         11  
  6         320  
10 6     6   1690 use Unicode::Normalize qw/ NFC /;
  6         7073  
  6         374  
11 6     6   790 use Class::Accessor::Fast 'antlers';
  6         5329  
  6         38  
12 6     6   1978 use Lingua::LO::NLP::Data ':all';
  6         14  
  6         2045  
13              
14             =encoding utf8
15              
16             =head1 NAME
17              
18             Lingua::LO::NLP::Syllabify - Segment Lao or mixed-script text into syllables.
19              
20             =head1 FUNCTION
21              
22             This implements a purely regular expression based algorithm to segment Lao text
23             into syllables, based on the one described in PHISSAMAY et al:
24             I<Syllabification of Lao Script for Line Breaking>.
25              
26             =cut
27              
28             has text => (is => 'ro');
29              
30             my $syl_re = Lingua::LO::NLP::Data::get_sylre_basic;
31             my $complete_syl_re = Lingua::LO::NLP::Data::get_sylre_full;
32              
33             =head1 METHODS
34              
35             =head2 new
36              
37             C<< new( $text, %options ) >>
38              
39             The constructor takes a mandatory argument containing the text to split, and
40             any number of hash-style named options. Currently, the only such option is
41             C<normalize> which takes a boolean argument and indicates whether to run the
42             text through a normalization function that swaps tone marks and vowels appearing
43             in the wrong order.
44              
45             Note that in any case text is passed through L<Unicode::Normalize/NFC> first
46             to obtain the Composed Normal Form. In pure Lao text, this affects only the
47             decomposed form of LAO VOWEL SIGN AM that will be transformed from C<U+0ECD>,
48             C<U+0EB2> to C<U+0EB3>.
49              
50             =cut
51              
52             sub new {
53 77     77 1 15425 my $class = shift;
54 77         139 my $text = shift;
55 77 100       368 croak("`text' argument missing or undefined") unless defined $text;
56 76         188 my %opts = @_;
57 76         3494 $text = NFC( $text );
58 76 100       259 normalize_tone_marks($text) if $opts{normalize};
59 76         424 return bless { text => $text }, $class
60             }
61              
62             =head2 get_syllables
63              
64             C<get_syllables()>
65              
66             Returns a list of Lao syllables found in the text passed to the constructor. If
67             there are any blanks, non-Lao parts etc. mixed in, they will be silently
68             dropped.
69              
70             =cut
71              
72             sub get_syllables {
73 22     22 1 632 return shift->text =~ m/($complete_syl_re)/og;
74             }
75              
76             =head2 get_fragments
77              
78             C<get_fragments()>
79              
80             Returns a complete segmentation of the text passed to the constructor as an
81             array of hashes. Each hash has two keys:
82              
83             =over 4
84              
85             =item C<text>
86              
87             The text of the respective fragment
88              
89             =item C<is_lao>
90              
91             If true, the fragment is a single valid Lao syllable. If
92             false, it may be whitespace, non-Lao script, Lao characters that don't
93             constitute valid syllables - basically anything at all that's I<not> a valid
94             syllable.
95              
96             =back
97              
98             =cut
99              
100             sub get_fragments {
101 53     53 1 141 my $self = shift;
102 53         1088 my $t = $self->text;
103 53         363 my @matches;
104 53         2979 while($t =~ /\G($complete_syl_re | .+?(?=$complete_syl_re|$) )/oxgcs) {
105 122 100       4122 unless($1 eq "\N{ZERO WIDTH SPACE}") {
106 120         254 my $match = $1;
107 120         3788 push @matches, { text => $match, is_lao => scalar($match =~ /^$syl_re/) };
108             }
109             }
110             return @matches
111 53         262 }
112              
113             1;