File Coverage

blib/lib/Lingua/LO/Transform/Syllables.pm
Criterion Covered Total %
statement 45 45 100.0
branch 3 4 75.0
condition n/a
subroutine 14 14 100.0
pod 3 3 100.0
total 65 66 98.4


line stmt bran cond sub pod time code
1             package Lingua::LO::Transform::Syllables;
2 3     3   253874 use strict;
  3         4  
  3         81  
3 3     3   10 use warnings;
  3         2  
  3         72  
4 3     3   59 use 5.012000;
  3         8  
5 3     3   11 use utf8;
  3         4  
  3         16  
6 3     3   78 use feature 'unicode_strings';
  3         5  
  3         261  
7 3     3   521 use version 0.77; our $VERSION = version->declare('v0.0.1');
  3         1728  
  3         15  
8 3     3   227 use charnames qw/ :full lao /;
  3         3  
  3         16  
9 3     3   1850 use Carp;
  3         3  
  3         213  
10 3     3   1116 use Unicode::Normalize qw/ NFC /;
  3         312040  
  3         353  
11 3     3   1854 use Class::Accessor::Fast 'antlers';
  3         6586  
  3         23  
12 3     3   1078 use Lingua::LO::Transform::Data;
  3         5  
  3         724  
13              
14             =encoding UTF-8
15              
16             =head1 NAME
17              
18             Lingua::LO::Transform::Syllables - Segment Lao or mixed-script text into syllables.
19              
20             =head1 FUNCTION
21              
22             This implements a purely regular expression based algorithm to segment Lao text into syllables, based
23             on the one described in PHISSAMAY et al: I.
24              
25             =cut
26              
27             has text => (is => 'ro');
28              
29             my $syl_re = Lingua::LO::Transform::Data::get_sylre_basic;
30             my $complete_syl_re = Lingua::LO::Transform::Data::get_sylre_full;
31              
32             =head1 METHODS
33              
34             =head2 new
35              
36             C $text, ... )>
37              
38             The constructor takes hash-style named arguments. The only one defined so far
39             is C whose value is obviously the text to be segmented.
40              
41             Note that text is passed through L<"Unicode::Normalize"/NFC> first to obtain the Composed Normal Form. In pure Lao text, this affects only the decomposed form of LAO VOWEL SIGN AM that will be transformed from C,C to C.
42              
43             =cut
44              
45             sub new {
46 42     42 1 12122 my $class = shift;
47 42         92 my %opts = @_;
48 42 50       100 croak("`text' key missing or undefined") unless defined $opts{text};
49             return bless {
50 42         103 text => NFC( $opts{text} ),
51             }, $class;
52             }
53              
54             =head2 get_syllables
55              
56             C
57              
58             Returns a list of Lao syllables found in the text passed to the constructor. If
59             there are any blanks, non-Lao parts etc. mixed in, they will be silently
60             dropped.
61              
62             =cut
63              
64             sub get_syllables {
65 20     20 1 5147 return shift->text =~ m/($complete_syl_re)/og;
66             }
67              
68             =head2 get_fragments
69              
70             C
71              
72             Returns a complete segmentation of the text passed to the constructor as an
73             array of hashes. Each hash has two keys:
74              
75             =over 4
76              
77             =item C: the text of the respective fragment
78              
79             =item C: if true, the fragment is a single valid Lao syllable. If
80             false, it may be whitespace, non-Lao script, Lao characters that don't
81             constitute valid syllables - basically anything at all that's I a valid
82             syllable.
83              
84             =back
85              
86             =cut
87              
88             sub get_fragments {
89 21     21 1 2839 my $self = shift;
90 21         47 my $t = $self->text;
91 21         74 my @matches;
92 21         1611 while($t =~ /\G($complete_syl_re | .+?(?=$complete_syl_re|$) )/oxgcs) {
93 50 100       3047 unless($1 eq "\N{ZERO WIDTH SPACE}") {
94 48         54 my $match = $1;
95 48         1777 push @matches, { text => $match, is_lao => scalar($match =~ /^$syl_re/) };
96             }
97             }
98             return @matches
99 21         90 }
100              
101             1;