|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Text::Util::Chinese;  | 
| 
2
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
10087
 | 
 use strict;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
41
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
167
 | 
    | 
| 
3
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
72
 | 
 use warnings;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
143
 | 
    | 
| 
4
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
31
 | 
 use utf8;  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
6
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
206
 | 
 use Exporter 5.57 'import';  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
172
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
269
 | 
    | 
| 
7
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
6003
 | 
 use Unicode::UCD qw(charscript);  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
321356
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
801
 | 
    | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION = '0.06';  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @EXPORT_OK = qw(sentence_iterator phrase_iterator extract_presuf extract_words tokenize_by_script);  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
12
 | 
6
 | 
 
 | 
 
 | 
  
6
  
 | 
 
 | 
58
 | 
 use List::Util qw(uniq pairmap);  | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
    | 
| 
 
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1599
 | 
    | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub grep_iterator {  | 
| 
15
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
0
  
 | 
109
 | 
     my ($iter, $cb) = @_;  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return sub {  | 
| 
17
 | 
14
 | 
 
 | 
 
 | 
  
14
  
 | 
 
 | 
40
 | 
         local $_;  | 
| 
18
 | 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
18
 | 
         do {  | 
| 
19
 | 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
55
 | 
             $_ = $iter->();  | 
| 
20
 | 
19
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
70
 | 
             return undef unless defined($_);  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } while (! $cb->());  | 
| 
22
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
42
 | 
         return $_;  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
24
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
 }  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub phrase_iterator {  | 
| 
27
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
1
  
 | 
139
 | 
     my ($input_iter, $opts) = @_;  | 
| 
28
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     my @phrases;  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return sub {  | 
| 
30
 | 
1403
 | 
 
 | 
  
100
  
 | 
  
1403
  
 | 
 
 | 
343352
 | 
         while(! @phrases && defined(my $text = $input_iter->())) {  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             @phrases = grep {  | 
| 
32
 | 
692
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
22727
 | 
                 (! /\A\s+\z/) && (! /\p{General_Category=Punctuation}/) && /\p{Han}/  | 
| 
 
 | 
3085
 | 
 
 | 
 
 | 
 
 | 
 
 | 
18748
 | 
    | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             } split / ( \r?\n | \p{General_Category: Other_Punctuation} )+ /x, $text;  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
35
 | 
1403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3186
 | 
         return shift @phrases;  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
37
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
 }  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub sentence_iterator {  | 
| 
40
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
  
0
  
 | 
130488
 | 
     my ($input_iter, $opts) = @_;  | 
| 
41
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     my @sentences;  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return sub {  | 
| 
43
 | 
773
 | 
 
 | 
  
100
  
 | 
  
773
  
 | 
 
 | 
187122
 | 
         while(! @sentences && defined(my $text = $input_iter->())) {  | 
| 
44
 | 
692
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12164
 | 
             @sentences = grep { !/\A\s+\z/ } ($text =~  | 
| 
 
 | 
771
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5464
 | 
    | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                           m/(  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                (?:  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                    [^\p{General_Category: Open_Punctuation}\p{General_Category: Close_Punctuation}]+?  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                | .*? \p{General_Category: Open_Punctuation} .*? \p{General_Category: Close_Punctuation} .*?  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                )  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                (?: \z | [\n\?\!。?!]+ )  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                            )/gx);  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
53
 | 
773
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1813
 | 
         return shift @sentences;  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }      | 
| 
55
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
 }  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub extract_presuf {  | 
| 
58
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
110
 | 
     my ($input_iter, $output_cb, $opts) = @_;  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
60
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     my %stats;  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my %extracted;  | 
| 
62
 | 
1
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
6
 | 
     my $threshold = $opts->{threshold} || 9; # an arbitrary choice.  | 
| 
63
 | 
1
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
11
 | 
     my $lengths   = $opts->{lengths} || [2,3];  | 
| 
64
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
     my $text;  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $phrase_iter = grep_iterator(  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         phrase_iterator( $input_iter ),  | 
| 
68
 | 
7
 | 
 
 | 
 
 | 
  
7
  
 | 
 
 | 
39
 | 
         sub { /\A\p{Han}+\z/ }  | 
| 
69
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     );  | 
| 
70
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     while (my $phrase = $phrase_iter->()) {  | 
| 
71
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
         for my $len ( @$lengths ) {  | 
| 
72
 | 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
34
 | 
             my $re = '\p{Han}{' . $len . '}';  | 
| 
73
 | 
14
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
202
 | 
             next unless length($phrase) >= $len * 2 && $phrase =~ /\A($re) .* ($re)\z/x;  | 
| 
74
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1997
 | 
             my ($prefix, $suffix) = ($1, $2);  | 
| 
75
 | 
12
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
48
 | 
             $stats{prefix}{$prefix}++ unless $extracted{$prefix};  | 
| 
76
 | 
12
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
36
 | 
             $stats{suffix}{$suffix}++ unless $extracted{$suffix};  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
78
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
             for my $x ($prefix, $suffix) {  | 
| 
79
 | 
24
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
147
 | 
                 if (! $extracted{$x}  | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     && $stats{prefix}{$x}  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     && $stats{suffix}{$x}  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     && $stats{prefix}{$x} > $threshold  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     && $stats{suffix}{$x} > $threshold  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 ) {  | 
| 
85
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
                     $extracted{$x} = 1;  | 
| 
86
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
                     delete $stats{prefix}{$x};  | 
| 
87
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
                     delete $stats{suffix}{$x};  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
89
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
                     $output_cb->($x, \%extracted);  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
96
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
     return \%extracted;  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub extract_words {  | 
| 
100
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
100
 | 
     my ($input_iter) = @_;  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
102
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     my (%lcontext, %rcontext);  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
104
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     while( my $txt = $input_iter->() ) {  | 
| 
105
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
110
 | 
         my @phrase = split /\P{Letter}/, $txt;  | 
| 
106
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
17
 | 
         for (@phrase) {  | 
| 
107
 | 
9
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
50
 | 
             next unless /\A\p{Han}+\z/;  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
109
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30
 | 
             my @c = split("", $_);  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
111
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
             for my $i (0..$#c) {  | 
| 
112
 | 
67
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
119
 | 
                 if ($i > 0) {  | 
| 
113
 | 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
144
 | 
                     $lcontext{$c[$i]}{$c[$i-1]}++;  | 
| 
114
 | 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
90
 | 
                     for my $n (2,3) {  | 
| 
115
 | 
116
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
206
 | 
                         if ($i >= $n) {  | 
| 
116
 | 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
205
 | 
                             my $tok = join('', @c[ ($i-$n+1) .. $i] );  | 
| 
117
 | 
89
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
185
 | 
                             if (length($tok) > 1) {  | 
| 
118
 | 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
237
 | 
                                 $lcontext{ $tok }{$c[$i - $n]}++;  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             }  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         }  | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     }  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
123
 | 
67
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
152
 | 
                 if ($i < $#c) {  | 
| 
124
 | 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
158
 | 
                     $rcontext{$c[$i]}{$c[$i+1]}++;  | 
| 
125
 | 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
87
 | 
                     for my $n (2,3) {  | 
| 
126
 | 
116
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
271
 | 
                         if ($i + $n <= $#c) {  | 
| 
127
 | 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
177
 | 
                             my $tok = join('', @c[$i .. ($i+$n-1)]);  | 
| 
128
 | 
89
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
181
 | 
                             if (length($tok) > 1) {  | 
| 
129
 | 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
330
 | 
                                 $rcontext{ $tok }{ $c[$i+$n] }++;  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             }  | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         }  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     }  | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
138
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
80
 | 
     my @tokens = uniq((keys %lcontext), (keys %rcontext));  | 
| 
139
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     my @words;  | 
| 
140
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     my $threshold = 5;  | 
| 
141
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     for my $x (@tokens) {  | 
| 
142
 | 
148
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
180
 | 
         next unless ($threshold <= (keys %{$lcontext{$x}}) && $threshold <= (keys %{$rcontext{$x}}));  | 
| 
 
 | 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
354
 | 
    | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
    | 
| 
143
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
         push @words, $x;  | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
146
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
62
 | 
     return \@words;  | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub tokenize_by_script {  | 
| 
150
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
93
 | 
     my ($str) = @_;  | 
| 
151
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     my @tokens;  | 
| 
152
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
     my @chars = grep { defined($_) } split "", $str;  | 
| 
 
 | 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
68
 | 
    | 
| 
153
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     return () unless @chars;  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
155
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     my $t = shift(@chars);  | 
| 
156
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     my $s = charscript(ord($t));  | 
| 
157
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15241
 | 
     while(my $char = shift @chars) {  | 
| 
158
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
82
 | 
         my $_s = charscript(ord($char));  | 
| 
159
 | 
40
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2681
 | 
         if ($_s eq $s) {  | 
| 
160
 | 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
70
 | 
             $t .= $char;  | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         else {  | 
| 
163
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22
 | 
             push @tokens, $t;  | 
| 
164
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15
 | 
             $s = $_s;  | 
| 
165
 | 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
             $t = $char;  | 
| 
166
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
167
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
168
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     push @tokens, $t;  | 
| 
169
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     return grep { ! /\A\s*\z/u } @tokens;  | 
| 
 
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
55
 | 
    | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |