File Coverage

blib/lib/String/Tagged.pm
Criterion Covered Total %
statement 509 542 93.9
branch 215 244 88.1
condition 70 78 89.7
subroutine 54 55 98.1
pod 33 33 100.0
total 881 952 92.5


line stmt bran cond sub pod time code
1             # You may distribute under the terms of either the GNU General Public License
2             # or the Artistic License (the same terms as Perl itself)
3             #
4             # (C) Paul Evans, 2008-2023 -- leonerd@leonerd.org.uk
5              
6             package String::Tagged 0.20;
7              
8 20     20   4465627 use v5.14;
  20         180  
9 20     20   116 use warnings;
  20         36  
  20         599  
10              
11 20     20   110 use Scalar::Util qw( blessed );
  20         44  
  20         1106  
12              
13             require String::Tagged::Extent;
14              
15 20     20   140 use constant FLAG_ANCHOR_BEFORE => 0x01;
  20         39  
  20         1664  
16 20     20   159 use constant FLAG_ANCHOR_AFTER => 0x02;
  20         43  
  20         1036  
17              
18 20     20   120 use constant DEBUG => 0;
  20         45  
  20         1018  
19              
20             # Since we're providing overloading, we should set fallback by default
21 20     20   116 use overload fallback => 1;
  20         51  
  20         159  
22              
23             =head1 NAME
24              
25             C - string buffers with value tags on extents
26              
27             =head1 SYNOPSIS
28              
29             use String::Tagged;
30              
31             my $st = String::Tagged->new( "An important message" );
32              
33             $st->apply_tag( 3, 9, bold => 1 );
34              
35             $st->iter_substr_nooverlap(
36             sub {
37             my ( $substring, %tags ) = @_;
38              
39             print $tags{bold} ? "$substring"
40             : $substring;
41             }
42             );
43              
44             =head1 DESCRIPTION
45              
46             This module implements an object class, instances of which store a (mutable)
47             string buffer that supports tags. A tag is a name/value pair that applies to
48             some extent of the underlying string.
49              
50             The types of tag names ought to be strings, or at least values that are
51             well-behaved as strings, as the names will often be used as the keys in hashes
52             or applied to the C operator.
53              
54             The types of tag values are not restricted - any scalar will do. This could be
55             a simple integer or string, ARRAY or HASH reference, or even a CODE reference
56             containing an event handler of some kind.
57              
58             Tags may be arbitrarily overlapped. Any given offset within the string has in
59             effect, a set of uniquely named tags. Tags of different names are independent.
60             For tags of the same name, only the latest, shortest tag takes effect.
61              
62             For example, consider a string with three tags represented here:
63              
64             Here is my string with tags
65             [-------------------------] foo => 1
66             [-------] foo => 2
67             [---] bar => 3
68              
69             Every character in this string has a tag named C. The value of this tag
70             is 2 for the words C and C and the space inbetween, and 1
71             elsewhere. Additionally, the words C and C and the space between them
72             also have the tag C with a value 3.
73              
74             Since C does not understand the significance of the tag values
75             it therefore cannot detect if two neighbouring tags really contain the same
76             semantic idea. Consider the following string:
77              
78             A string with words
79             [-------] type => "message"
80             [--------] type => "message"
81              
82             This string contains two tags. C will treat this as two
83             different tag values as far as C is concerned, even
84             though C yields the same value for the C tag at any position
85             in the string. The C method may be used to merge tag extents of
86             tags that should be considered as equal.
87              
88             =head1 NAMING
89              
90             I spent a lot of time considering the name for this module. It seems that a
91             number of people across a number of languages all created similar
92             functionality, though named very differently. For the benefit of
93             keyword-based search tools and similar, here's a list of some other names this
94             sort of object might be known by:
95              
96             =over 4
97              
98             =item *
99              
100             Extents
101              
102             =item *
103              
104             Overlays
105              
106             =item *
107              
108             Attribute or attributed strings
109              
110             =item *
111              
112             Markup
113              
114             =item *
115              
116             Out-of-band data
117              
118             =back
119              
120             =cut
121              
122             *is_string_tagged =
123             # It would be nice if we could #ifdef HAVE_PERL_VERSION(...)
124             ( $] >= 5.034 ) ?
125             do { eval 'use experimental "isa"; sub { $_[0] isa __PACKAGE__ }' // die $@ } :
126 271 100   271   1446 do { sub { blessed $_[0] and $_[0]->isa( __PACKAGE__ ) } };
127              
128             =head1 CONSTRUCTOR
129              
130             =cut
131              
132             =head2 new
133              
134             $st = String::Tagged->new( $str )
135              
136             Returns a new instance of a C object. It will contain no tags.
137             If the optional C<$str> argument is supplied, the string buffer will be
138             initialised from this value.
139              
140             If C<$str> is a C object then it will be cloned, as if calling
141             the C method on it.
142              
143             =cut
144              
145             sub new
146             {
147 130     130 1 24969 my $class = shift;
148 130         278 my ( $str ) = @_;
149              
150 130 100       286 return $class->clone( $str ) if is_string_tagged( $str );
151              
152 110 100       280 $str = "" unless defined $str;
153              
154 110         544 return bless {
155             str => "$str",
156             tags => [],
157             }, $class;
158             }
159              
160             =head2 new_tagged
161              
162             $st = String::Tagged->new_tagged( $str, %tags )
163              
164             Shortcut for creating a new C object with the given tags
165             applied to the entire length. The tags will not be anchored at either end.
166              
167             =cut
168              
169             sub new_tagged
170             {
171 8     8 1 1407 my $class = shift;
172 8         29 my ( $str, %tags ) = @_;
173              
174 8         27 my $self = $class->new( $str );
175              
176 8         22 my $length = $self->length;
177 8         43 $self->apply_tag( 0, $length, $_ => $tags{$_} ) for keys %tags;
178              
179 8         32 return $self;
180             }
181              
182             =head2 clone (class)
183              
184             $new = String::Tagged->clone( $orig, %opts )
185              
186             Returns a new instance of C made by cloning the original,
187             subject to the options provided. The returned instance will be in the
188             requested class, which need not match the class of the original.
189              
190             The following options are recognised:
191              
192             =over 4
193              
194             =item only_tags => ARRAY
195              
196             If present, gives an ARRAY reference containing tag names. Only those tags
197             named here will be copied; others will be ignored.
198              
199             =item except_tags => ARRAY
200              
201             If present, gives an ARRAY reference containing tag names. All tags will be
202             copied except those named here.
203              
204             =item convert_tags => HASH
205              
206             If present, gives a HASH reference containing tag conversion functions. For
207             any tags in the original to be copied whose names appear in the hash, the
208             name and value are passed into the corresponding function, which should return
209             an even-sized key/value list giving a tag, or a list of tags, to apply to the
210             new clone.
211              
212             my @new_tags = $convert_tags->{$orig_name}->( $orig_name, $orig_value )
213             # Where @new_tags is ( $new_name, $new_value, $new_name_2, $new_value_2, ... )
214              
215             As a further convenience, if the value for a given tag name is a plain string
216             instead of a code reference, it gives the new name for the tag, and will be
217             applied with its existing value.
218              
219             If C is being used too, then the source names of any tags to be
220             converted must also be listed there, or they will not be copied.
221              
222             =back
223              
224             =head2 clone (instance)
225              
226             $new = $orig->clone( %args )
227              
228             Called as an instance (rather than a class) method, the newly-cloned instance
229             is returned in the same class as the original.
230              
231             =cut
232              
233             sub clone
234             {
235 26 100   26 1 2298 my ( $class, $orig ) = blessed $_[0] ?
236             ( ref $_[0], shift ) :
237             ( shift, shift );
238 26         60 my %opts = @_;
239              
240             my $only = exists $opts{only_tags} ?
241 26 100       73 { map { $_ => 1 } @{ $opts{only_tags} } } :
  1         4  
  1         3  
242             undef;
243              
244             my $except = exists $opts{except_tags} ?
245 26 50       67 { map { $_ => 1 } @{ $opts{except_tags} } } :
  0         0  
  0         0  
246             undef;
247              
248 26         56 my $convert = $opts{convert_tags};
249              
250 26         93 my $new = $class->new( $orig->str );
251              
252             $orig->iter_extents( sub {
253 26     26   68 my ( $e, $tn, $tv ) = @_;
254              
255 26 100 100     77 return if $only and not $only->{$tn};
256 25 0 33     51 return if $except and $except->{$tn};
257              
258 25         60 my @tags;
259 25 100 66     72 if( $convert and my $c = $convert->{$tn} ) {
260 2 100       5 if( ref $c eq "CODE" ) {
261 1         2 @tags = $c->( $tn, $tv );
262             }
263             else {
264 1         2 @tags = ( $c, $tv );
265             }
266             }
267             else {
268 23         76 @tags = ( $tn, $tv );
269             }
270              
271 25         101 while( @tags ) {
272 25         71 $new->apply_tag( $e, shift @tags, shift @tags );
273             }
274 26         216 });
275              
276 26         263 return $new;
277             }
278              
279             sub _mkextent
280             {
281 209     209   307 my $self = shift;
282 209         371 my ( $start, $end, $flags ) = @_;
283              
284 209         650 $flags &= (FLAG_ANCHOR_BEFORE|FLAG_ANCHOR_AFTER);
285              
286 209         920 return bless [ $self, $start, $end, $flags ], 'String::Tagged::Extent';
287             }
288              
289             =head2 from_sprintf
290              
291             $str = String::Tagged->from_sprintf( $format, @args )
292              
293             I
294              
295             Returns a new instance of a C object, initialised by
296             formatting the supplied arguments using the supplied format.
297              
298             The C<$format> string is similar to that supported by the core C
299             operator, though a few features such as out-of-order argument indexing and
300             vector formatting are missing. This format string may be a plain perl string,
301             or an instance of C. In the latter case, any tags within it
302             are preserved in the result.
303              
304             In the case of a C<%s> conversion, the value of the argument consumed may
305             itself be a C instance. In this case it will be appended to
306             the returned object, preserving any tags within it.
307              
308             All other conversions are handled individually by the core C
309             operator and appended to the result.
310              
311             =cut
312              
313             sub from_sprintf
314             {
315 12     12 1 768 my $class = shift;
316 12         28 my ( $format, @args ) = @_;
317              
318             # Clone the format string into the candidate return value, and then
319             # repeatedly replace %... expansions with their required value using
320             # ->set_substr, so that embedded tags in the format will behave sensibly.
321              
322 12 100       28 my $ret = ( is_string_tagged( $format ) ) ?
323             $class->clone( $format ) :
324             $class->new( $format );
325              
326 12         21 my $pos = 0;
327              
328 12         59 while( $pos < length $ret ) {
329 22         38 my $str = "$ret";
330 22         56 pos( $str ) = $pos;
331              
332 22         51 my $replacement;
333              
334 22 100       140 if( $str =~ m/\G[^%]+/gc ) {
    100          
    100          
    50          
    0          
335             # A literal span
336 9         22 $pos = $+[0];
337 9         25 next;
338             }
339             elsif( $str =~ m/\G%%/gc ) {
340             # A literal %% conversion
341 1         3 $replacement = "%";
342             }
343             elsif( $str =~ m/\G%([-]?)(\d+|\*)?(?:\.(\d+|\*))?s/gc ) {
344             # A string
345 10         39 my ( $flags, $width, $precision ) = ( $1, $2, $3 );
346 10 100 100     36 $width = shift @args if defined $width and $width eq "*";
347 10 100 100     31 $precision = shift @args if defined $precision and $precision eq "*";
348 10         19 my $arg = shift @args;
349              
350 10 50       20 defined $arg or do {
351 0         0 warnings::warnif( uninitialized => "Use of ininitialized value in String::Tagged->from_sprintf" );
352 0         0 $arg = "";
353             };
354              
355 10 100       23 if( defined $precision ) {
356 2 50       4 if( is_string_tagged( $arg ) ) {
357 0         0 $arg = $arg->substr( 0, $precision );
358             }
359             else {
360 2         5 $arg = substr $arg, 0, $precision;
361             }
362             }
363              
364 10         20 my $leftalign = $flags =~ m/-/;
365              
366 10 100       21 my $padding = defined $width ? $width - length $arg : 0;
367 10 100       21 $padding = 0 if $padding < 0;
368              
369 10         14 $replacement = "";
370              
371 10 100       24 $replacement .= " " x $padding if !$leftalign;
372              
373 10         21 $replacement .= $arg;
374              
375 10 100       26 $replacement .= " " x $padding if $leftalign;
376             }
377             elsif( $str =~ m/\G%(.*?)([cduoxefgXEGbBpaAiDUOF])/gc ) {
378             # Another conversion format
379 2         8 my ( $template, $flags ) = ( $2, $1 );
380 2         4 my $argc = 1;
381 2         5 $argc += ( () = $flags =~ m/\*/g );
382              
383 2         11 $replacement = sprintf "%$flags$template", @args[0..$argc-1];
384 2         5 splice @args, 0, $argc;
385             }
386             elsif( $str =~ m/\G%(.*?)([a-zA-Z])/gc ) {
387 0         0 warn "Unrecognised sprintf conversion %$2";
388             }
389             else {
390             # must be at EOF now
391 0         0 last;
392             }
393              
394 13         38 my $templatelen = $+[0] - $-[0];
395 13         53 $ret->set_substr( $-[0], $templatelen, $replacement );
396              
397 13         40 $pos += length( $replacement );
398             }
399              
400 12         45 return $ret;
401             }
402              
403             =head2 join
404              
405             $str = String::Tagged->join( $sep, @parts )
406              
407             I
408              
409             Returns a new instance of a C object, formed by concatenating
410             each of the component piece together, joined with the separator string.
411              
412             The result will be much like the core C function, except that it will
413             preserve tags in the resulting string.
414              
415             =cut
416              
417             sub join
418             {
419 1     1 1 2 my $class = shift;
420 1         4 my ( $sep, @parts ) = @_;
421              
422 1 50       3 is_string_tagged( $sep ) or
423             $sep = $class->new( $sep );
424              
425 1         3 my $ret = shift @parts;
426 1         4 $ret .= $sep . $_ for @parts;
427              
428 1         4 return $ret;
429             }
430              
431             =head1 METHODS
432              
433             =cut
434              
435             =head2 str
436              
437             $str = $st->str
438              
439             $str = "$st"
440              
441             Returns the plain string contained within the object.
442              
443             This method is also called for stringification; so the C
444             object can be used in a plain string interpolation such as
445              
446             my $message = String::Tagged->new( "Hello world" );
447             print "My message is $message\n";
448              
449             =cut
450              
451 20     20   28084 use overload '""' => 'str';
  20         46  
  20         113  
452              
453             sub str
454             {
455 195     195 1 19810 my $self = shift;
456 195         763 return $self->{str};
457             }
458              
459             =head2 length
460              
461             $len = $st->length
462              
463             $len = length( $st )
464              
465             Returns the length of the plain string. Because stringification works on this
466             object class, the normal core C function works correctly on it.
467              
468             =cut
469              
470             sub length
471             {
472 378     378 1 551 my $self = shift;
473 378         1099 return CORE::length $self->{str};
474             }
475              
476             =head2 substr
477              
478             $str = $st->substr( $start, $len )
479              
480             Returns a C instance representing a section from within the
481             given string, containing all the same tags at the same conceptual positions.
482              
483             =cut
484              
485             sub substr
486             {
487 26     26 1 54 my $self = shift;
488 26         46 my ( $start, $len ) = @_;
489              
490 26         45 my $end = $start + $len;
491              
492 26         107 my $ret = ( ref $self )->new( CORE::substr( $self->{str}, $start, $len ) );
493              
494 26         61 my $tags = $self->{tags};
495              
496 26         61 foreach my $t ( @$tags ) {
497 30         64 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
498              
499 30 100       60 next if $te < $start;
500 25 100       53 last if $ts >= $end;
501              
502 23         54 $_ -= $start for $ts, $te;
503 23 100       58 next if $te <= 0;
504              
505 22 100 100     92 $ts = -1 if $ts < 0 or $tf & FLAG_ANCHOR_BEFORE;
506 22 100 100     79 $te = -1 if $te > $end or $tf & FLAG_ANCHOR_AFTER;
507              
508 22 100       62 $ret->apply_tag( $ts, $te == -1 ? -1 : $te - $ts, $tn => $tv );
509             }
510              
511 26         97 return $ret;
512             }
513              
514             =head2 plain_substr
515              
516             $str = $st->plain_substr( $start, $len )
517              
518             Returns as a plain perl string, the substring at the given position. This will
519             be the same string data as returned by C, only as a plain string
520             without the tags
521              
522             =cut
523              
524             sub plain_substr
525             {
526 23     23 1 38 my $self = shift;
527 23         44 my ( $start, $len ) = @_;
528              
529 23         99 return CORE::substr( $self->{str}, $start, $len );
530             }
531              
532             sub _cmp_tags
533             {
534 138     138   246 my ( $as, $ae ) = @$a;
535 138         212 my ( $bs, $be ) = @$b;
536              
537             # Sort by start first; shortest first
538 138   100     494 return $as <=> $bs ||
539             $ae <=> $be;
540             }
541              
542             sub _assert_sorted
543             {
544 0     0   0 my $self = shift;
545              
546 0         0 my $tags = $self->{tags};
547             # If fewer than 2 tags, must be sorted
548 0 0       0 return if @$tags < 2;
549              
550 0         0 my $prev = $tags->[0];
551              
552 0         0 for( my $i = 1; $i < @$tags; $i++ ) {
553 0         0 my $here = $tags->[$i];
554 0         0 local ( $a, $b ) = ( $prev, $here );
555 0 0       0 if( _cmp_tags() <= 0 ) {
556 0         0 $prev = $here;
557 0         0 next;
558             }
559              
560 0         0 print STDERR "Tag order violation at i=$i\n";
561 0         0 print STDERR "[@{[ $i - 1 ]}] = [ $tags->[$i-1]->[0], $tags->[$i-1]->[1] ]\n";
  0         0  
562 0         0 print STDERR "[@{[ $i ]}] = [ $tags->[$i]->[0], $tags->[$i]->[1] ]\n";
  0         0  
563 0         0 die "Assert failure";
564             }
565             }
566              
567             sub _insert_tag
568             {
569 144     144   205 my $self = shift;
570 144         278 my ( $start, $end, $name, $value, $flags ) = @_;
571              
572 144         213 my $tags = $self->{tags};
573              
574 144         332 my $newtag = [ $start, $end, $name => $value, $flags ];
575              
576             # Specialcase - if there's no tags yet, just push it
577 144 100       336 if( @$tags == 0 ) {
578 74         141 push @$tags, $newtag;
579 74         145 return;
580             }
581              
582 70         127 local $a = $newtag;
583              
584             # Two more special cases - it's quite likely we're either inserting an
585             # 'everywhere' tag, or appending one to the end. Check the endpoints first
586 70         100 local $b;
587              
588 70         103 $b = $tags->[0];
589 70 100       138 if( _cmp_tags() <= 0 ) {
590 14         33 unshift @$tags, $newtag;
591 14         33 return;
592             }
593              
594 56         103 $b = $tags->[-1];
595 56 100       120 if( _cmp_tags() >= 0 ) {
596 54         97 push @$tags, $newtag;
597 54         133 return;
598             }
599              
600 2         8 my $range_start = 0;
601 2         8 my $range_end = $#$tags;
602              
603 2         4 my $inspos;
604              
605 2         9 while( $range_end > $range_start ) {
606 2         10 my $i = int( ( $range_start + $range_end ) / 2 );
607              
608 2         6 $b = $tags->[$i];
609 2         5 my $cmp = _cmp_tags;
610              
611 2 50       7 if( $cmp > 0 ) {
    0          
612 2         5 $range_start = $i + 1;
613             }
614             elsif( $cmp < 0 ) {
615 0         0 $range_end = $i; # open interval
616             }
617             else {
618 0         0 $inspos = $i;
619 0         0 last;
620             }
621              
622 2 50       9 if( $range_start == $range_end ) {
623 2         5 $inspos = $range_start;
624 2         4 last;
625             }
626             }
627              
628 2 50       6 $inspos = $range_end unless defined $inspos;
629              
630 2 50       7 $inspos = 0 if $inspos < 0;
631 2 50       9 $inspos = @$tags if $inspos > @$tags;
632              
633 2         6 splice @$tags, $inspos, 0, $newtag;
634              
635 2         6 $self->_assert_sorted if DEBUG;
636             }
637              
638             =head2 apply_tag
639              
640             $st->apply_tag( $start, $len, $name, $value )
641              
642             Apply the named tag value to the given extent. The tag will start on the
643             character at the C<$start> index, and continue for the next C<$len>
644             characters.
645              
646             If C<$start> is given as -1, the tag will be considered to start "before" the
647             actual string. If C<$len> is given as -1, the tag will be considered to
648             end "after" end of the actual string. These special limits are used by
649             C when deciding whether to move a tag boundary. The start of any
650             tag that starts "before" the string is never moved, even if more text is
651             inserted at the beginning. Similarly, a tag which ends "after" the end of the
652             string, will continue to the end even if more text is appended.
653              
654             This method returns the C<$st> object.
655              
656             $st->apply_tag( $e, $name, $value )
657              
658             Alternatively, an existing L object can be passed as
659             the first argument instead of two integers. The new tag will apply at the
660             given extent.
661              
662             =cut
663              
664             sub apply_tag
665             {
666 141     141 1 6784 my $self = shift;
667 141         215 my ( $start, $end );
668 141         188 my $flags = 0;
669              
670 141 100       401 if( blessed $_[0] ) {
671 25         48 my $e = shift;
672 25         78 $start = $e->start;
673 25         75 $end = $e->end;
674              
675 25 100       79 $flags |= FLAG_ANCHOR_BEFORE if $e->anchor_before;
676 25 100       70 $flags |= FLAG_ANCHOR_AFTER if $e->anchor_after;
677             }
678             else {
679 116         168 $start = shift;
680 116         164 my $len = shift;
681              
682 116         226 my $strlen = $self->length;
683              
684 116 100       313 if( $start < 0 ) {
685 25         40 $start = 0;
686 25         44 $flags |= FLAG_ANCHOR_BEFORE;
687             }
688              
689 116 100       231 if( $len == -1 ) {
690 30         58 $end = $strlen;
691 30         52 $flags |= FLAG_ANCHOR_AFTER;
692             }
693             else {
694 86         133 $end = $start + $len;
695 86 100       204 $end = $strlen if $end > $strlen;
696             }
697             }
698              
699 141         280 my ( $name, $value ) = @_;
700              
701 141         365 $self->_insert_tag( $start, $end, $name, $value, $flags );
702              
703 141         445 return $self;
704             }
705              
706             sub _remove_tag
707             {
708 4     4   7 my $self = shift;
709 4         6 my $keepends = shift;
710 4         8 my ( $start, $end );
711              
712 4 50       13 if( blessed $_[0] ) {
713 0         0 my $e = shift;
714 0         0 $start = $e->start;
715 0         0 $end = $e->end;
716             }
717             else {
718 4         7 $start = shift;
719 4         7 $end = $start + shift;
720             }
721              
722 4         9 my ( $name ) = @_;
723              
724 4         8 my $tags = $self->{tags};
725              
726 4         7 my $have_added = 0;
727              
728             # Can't foreach() because we modify $i
729 4         13 for( my $i = 0; $i < @$tags; $i++ ) {
730 8         11 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  8         19  
731              
732 8 100       18 next if $te <= $start;
733 7 100       18 last if $ts >= $end;
734              
735 4 50       9 next if $tn ne $name;
736              
737 4 100 100     18 if( $keepends and $end < $te ) {
738 2         7 $self->_insert_tag( $end, $te, $tn, $tv, $tf & ~FLAG_ANCHOR_BEFORE );
739 2         4 $have_added = 1;
740             }
741              
742 4         8 splice @$tags, $i, 1;
743              
744 4 100 100     18 if( $keepends and $ts < $start ) {
745 1         14 $self->_insert_tag( $ts, $start, $tn, $tv, $tf & ~FLAG_ANCHOR_AFTER );
746 1         3 $have_added = 1;
747             }
748             else {
749 3         8 $i--;
750             }
751             }
752              
753 4         6 if( DEBUG && $have_added ) {
754             $self->_assert_sorted;
755             }
756              
757 4         28 return $self;
758             }
759              
760             =head2 unapply_tag
761              
762             $st->unapply_tag( $start, $len, $name )
763              
764             Unapply the named tag value from the given extent. If the tag extends beyond
765             this extent, then any partial fragment of the tag will be left in the string.
766              
767             This method returns the C<$st> object.
768              
769             $st->unapply_tag( $e, $name )
770              
771             Alternatively, an existing L object can be passed as
772             the first argument instead of two integers.
773              
774             =cut
775              
776             sub unapply_tag
777             {
778 3     3 1 7 my $self = shift;
779 3         8 return $self->_remove_tag( 1, @_ );
780             }
781              
782             =head2 delete_tag
783              
784             $st->delete_tag( $start, $len, $name )
785              
786             Delete the named tag within the given extent. Entire tags are removed, even if
787             they extend beyond this extent.
788              
789             This method returns the C<$st> object.
790              
791             $st->delete_tag( $e, $name )
792              
793             Alternatively, an existing L object can be passed as
794             the first argument instead of two integers.
795              
796             =cut
797              
798             sub delete_tag
799             {
800 1     1 1 7510 my $self = shift;
801 1         5 return $self->_remove_tag( 0, @_ );
802             }
803              
804             =head2 merge_tags
805              
806             $st->merge_tags( $eqsub )
807              
808             Merge neighbouring or overlapping tags of the same name and equal values.
809              
810             For each pair of tags of the same name that apply on neighbouring or
811             overlapping extents, the C<$eqsub> callback is called, as
812              
813             $equal = $eqsub->( $name, $value_a, $value_b )
814              
815             If this function returns true then the tags are merged.
816              
817             The equallity test function is free to perform any comparison of the values
818             that may be relevant to the application; for example it may deeply compare
819             referred structures and check for equivalence in some application-defined
820             manner. In this case, the first tag of a pair is retained, the second is
821             deleted. This may be relevant if the tag value is a reference to some object.
822              
823             =cut
824              
825             sub merge_tags
826             {
827 8     8 1 8382 my $self = shift;
828 8         15 my ( $eqsub ) = @_;
829              
830 8         15 my $tags = $self->{tags};
831              
832             # Can't foreach() because we modify @$tags
833 8         25 OUTER: for( my $i = 0; $i < @$tags; $i++ ) {
834 13         20 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  13         30  
835              
836 13         40 for( my $j = $i+1; $j < @$tags; $j++ ) {
837 11         19 my ( $t2s, $t2e, $t2n, $t2v, $t2f ) = @{ $tags->[$j] };
  11         21  
838              
839 11 100       25 last if $t2s > $te;
840 10 50       22 next unless $t2s <= $te;
841 10 100       27 next unless $t2n eq $tn;
842              
843 7 100       17 last unless $eqsub->( $tn, $tv, $t2v );
844              
845             # Need to delete the tag at $j, extend the end of the tag at $i, and
846             # possibly move $i later
847 6         36 splice @$tags, $j, 1, ();
848 6         12 $j--;
849              
850 6         12 $te = $tags->[$i][1] = $t2e;
851              
852 6 100       14 $tags->[$i][4] |= FLAG_ANCHOR_AFTER if $t2f & FLAG_ANCHOR_AFTER;
853              
854 6         11 local $a = $tags->[$i];
855              
856 6 100 100     32 if( local $b = $tags->[$i+1] and _cmp_tags() > 0 ) {
857 1         5 my $newpos = $i+1;
858 1   33     6 while( local $b = $tags->[$newpos ] and _cmp_tags() <= 0 ) {
859 0         0 $newpos++;
860             }
861              
862 1         3 splice @$tags, $newpos, 0, splice @$tags, $i, 1, ();
863              
864 1         3 redo OUTER;
865             }
866             }
867             }
868             }
869              
870             =head2 iter_extents
871              
872             $st->iter_extents( $callback, %opts )
873              
874             Iterate the tags stored in the string. For each tag, the CODE reference in
875             C<$callback> is invoked once, being passed a L object
876             that represents the extent of the tag.
877              
878             $callback->( $extent, $tagname, $tagvalue )
879              
880             Options passed in C<%opts> may include:
881              
882             =over 4
883              
884             =item start => INT
885              
886             Start at the given position; defaults to 0.
887              
888             =item end => INT
889              
890             End after the given position; defaults to end of string. This option overrides
891             C.
892              
893             =item len => INT
894              
895             End after the given length beyond the start position; defaults to end of
896             string. This option only applies if C is not given.
897              
898             =item only => ARRAY
899              
900             Select only the tags named in the given ARRAY reference.
901              
902             =item except => ARRAY
903              
904             Select all the tags except those named in the given ARRAY reference.
905              
906             =back
907              
908             =cut
909              
910             sub iter_extents
911             {
912 76     76 1 166 my $self = shift;
913 76         150 my ( $callback, %opts ) = @_;
914              
915             my $start = exists $opts{start} ? $opts{start} :
916 76 100       183 0;
917              
918             my $end = exists $opts{end} ? $opts{end} :
919             exists $opts{len} ? $start + $opts{len} :
920 76 50       361 $self->length + 1; # so as to include zerolen at end
    100          
921              
922 76 100       175 my $only = exists $opts{only} ? { map { $_ => 1 } @{ $opts{only} } } :
  1         5  
  1         3  
923             undef;
924              
925 76 100       173 my $except = exists $opts{except} ? { map { $_ => 1 } @{ $opts{except} } } :
  1         4  
  1         4  
926             undef;
927              
928 76         146 my $tags = $self->{tags};
929              
930 76         178 foreach my $t ( @$tags ) {
931 97         288 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
932              
933 97 100       246 next if $te < $start;
934 96 100       205 last if $ts >= $end;
935              
936 95 100 100     239 next if $only and !$only->{$tn};
937 93 100 100     205 next if $except and $except->{$tn};
938              
939 92         207 $callback->( $self->_mkextent( $ts, $te, $tf ), $tn, $tv );
940             }
941             }
942              
943             =head2 iter_tags
944              
945             $st->iter_tags( $callback, %opts )
946              
947             Iterate the tags stored in the string. For each tag, the CODE reference in
948             C<$callback> is invoked once, being passed the start point and length of the
949             tag.
950              
951             $callback->( $start, $length, $tagname, $tagvalue )
952              
953             Options passed in C<%opts> are the same as for C.
954              
955             =cut
956              
957             sub iter_tags
958             {
959 25     25 1 11323 my $self = shift;
960 25         54 my ( $callback, %opts ) = @_;
961              
962             $self->iter_extents(
963             sub {
964 36     36   70 my ( $e, $tn, $tv ) = @_;
965 36         102 $callback->( $e->start, $e->length, $tn, $tv );
966             },
967 25         129 %opts
968             );
969             }
970              
971             =head2 iter_extents_nooverlap
972              
973             $st->iter_extents_nooverlap( $callback, %opts )
974              
975             Iterate non-overlapping extents of tags stored in the string. The CODE
976             reference in C<$callback> is invoked for each extent in the string where no
977             tags change. The entire set of tags active in that extent is given to the
978             callback. Because the extent covers possibly-multiple tags, it will not define
979             the C and C flags.
980              
981             $callback->( $extent, %tags )
982              
983             The callback will be invoked over the entire length of the string, including
984             any extents with no tags applied.
985              
986             Options may be passed in C<%opts> to control the range of the string iterated
987             over, in the same way as the C method.
988              
989             If the C or C filters are applied, then only the tags that
990             survive filtering will be present in the C<%tags> hash. Tags that are excluded
991             by the filtering will not be present, nor will their bounds be used to split
992             the string into extents.
993              
994             =cut
995              
996             sub iter_extents_nooverlap
997             {
998 34     34 1 66 my $self = shift;
999 34         68 my ( $callback, %opts ) = @_;
1000              
1001             my $start = exists $opts{start} ? $opts{start} :
1002 34 100       94 0;
1003              
1004             my $end = exists $opts{end} ? $opts{end} :
1005             exists $opts{len} ? $start + $opts{len} :
1006 34 50       117 $self->length;
    100          
1007              
1008 34 100       85 my $only = exists $opts{only} ? { map { $_ => 1 } @{ $opts{only} } } :
  1         7  
  1         23  
1009             undef;
1010              
1011 34 100       73 my $except = exists $opts{except} ? { map { $_ => 1 } @{ $opts{except} } } :
  1         4  
  1         3  
1012             undef;
1013              
1014 34         58 my $tags = $self->{tags};
1015              
1016 34         47 my @active; # ARRAY of [ $ts, $te, $tn, $tv ]
1017 34         55 my $pos = $start;
1018              
1019 34         78 foreach my $t ( @$tags ) {
1020 76         164 my ( $ts, $te, $tn, $tv ) = @$t;
1021              
1022 76 100       161 next if $te < $start;
1023 74 100       151 last if $ts > $end;
1024              
1025 72 100 100     163 next if $only and !$only->{$tn};
1026 70 100 100     142 next if $except and $except->{$tn};
1027              
1028 69         138 while( $pos < $ts ) {
1029 43         83 my %activetags;
1030             my %tagends;
1031 43         63 my $rangeend = $ts;
1032              
1033 43         82 foreach ( @active ) {
1034 42         79 my ( undef, $e, $n, $v ) = @$_;
1035              
1036 42 100       83 $e < $rangeend and $rangeend = $e;
1037 42 100 66     113 next if $tagends{$n} and $tagends{$n} < $e;
1038              
1039 41         72 $activetags{$n} = $v;
1040 41         85 $tagends{$n} = $e;
1041             }
1042              
1043 43         99 $callback->( $self->_mkextent( $pos, $rangeend, 0 ), %activetags );
1044              
1045 43         446 $pos = $rangeend;
1046 43         88 @active = grep { $_->[1] > $pos } @active;
  42         161  
1047             }
1048              
1049 69         205 push @active, [ $ts, $te, $tn, $tv ];
1050             }
1051              
1052 34         90 while( $pos < $end ) {
1053 53         93 my %activetags;
1054             my %tagends;
1055 53         69 my $rangeend = $end;
1056              
1057 53         89 foreach ( @active ) {
1058 72         131 my ( undef, $e, $n, $v ) = @$_;
1059              
1060 72 100       144 $e < $rangeend and $rangeend = $e;
1061 72 100 100     195 next if $tagends{$n} and $tagends{$n} < $e;
1062              
1063 71         124 $activetags{$n} = $v;
1064 71         151 $tagends{$n} = $e;
1065             }
1066              
1067 53         114 $callback->( $self->_mkextent( $pos, $rangeend, 0 ), %activetags );
1068              
1069 53         1497 $pos = $rangeend;
1070 53         101 @active = grep { $_->[1] > $pos } @active;
  72         254  
1071             }
1072              
1073             # We might have zero-length tags active at the very end of the range
1074 34 100       185 if( my @zerolen = grep { $_->[0] == $pos and $_->[1] == $pos } @active ) {
  3 100       24  
1075 1         2 my %activetags;
1076 1         3 foreach ( @active ) {
1077 1         2 my ( undef, undef, $n, $v ) = @$_;
1078              
1079 1         3 $activetags{$n} = $v;
1080             }
1081              
1082 1         4 $callback->( $self->_mkextent( $pos, $pos, 0 ), %activetags );
1083             }
1084             }
1085              
1086             =head2 iter_tags_nooverlap
1087              
1088             $st->iter_tags_nooverlap( $callback, %opts )
1089              
1090             Iterate extents of the string using C, but passing
1091             the start and length of each extent to the callback instead of the extent
1092             object.
1093              
1094             $callback->( $start, $length, %tags )
1095              
1096             Options may be passed in C<%opts> to control the range of the string iterated
1097             over, in the same way as the C method.
1098              
1099             =cut
1100              
1101             sub iter_tags_nooverlap
1102             {
1103 28     28 1 9051 my $self = shift;
1104 28         66 my ( $callback, %opts ) = @_;
1105              
1106             $self->iter_extents_nooverlap(
1107             sub {
1108 80     80   177 my ( $e, %tags ) = @_;
1109 80         204 $callback->( $e->start, $e->length, %tags );
1110             },
1111 28         188 %opts
1112             );
1113             }
1114              
1115             =head2 iter_substr_nooverlap
1116              
1117             $st->iter_substr_nooverlap( $callback, %opts )
1118              
1119             Iterate extents of the string using C, but passing the
1120             substring of data instead of the extent object.
1121              
1122             $callback->( $substr, %tags )
1123              
1124             Options may be passed in C<%opts> to control the range of the string iterated
1125             over, in the same way as the C method.
1126              
1127             =cut
1128              
1129             sub iter_substr_nooverlap
1130             {
1131 6     6 1 6363 my $self = shift;
1132 6         19 my ( $callback, %opts ) = @_;
1133              
1134             $self->iter_extents_nooverlap(
1135             sub {
1136 17     17   41 my ( $e, %tags ) = @_;
1137 17         47 $callback->( $e->plain_substr, %tags );
1138             },
1139 6         40 %opts,
1140             );
1141             }
1142              
1143             =head2 tagnames
1144              
1145             @names = $st->tagnames
1146              
1147             Returns the set of tag names used in the string, in no particular order.
1148              
1149             =cut
1150              
1151             sub tagnames
1152             {
1153 12     12 1 52 my $self = shift;
1154              
1155 12         85 my $tags = $self->{tags};
1156              
1157 12         22 my %tags;
1158 12         29 foreach my $t ( @$tags ) {
1159 14         37 $tags{$t->[2]}++;
1160             }
1161              
1162 12         95 keys %tags;
1163             }
1164              
1165             =head2 get_tags_at
1166              
1167             $tags = $st->get_tags_at( $pos )
1168              
1169             Returns a HASH reference of all the tag values active at the given position.
1170              
1171             =cut
1172              
1173             sub get_tags_at
1174             {
1175 11     11 1 1927 my $self = shift;
1176 11         21 my ( $pos ) = @_;
1177              
1178 11         20 my $tags = $self->{tags};
1179              
1180 11         18 my %tags;
1181              
1182             # TODO: turn this into a binary search
1183 11         22 foreach my $t ( @$tags ) {
1184 14         29 my ( $ts, $te, $tn, $tv ) = @$t;
1185              
1186 14 100       34 last if $ts > $pos;
1187 11 100       28 next if $te <= $pos;
1188              
1189 10         26 $tags{$tn} = $tv;
1190             }
1191              
1192 11         51 return \%tags;
1193             }
1194              
1195             =head2 get_tag_at
1196              
1197             $value = $st->get_tag_at( $pos, $name )
1198              
1199             Returns the value of the named tag at the given position, or C if the
1200             tag is not applied there.
1201              
1202             =cut
1203              
1204             sub get_tag_at
1205             {
1206 6     6 1 20 my $self = shift;
1207 6         29 my ( $pos, $name ) = @_;
1208              
1209 6         17 my $tags = $self->{tags};
1210              
1211 6         27 my $value;
1212              
1213 6         19 foreach my $t ( @$tags ) {
1214 15         32 my ( $ts, $te, $tn, $tv ) = @$t;
1215              
1216 15 100       60 last if $ts > $pos;
1217 11 100       26 next if $te <= $pos;
1218              
1219 8 100       40 $value = $tv if $tn eq $name;
1220             }
1221              
1222 6         26 return $value;
1223             }
1224              
1225             =head2 get_tag_extent
1226              
1227             $extent = $st->get_tag_extent( $pos, $name )
1228              
1229             If the named tag applies to the given position, returns a
1230             L object to represent the extent of the tag at that
1231             position. If it does not, C is returned. If an extent is returned it
1232             will define the C and C flags if appropriate.
1233              
1234             =cut
1235              
1236             sub get_tag_extent
1237             {
1238 10     10 1 28 my $self = shift;
1239 10         22 my ( $pos, $name ) = @_;
1240              
1241 10         22 my $tags = $self->{tags};
1242              
1243 10         16 my ( $start, $end, $flags );
1244              
1245 10         23 foreach my $t ( @$tags ) {
1246 14         45 my ( $ts, $te, $tn, undef, $tf ) = @$t;
1247              
1248 14 100       36 last if $ts > $pos;
1249 13 100       34 next if $te <= $pos;
1250              
1251 12 100       43 next unless $tn eq $name;
1252              
1253 9         16 $start = $ts;
1254 9         21 $end = $te;
1255 9         19 $flags = $tf;
1256             }
1257              
1258 10 100       29 if( defined $start ) {
1259 9         28 return $self->_mkextent( $start, $end, $flags );
1260             }
1261             else {
1262 1         4 return undef;
1263             }
1264             }
1265              
1266             =head2 get_tag_missing_extent
1267              
1268             $extent = $st->get_tag_missing_extent( $pos, $name )
1269              
1270             If the named tag does not apply at the given position, returns the extent of
1271             the string around that position that does not have the tag. If it does exist,
1272             C is returned. If an extent is returned it will not define the
1273             C and C flags, as these do not make sense for the
1274             range in which a tag is absent.
1275              
1276             =cut
1277              
1278             sub get_tag_missing_extent
1279             {
1280 3     3 1 486 my $self = shift;
1281 3         6 my ( $pos, $name ) = @_;
1282              
1283 3         6 my $tags = $self->{tags};
1284              
1285 3         6 my $start = 0;
1286              
1287 3         5 foreach my $t ( @$tags ) {
1288 6         13 my ( $ts, $te, $tn ) = @$t;
1289              
1290 6 100       15 next unless $tn eq $name;
1291              
1292 3 100 100     16 if( $ts <= $pos and $te > $pos ) {
1293 1         5 return undef;
1294             }
1295              
1296 2 100       6 if( $ts > $pos ) {
1297 1         3 return $self->_mkextent( $start, $ts, 0 );
1298             }
1299              
1300 1         3 $start = $te;
1301             }
1302              
1303 1         4 return $self->_mkextent( $start, $self->length, 0 );
1304             }
1305              
1306             =head2 set_substr
1307              
1308             $st->set_substr( $start, $len, $newstr )
1309              
1310             Modifies a extent of the underlying plain string to that given. The extents of
1311             tags in the string are adjusted to cope with the modified region, and the
1312             adjustment in length.
1313              
1314             Tags entirely before the replaced extent remain unchanged.
1315              
1316             Tags entirely within the replaced extent are deleted.
1317              
1318             Tags entirely after the replaced extent are moved by appropriate amount to
1319             ensure they still apply to the same characters as before.
1320              
1321             Tags that start before and end after the extent remain, and have their lengths
1322             suitably adjusted.
1323              
1324             Tags that span just the start or end of the extent, but not both, are
1325             truncated, so as to remove the part of the tag applied on the modified extent
1326             but preserving that applied outside.
1327              
1328             If C<$newstr> is a C object, then its tags will be applied to
1329             C<$st> as appropriate. Edge-anchored tags in C<$newstr> will not be extended
1330             through C<$st>, though they will apply as edge-anchored if they now sit at the
1331             edge of the new string.
1332              
1333             =cut
1334              
1335             sub set_substr
1336             {
1337 56     56 1 14706 my $self = shift;
1338 56         129 my ( $start, $len, $new ) = @_;
1339              
1340 56         168 my $limit = $self->length;
1341              
1342 56 50       202 $start = $limit if $start > $limit;
1343 56 50       142 $len = ( $limit - $start ) if $len > ( $limit - $start );
1344              
1345 56         234 CORE::substr( $self->{str}, $start, $len ) = $new;
1346              
1347 56         116 my $oldend = $start + $len;
1348 56         106 my $newend = $start + CORE::length( $new );
1349              
1350 56         97 my $delta = $newend - $oldend;
1351             # Positions after $oldend have now moved up $delta places
1352              
1353 56         98 my $tags = $self->{tags};
1354              
1355 56         99 my $i = 0;
1356              
1357 56         161 for( ; $i < @$tags; $i++ ) {
1358             # In this loop we'll handle tags that start before the deleted section
1359              
1360 42         91 my $t = $tags->[$i];
1361 42         103 my ( $ts, $te, undef, undef, $tf ) = @$t;
1362              
1363 42 100 100     174 last if $ts >= $start and not( $tf & FLAG_ANCHOR_BEFORE );
1364              
1365             # Two cases:
1366             # A: Tag spans entirely outside deleted section - stretch/compress it
1367             # We may have to collapse it to nothing, so delete it
1368             # B: Tag starts before but ends within deleted section - truncate it
1369             # Plus a case we don't care about
1370             # Tag starts and ends entirely before the deleted section - ignore it
1371              
1372 30 100 100     204 if( $te > $oldend or
    100 100        
1373             ( $te == $oldend and $tf & FLAG_ANCHOR_AFTER ) ) {
1374             # Case A
1375 16         38 $t->[1] += $delta;
1376              
1377 16 50       69 if( $t->[0] == $t->[1] ) {
1378 0         0 splice @$tags, $i, 1, ();
1379 0         0 $i--;
1380 0         0 next;
1381             }
1382             }
1383             elsif( $te > $start ) {
1384             # Case B
1385 1         4 $t->[1] = $start;
1386             }
1387             }
1388              
1389 56         156 for( ; $i < @$tags; $i++ ) {
1390 13         30 my $t = $tags->[$i];
1391 13         27 my ( $ts, $te ) = @$t;
1392              
1393             # In this loop we'll handle tags that start within the deleted section
1394 13 100       48 last if $ts >= $oldend;
1395              
1396             # Two cases
1397             # C: Tag contained entirely within deleted section - delete it
1398             # D: Tag starts within but ends after the deleted section - truncate it
1399              
1400 3 100       8 if( $te <= $oldend ) {
1401             # Case C
1402 2         4 splice @$tags, $i, 1;
1403 2         5 $i--;
1404 2         6 next;
1405             }
1406             else {
1407             # Case D
1408 1         3 $t->[0] = $newend;
1409 1         3 $t->[1] += $delta;
1410             }
1411             }
1412              
1413 56         166 for( ; $i < @$tags; $i++ ) {
1414 12         24 my $t = $tags->[$i];
1415 12         42 my ( $ts, $te, undef, undef, $tf ) = @$t;
1416              
1417             # In this loop we'll handle tags that start after the deleted section
1418              
1419             # One case
1420             # E: Tag starts and ends after the deleted section - move it
1421 12 100       37 $t->[0] += $delta unless $tf & FLAG_ANCHOR_BEFORE;
1422 12         23 $t->[1] += $delta;
1423              
1424             # If we've not moved the start (because it was FLAG_ANCHOR_BEFORE), we
1425             # might now have an ordering constraint violation. Better fix it.
1426 12         25 local $b = $t;
1427 12         53 foreach my $new_i ( reverse 0 .. $i-1 ) {
1428 7         16 local $a = $tags->[$new_i];
1429              
1430 7 100       18 last if _cmp_tags() <= 0;
1431              
1432 1         5 splice @$tags, $new_i, 0, splice @$tags, $i, 1, ();
1433              
1434 1         3 last;
1435             }
1436             }
1437              
1438 56 100       145 if( is_string_tagged( $new ) ) {
1439 21         64 my $atstart = $start == 0;
1440 21         62 my $atend = $newend == $self->length;
1441              
1442             $new->iter_extents( sub {
1443 21     21   50 my ( $e, $tn, $tv ) = @_;
1444 21 50 66     121 $self->apply_tag(
    100 100        
1445             ( $atstart && $e->anchor_before ) ? -1 : $start + $e->start,
1446             ( $atend && $e->anchor_after ) ? -1 : $e->length,
1447             $tn, $tv );
1448 21         190 } );
1449             }
1450              
1451 56         162 $self->_assert_sorted if DEBUG;
1452              
1453 56         193 return $self;
1454             }
1455              
1456             =head2 insert
1457              
1458             $st->insert( $start, $newstr )
1459              
1460             Insert the given string at the given position. A shortcut around
1461             C.
1462              
1463             If C<$newstr> is a C object, then its tags will be applied to
1464             C<$st> as appropriate. If C<$start> is 0, any before-anchored tags in will
1465             become before-anchored in C<$st>.
1466              
1467             =cut
1468              
1469             sub insert
1470             {
1471 15     15 1 7673 my $self = shift;
1472 15         82 my ( $at, $new ) = @_;
1473 15         63 $self->set_substr( $at, 0, $new );
1474             }
1475              
1476             =head2 append
1477              
1478             $st->append( $newstr )
1479              
1480             $st .= $newstr
1481              
1482             Append to the underlying plain string. A shortcut around C.
1483              
1484             If C<$newstr> is a C object, then its tags will be applied to
1485             C<$st> as appropriate. Any after-anchored tags in will become after-anchored
1486             in C<$st>.
1487              
1488             =cut
1489              
1490 20     20   73768 use overload '.=' => 'append';
  20         51  
  20         108  
1491              
1492             sub append
1493             {
1494 56     56 1 1420 my $self = shift;
1495 56         125 my ( $new ) = @_;
1496              
1497 56 100       117 return $self->set_substr( $self->length, 0, $new ) if is_string_tagged( $new );
1498              
1499             # Optimised version
1500 38         130 $self->{str} .= $new;
1501              
1502 38         93 my $newend = $self->length;
1503              
1504 38         162 my $tags = $self->{tags};
1505              
1506 38         82 my $i = 0;
1507              
1508             # Adjust boundaries of ANCHOR_AFTER tags
1509 38         121 for( ; $i < @$tags; $i++ ) {
1510 46         85 my $t = $tags->[$i];
1511 46 100       194 $t->[1] = $newend if $t->[4] & FLAG_ANCHOR_AFTER;
1512             }
1513              
1514 38         137 return $self;
1515             }
1516              
1517             =head2 append_tagged
1518              
1519             $st->append_tagged( $newstr, %tags )
1520              
1521             Append to the underlying plain string, and apply the given tags to the
1522             newly-inserted extent.
1523              
1524             Returns C<$st> itself so that the method may be easily chained.
1525              
1526             =cut
1527              
1528             sub append_tagged
1529             {
1530 10     10 1 23 my $self = shift;
1531 10         33 my ( $new, %tags ) = @_;
1532              
1533 10         26 my $start = $self->length;
1534 10         20 my $len = CORE::length( $new );
1535              
1536 10         28 $self->append( $new );
1537 10         52 $self->apply_tag( $start, $len, $_, $tags{$_} ) for keys %tags;
1538              
1539 10         39 return $self;
1540             }
1541              
1542             =head2 concat
1543              
1544             $ret = $st->concat( $other )
1545              
1546             $ret = $st . $other
1547              
1548             Returns a new C containing the two strings concatenated
1549             together, preserving any tags present. This method overloads normal string
1550             concatenation operator, so expressions involving C values
1551             retain their tags.
1552              
1553             This method or operator tries to respect subclassing; preferring to return a
1554             new object of a subclass if either argument or operand is a subclass of
1555             C. If they are both subclasses, it will prefer the type of the
1556             invocant or first operand.
1557              
1558             =cut
1559              
1560 20     20   6018 use overload '.' => 'concat';
  20         83  
  20         126  
1561              
1562             sub concat
1563             {
1564 17     17 1 7304 my $self = shift;
1565 17         50 my ( $other, $swap ) = @_;
1566              
1567             # Try to find the "higher" subclass
1568 17 100 100     95 my $class = ( ref $self eq __PACKAGE__ and is_string_tagged( $other ) )
1569             ? ref $other : ref $self;
1570              
1571 17         67 my $ret = $class->new( $self );
1572 17 100       103 return $ret->insert( 0, $other ) if $swap;
1573 6         19 return $ret->append( $other );
1574             }
1575              
1576             =head2 matches
1577              
1578             @subs = $st->matches( $regexp )
1579              
1580             Returns a list of substrings (as C instances) for every
1581             non-overlapping match of the given C<$regexp>.
1582              
1583             This could be used, for example, to build a formatted string from a formatted
1584             template containing variable expansions:
1585              
1586             my $template = ...
1587             my %vars = ...
1588              
1589             my $ret = String::Tagged->new;
1590             foreach my $m ( $template->matches( qr/\$\w+|[^$]+/ ) ) {
1591             if( $m =~ m/^\$(\w+)$/ ) {
1592             $ret->append_tagged( $vars{$1}, %{ $m->get_tags_at( 0 ) } );
1593             }
1594             else {
1595             $ret->append( $m );
1596             }
1597             }
1598              
1599             This iterates segments of the template containing variables expansions
1600             starting with a C<$> symbol, and replaces them with values from the C<%vars>
1601             hash, careful to preserve all the formatting tags from the original template
1602             string.
1603              
1604             =cut
1605              
1606             sub matches
1607             {
1608 1     1 1 8 my $self = shift;
1609 1         3 my ( $re ) = @_;
1610              
1611 1         3 my $plain = $self->str;
1612              
1613 1         3 my @ret;
1614 1         9 while( $plain =~ m/$re/g ) {
1615 5         23 push @ret, $self->substr( $-[0], $+[0] - $-[0] );
1616             }
1617              
1618 1         6 return @ret;
1619             }
1620              
1621             =head2 match_extents
1622              
1623             @extents = $st->match_extents( $regexp )
1624              
1625             I
1626              
1627             Returns a list of extent objects for every non-overlapping match of the given
1628             C<$regexp>. This is similar to L, except that the results are
1629             returned as extent objects instead of substrings, allowing access to the
1630             position information as well.
1631              
1632             If using the result of this method to find regions of a string to modify,
1633             remember that any length alterations will not update positions in later extent
1634             objects. However, since the extents are non-overlapping and in position order,
1635             this can be handled by iterating them in reverse order so that the
1636             modifications done first are later in the string.
1637              
1638             foreach my $e ( reverse $st->match_extents( $pattern ) ) {
1639             $st->set_substr( $e->start, $e->length, $replacement );
1640             }
1641              
1642             =cut
1643              
1644             sub match_extents
1645             {
1646 2     2 1 9581 my $self = shift;
1647 2         6 my ( $re ) = @_;
1648              
1649 2         4 my $plain = $self->str;
1650              
1651 2         5 my @ret;
1652 2         18 while( $plain =~ m/$re/g ) {
1653 9         25 push @ret, $self->_mkextent( $-[0], $+[0], 0 );
1654             }
1655              
1656 2         11 return @ret;
1657             }
1658              
1659             =head2 split
1660              
1661             @parts = $st->split( $regexp, $limit )
1662              
1663             Returns a list of substrings by applying the regexp to the string content;
1664             similar to the core perl C function. If C<$limit> is supplied, the
1665             method will stop at that number of elements, returning the entire remainder of
1666             the input string as the final element. If the C<$regexp> contains a capture
1667             group then the content of the first one will be added to the return list as
1668             well.
1669              
1670             =cut
1671              
1672             sub split
1673             {
1674 4     4 1 30 my $self = shift;
1675 4         8 my ( $re, $limit ) = @_;
1676              
1677 4         9 my $plain = $self->str;
1678              
1679 4         8 my $prev = 0;
1680 4         7 my @ret;
1681 4         26 while( $plain =~ m/$re/g ) {
1682 5         23 push @ret, $self->substr( $prev, $-[0]-$prev );
1683 5 100       22 push @ret, $self->substr( $-[1], $+[1]-$-[1] ) if @- > 1;
1684              
1685 5         13 $prev = $+[0];
1686              
1687 5 100 66     31 last if defined $limit and @ret == $limit-1;
1688             }
1689              
1690 4 100       11 if( CORE::length $plain > $prev ) {
1691 3         7 push @ret, $self->substr( $prev, CORE::length( $plain ) - $prev );
1692             }
1693              
1694 4         15 return @ret;
1695             }
1696              
1697             =head2 sprintf
1698              
1699             $ret = $st->sprintf( @args )
1700              
1701             I
1702              
1703             Returns a new string by using the given instance as the format string for a
1704             L constructor call. The returned instance will be of the same
1705             class as the invocant.
1706              
1707             =cut
1708              
1709             sub sprintf
1710             {
1711 1     1 1 2 my $self = shift;
1712              
1713 1         5 return ( ref $self )->from_sprintf( $self, @_ );
1714             }
1715              
1716             =head2 debug_sprintf
1717              
1718             $ret = $st->debug_sprintf
1719              
1720             Returns a representation of the string data and all the tags, suitable for
1721             debug printing or other similar use. This is a format such as is given in the
1722             DESCRIPTION section above.
1723              
1724             The output will consist of a number of lines, the first containing the plain
1725             underlying string, then one line per tag. The line shows the extent of the tag
1726             given by C<[---]> markers, or a C<|> in the special case of a tag covering
1727             only a single character. Special markings of C> and C> indicate
1728             tags which are "before" or "after" anchored.
1729              
1730             For example:
1731              
1732             Hello, world
1733             [---] word => 1
1734             <[----------]> everywhere => 1
1735             | space => 1
1736              
1737             =cut
1738              
1739             sub debug_sprintf
1740             {
1741 5     5 1 794 my $self = shift;
1742              
1743 5         14 my $str = $self->str;
1744 5         12 my $len = CORE::length( $str );
1745              
1746 5         8 my $maxnamelen = 0;
1747              
1748 5         21 my $ret = " " . ( $str =~ s/\n/./gr ) . "\n";
1749              
1750             $self->iter_tags( sub {
1751 7     7   14 my ( undef, undef, $name, undef ) = @_;
1752 7 100       34 CORE::length( $name ) > $maxnamelen and $maxnamelen = CORE::length( $name );
1753 5         35 } );
1754              
1755 5         32 foreach my $t ( @{ $self->{tags} } ) {
  5         15  
1756 7         17 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
1757              
1758 7 100       22 $ret .= ( $tf & FLAG_ANCHOR_BEFORE ) ? " <" : " ";
1759              
1760 7         16 $ret .= " " x $ts;
1761              
1762 7         11 my $tl = $te - $ts;
1763              
1764 7 100       19 if( $tl == 0 ) {
    100          
1765 1         6 $ret =~ s/ $/>
1766 1         3 $te++; # account for extra printed width
1767             }
1768             elsif( $tl == 1 ) {
1769 2         3 $ret .= "|";
1770             }
1771             else {
1772 4         11 $ret .= "[" . ( "-" x ( $tl - 2 ) ) . "]";
1773             }
1774              
1775 7         11 $ret .= " " x ( $len - $te );
1776              
1777 7 100       17 $ret .= ( $tf & FLAG_ANCHOR_AFTER ) ? "> " : " ";
1778              
1779 7         30 $ret .= CORE::sprintf "%-*s => %s\n", $maxnamelen, $tn, $tv;
1780             }
1781              
1782 5         37 return $ret;
1783             }
1784              
1785             =head1 TODO
1786              
1787             =over 4
1788              
1789             =item *
1790              
1791             There are likely variations on the rules for C that could equally
1792             apply to some uses of tagged strings. Consider whether the behaviour of
1793             modification is chosen per-method, per-tag, or per-string.
1794              
1795             =item *
1796              
1797             Consider how to implement a clone from one tag format to another which wants
1798             to merge multiple different source tags together into a single new one.
1799              
1800             =back
1801              
1802             =head1 AUTHOR
1803              
1804             Paul Evans
1805              
1806             =cut
1807              
1808             0x55AA;