File Coverage

blib/lib/String/Tagged.pm
Criterion Covered Total %
statement 532 563 94.4
branch 224 254 88.1
condition 72 84 85.7
subroutine 54 55 98.1
pod 34 34 100.0
total 916 990 92.5


line stmt bran cond sub pod time code
1             # You may distribute under the terms of either the GNU General Public License
2             # or the Artistic License (the same terms as Perl itself)
3             #
4             # (C) Paul Evans, 2008-2023 -- leonerd@leonerd.org.uk
5              
6             package String::Tagged 0.21;
7              
8 20     20   4716982 use v5.14;
  20         219  
9 20     20   118 use warnings;
  20         65  
  20         585  
10              
11 20     20   113 use Scalar::Util qw( blessed );
  20         45  
  20         1417  
12              
13             require String::Tagged::Extent;
14              
15             use constant {
16 20         1917 FLAG_ANCHOR_BEFORE => 0x01,
17             FLAG_ANCHOR_AFTER => 0x02,
18             FLAG_ITERATING => 0x04,
19             FLAG_DELETED => 0x08,
20 20     20   136 };
  20         50  
21              
22 20     20   143 use constant DEBUG => 0;
  20         44  
  20         1156  
23              
24             # Since we're providing overloading, we should set fallback by default
25 20     20   135 use overload fallback => 1;
  20         37  
  20         189  
26              
27             =head1 NAME
28              
29             C - string buffers with value tags on extents
30              
31             =head1 SYNOPSIS
32              
33             use String::Tagged;
34              
35             my $st = String::Tagged->new( "An important message" );
36              
37             $st->apply_tag( 3, 9, bold => 1 );
38              
39             $st->iter_substr_nooverlap(
40             sub {
41             my ( $substring, %tags ) = @_;
42              
43             print $tags{bold} ? "$substring"
44             : $substring;
45             }
46             );
47              
48             =head1 DESCRIPTION
49              
50             This module implements an object class, instances of which store a (mutable)
51             string buffer that supports tags. A tag is a name/value pair that applies to
52             some extent of the underlying string.
53              
54             The types of tag names ought to be strings, or at least values that are
55             well-behaved as strings, as the names will often be used as the keys in hashes
56             or applied to the C operator.
57              
58             The types of tag values are not restricted - any scalar will do. This could be
59             a simple integer or string, ARRAY or HASH reference, or even a CODE reference
60             containing an event handler of some kind.
61              
62             Tags may be arbitrarily overlapped. Any given offset within the string has in
63             effect, a set of uniquely named tags. Tags of different names are independent.
64             For tags of the same name, only the latest, shortest tag takes effect.
65              
66             For example, consider a string with three tags represented here:
67              
68             Here is my string with tags
69             [-------------------------] foo => 1
70             [-------] foo => 2
71             [---] bar => 3
72              
73             Every character in this string has a tag named C. The value of this tag
74             is 2 for the words C and C and the space inbetween, and 1
75             elsewhere. Additionally, the words C and C and the space between them
76             also have the tag C with a value 3.
77              
78             Since C does not understand the significance of the tag values
79             it therefore cannot detect if two neighbouring tags really contain the same
80             semantic idea. Consider the following string:
81              
82             A string with words
83             [-------] type => "message"
84             [--------] type => "message"
85              
86             This string contains two tags. C will treat this as two
87             different tag values as far as C is concerned, even
88             though C yields the same value for the C tag at any position
89             in the string. The C method may be used to merge tag extents of
90             tags that should be considered as equal.
91              
92             =head1 NAMING
93              
94             I spent a lot of time considering the name for this module. It seems that a
95             number of people across a number of languages all created similar
96             functionality, though named very differently. For the benefit of
97             keyword-based search tools and similar, here's a list of some other names this
98             sort of object might be known by:
99              
100             =over 4
101              
102             =item *
103              
104             Extents
105              
106             =item *
107              
108             Overlays
109              
110             =item *
111              
112             Attribute or attributed strings
113              
114             =item *
115              
116             Markup
117              
118             =item *
119              
120             Out-of-band data
121              
122             =back
123              
124             =cut
125              
126             *is_string_tagged =
127             # It would be nice if we could #ifdef HAVE_PERL_VERSION(...)
128             ( $] >= 5.034 ) ?
129             do { eval 'use experimental "isa"; sub { $_[0] isa __PACKAGE__ }' // die $@ } :
130 281 100   281   1495 do { sub { blessed $_[0] and $_[0]->isa( __PACKAGE__ ) } };
131              
132             =head1 CONSTRUCTOR
133              
134             =cut
135              
136             =head2 new
137              
138             $st = String::Tagged->new( $str )
139              
140             Returns a new instance of a C object. It will contain no tags.
141             If the optional C<$str> argument is supplied, the string buffer will be
142             initialised from this value.
143              
144             If C<$str> is a C object then it will be cloned, as if calling
145             the C method on it.
146              
147             =cut
148              
149             sub new
150             {
151 132     132 1 23975 my $class = shift;
152 132         285 my ( $str ) = @_;
153              
154 132 100       290 return $class->clone( $str ) if is_string_tagged( $str );
155              
156 112 100       289 $str = "" unless defined $str;
157              
158 112         586 return bless {
159             str => "$str",
160             tags => [],
161             }, $class;
162             }
163              
164             =head2 new_tagged
165              
166             $st = String::Tagged->new_tagged( $str, %tags )
167              
168             Shortcut for creating a new C object with the given tags
169             applied to the entire length. The tags will not be anchored at either end.
170              
171             =cut
172              
173             sub new_tagged
174             {
175 8     8 1 1318 my $class = shift;
176 8         32 my ( $str, %tags ) = @_;
177              
178 8         25 my $self = $class->new( $str );
179              
180 8         28 my $length = $self->length;
181 8         44 $self->apply_tag( 0, $length, $_ => $tags{$_} ) for keys %tags;
182              
183 8         39 return $self;
184             }
185              
186             =head2 clone (class)
187              
188             $new = String::Tagged->clone( $orig, %opts )
189              
190             Returns a new instance of C made by cloning the original,
191             subject to the options provided. The returned instance will be in the
192             requested class, which need not match the class of the original.
193              
194             The following options are recognised:
195              
196             =over 4
197              
198             =item only_tags => ARRAY
199              
200             If present, gives an ARRAY reference containing tag names. Only those tags
201             named here will be copied; others will be ignored.
202              
203             =item except_tags => ARRAY
204              
205             If present, gives an ARRAY reference containing tag names. All tags will be
206             copied except those named here.
207              
208             =item convert_tags => HASH
209              
210             If present, gives a HASH reference containing tag conversion functions. For
211             any tags in the original to be copied whose names appear in the hash, the
212             name and value are passed into the corresponding function, which should return
213             an even-sized key/value list giving a tag, or a list of tags, to apply to the
214             new clone.
215              
216             my @new_tags = $convert_tags->{$orig_name}->( $orig_name, $orig_value )
217             # Where @new_tags is ( $new_name, $new_value, $new_name_2, $new_value_2, ... )
218              
219             As a further convenience, if the value for a given tag name is a plain string
220             instead of a code reference, it gives the new name for the tag, and will be
221             applied with its existing value.
222              
223             If C is being used too, then the source names of any tags to be
224             converted must also be listed there, or they will not be copied.
225              
226             =back
227              
228             =head2 clone (instance)
229              
230             $new = $orig->clone( %args )
231              
232             Called as an instance (rather than a class) method, the newly-cloned instance
233             is returned in the same class as the original.
234              
235             =cut
236              
237             sub clone
238             {
239 26 100   26 1 2419 my ( $class, $orig ) = blessed $_[0] ?
240             ( ref $_[0], shift ) :
241             ( shift, shift );
242 26         66 my %opts = @_;
243              
244             my $only = exists $opts{only_tags} ?
245 26 100       92 { map { $_ => 1 } @{ $opts{only_tags} } } :
  1         5  
  1         4  
246             undef;
247              
248             my $except = exists $opts{except_tags} ?
249 26 50       66 { map { $_ => 1 } @{ $opts{except_tags} } } :
  0         0  
  0         0  
250             undef;
251              
252 26         59 my $convert = $opts{convert_tags};
253              
254 26         78 my $new = $class->new( $orig->str );
255              
256             $orig->iter_extents( sub {
257 26     26   55 my ( $e, $tn, $tv ) = @_;
258              
259 26 100 100     105 return if $only and not $only->{$tn};
260 25 0 33     61 return if $except and $except->{$tn};
261              
262 25         67 my @tags;
263 25 100 66     85 if( $convert and my $c = $convert->{$tn} ) {
264 2 100       13 if( ref $c eq "CODE" ) {
265 1         4 @tags = $c->( $tn, $tv );
266             }
267             else {
268 1         3 @tags = ( $c, $tv );
269             }
270             }
271             else {
272 23         87 @tags = ( $tn, $tv );
273             }
274              
275 25         78 while( @tags ) {
276 25         87 $new->apply_tag( $e, shift @tags, shift @tags );
277             }
278 26         252 });
279              
280 26         258 return $new;
281             }
282              
283             sub _mkextent
284             {
285 214     214   313 my $self = shift;
286 214         382 my ( $start, $end, $flags ) = @_;
287              
288 214         333 $flags &= (FLAG_ANCHOR_BEFORE|FLAG_ANCHOR_AFTER);
289              
290 214         1000 return bless [ $self, $start, $end, $flags ], 'String::Tagged::Extent';
291             }
292              
293             =head2 from_sprintf
294              
295             $str = String::Tagged->from_sprintf( $format, @args )
296              
297             I
298              
299             Returns a new instance of a C object, initialised by
300             formatting the supplied arguments using the supplied format.
301              
302             The C<$format> string is similar to that supported by the core C
303             operator, though a few features such as out-of-order argument indexing and
304             vector formatting are missing. This format string may be a plain perl string,
305             or an instance of C. In the latter case, any tags within it
306             are preserved in the result.
307              
308             In the case of a C<%s> conversion, the value of the argument consumed may
309             itself be a C instance. In this case it will be appended to
310             the returned object, preserving any tags within it.
311              
312             All other conversions are handled individually by the core C
313             operator and appended to the result.
314              
315             =cut
316              
317             sub from_sprintf
318             {
319 12     12 1 741 my $class = shift;
320 12         32 my ( $format, @args ) = @_;
321              
322             # Clone the format string into the candidate return value, and then
323             # repeatedly replace %... expansions with their required value using
324             # ->set_substr, so that embedded tags in the format will behave sensibly.
325              
326 12 100       27 my $ret = ( is_string_tagged( $format ) ) ?
327             $class->clone( $format ) :
328             $class->new( $format );
329              
330 12         22 my $pos = 0;
331              
332 12         56 while( $pos < length $ret ) {
333 22         40 my $str = "$ret";
334 22         57 pos( $str ) = $pos;
335              
336 22         36 my $replacement;
337              
338 22 100       140 if( $str =~ m/\G[^%]+/gc ) {
    100          
    100          
    50          
    0          
339             # A literal span
340 9         22 $pos = $+[0];
341 9         22 next;
342             }
343             elsif( $str =~ m/\G%%/gc ) {
344             # A literal %% conversion
345 1         3 $replacement = "%";
346             }
347             elsif( $str =~ m/\G%([-]?)(\d+|\*)?(?:\.(\d+|\*))?s/gc ) {
348             # A string
349 10         35 my ( $flags, $width, $precision ) = ( $1, $2, $3 );
350 10 100 100     36 $width = shift @args if defined $width and $width eq "*";
351 10 100 100     38 $precision = shift @args if defined $precision and $precision eq "*";
352 10         19 my $arg = shift @args;
353              
354 10 50       21 defined $arg or do {
355 0         0 warnings::warnif( uninitialized => "Use of ininitialized value in String::Tagged->from_sprintf" );
356 0         0 $arg = "";
357             };
358              
359 10 100       20 if( defined $precision ) {
360 2 50       4 if( is_string_tagged( $arg ) ) {
361 0         0 $arg = $arg->substr( 0, $precision );
362             }
363             else {
364 2         7 $arg = substr $arg, 0, $precision;
365             }
366             }
367              
368 10         19 my $leftalign = $flags =~ m/-/;
369              
370 10 100       20 my $padding = defined $width ? $width - length $arg : 0;
371 10 100       21 $padding = 0 if $padding < 0;
372              
373 10         14 $replacement = "";
374              
375 10 100       31 $replacement .= " " x $padding if !$leftalign;
376              
377 10         17 $replacement .= $arg;
378              
379 10 100       22 $replacement .= " " x $padding if $leftalign;
380             }
381             elsif( $str =~ m/\G%(.*?)([cduoxefgXEGbBpaAiDUOF])/gc ) {
382             # Another conversion format
383 2         6 my ( $template, $flags ) = ( $2, $1 );
384 2         4 my $argc = 1;
385 2         4 $argc += ( () = $flags =~ m/\*/g );
386              
387 2         11 $replacement = sprintf "%$flags$template", @args[0..$argc-1];
388 2         5 splice @args, 0, $argc;
389             }
390             elsif( $str =~ m/\G%(.*?)([a-zA-Z])/gc ) {
391 0         0 warn "Unrecognised sprintf conversion %$2";
392             }
393             else {
394             # must be at EOF now
395 0         0 last;
396             }
397              
398 13         54 my $templatelen = $+[0] - $-[0];
399 13         50 $ret->set_substr( $-[0], $templatelen, $replacement );
400              
401 13         39 $pos += length( $replacement );
402             }
403              
404 12         46 return $ret;
405             }
406              
407             =head2 join
408              
409             $str = String::Tagged->join( $sep, @parts )
410              
411             I
412              
413             Returns a new instance of a C object, formed by concatenating
414             each of the component piece together, joined with the separator string.
415              
416             The result will be much like the core C function, except that it will
417             preserve tags in the resulting string.
418              
419             =cut
420              
421             sub join
422             {
423 1     1 1 2 my $class = shift;
424 1         3 my ( $sep, @parts ) = @_;
425              
426 1 50       4 is_string_tagged( $sep ) or
427             $sep = $class->new( $sep );
428              
429 1         2 my $ret = shift @parts;
430 1         6 $ret .= $sep . $_ for @parts;
431              
432 1         5 return $ret;
433             }
434              
435             =head1 METHODS
436              
437             =cut
438              
439             =head2 str
440              
441             $str = $st->str
442              
443             $str = "$st"
444              
445             Returns the plain string contained within the object.
446              
447             This method is also called for stringification; so the C
448             object can be used in a plain string interpolation such as
449              
450             my $message = String::Tagged->new( "Hello world" );
451             print "My message is $message\n";
452              
453             =cut
454              
455 20     20   31814 use overload '""' => 'str';
  20         49  
  20         111  
456              
457             sub str
458             {
459 195     195 1 20551 my $self = shift;
460 195         758 return $self->{str};
461             }
462              
463             =head2 length
464              
465             $len = $st->length
466              
467             $len = length( $st )
468              
469             Returns the length of the plain string. Because stringification works on this
470             object class, the normal core C function works correctly on it.
471              
472             =cut
473              
474             sub length
475             {
476 403     403 1 577 my $self = shift;
477 403         1199 return CORE::length $self->{str};
478             }
479              
480             =head2 substr
481              
482             $str = $st->substr( $start, $len )
483              
484             Returns a C instance representing a section from within the
485             given string, containing all the same tags at the same conceptual positions.
486              
487             =cut
488              
489             sub substr
490             {
491 26     26 1 76 my $self = shift;
492 26         53 my ( $start, $len ) = @_;
493              
494 26         44 my $end = $start + $len;
495              
496 26         107 my $ret = ( ref $self )->new( CORE::substr( $self->{str}, $start, $len ) );
497              
498 26         67 my $tags = $self->{tags};
499              
500 26         58 foreach my $t ( @$tags ) {
501 30         62 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
502              
503 30 100       71 next if $te < $start;
504 25 100       49 last if $ts >= $end;
505              
506 23         52 $_ -= $start for $ts, $te;
507 23 100       48 next if $te <= 0;
508              
509 22 100 100     86 $ts = -1 if $ts < 0 or $tf & FLAG_ANCHOR_BEFORE;
510 22 100 100     87 $te = -1 if $te > $end or $tf & FLAG_ANCHOR_AFTER;
511              
512 22 100       65 $ret->apply_tag( $ts, $te == -1 ? -1 : $te - $ts, $tn => $tv );
513             }
514              
515 26         93 return $ret;
516             }
517              
518             =head2 plain_substr
519              
520             $str = $st->plain_substr( $start, $len )
521              
522             Returns as a plain perl string, the substring at the given position. This will
523             be the same string data as returned by C, only as a plain string
524             without the tags
525              
526             =cut
527              
528             sub plain_substr
529             {
530 23     23 1 39 my $self = shift;
531 23         46 my ( $start, $len ) = @_;
532              
533 23         105 return CORE::substr( $self->{str}, $start, $len );
534             }
535              
536             sub _cmp_tags
537             {
538 150     150   261 my ( $as, $ae ) = @$a;
539 150         224 my ( $bs, $be ) = @$b;
540              
541             # Sort by start first; shortest first
542 150   100     578 return $as <=> $bs ||
543             $ae <=> $be;
544             }
545              
546             sub _assert_sorted
547             {
548 0     0   0 my $self = shift;
549              
550 0         0 my $tags = $self->{tags};
551             # If fewer than 2 tags, must be sorted
552 0 0       0 return if @$tags < 2;
553              
554 0         0 my $prev = $tags->[0];
555              
556 0         0 for( my $i = 1; $i < @$tags; $i++ ) {
557 0         0 my $here = $tags->[$i];
558 0         0 local ( $a, $b ) = ( $prev, $here );
559 0 0       0 if( _cmp_tags() <= 0 ) {
560 0         0 $prev = $here;
561 0         0 next;
562             }
563              
564 0         0 print STDERR "Tag order violation at i=$i\n";
565 0         0 print STDERR "[@{[ $i - 1 ]}] = [ $tags->[$i-1]->[0], $tags->[$i-1]->[1] ]\n";
  0         0  
566 0         0 print STDERR "[@{[ $i ]}] = [ $tags->[$i]->[0], $tags->[$i]->[1] ]\n";
  0         0  
567 0         0 die "Assert failure";
568             }
569             }
570              
571             sub _insert_tag
572             {
573 152     152   226 my $self = shift;
574 152         285 my ( $start, $end, $name, $value, $flags ) = @_;
575              
576 152         233 my $tags = $self->{tags};
577              
578 152         323 my $newtag = [ $start, $end, $name => $value, $flags ];
579              
580             # Specialcase - if there's no tags yet, just push it
581 152 100       357 if( @$tags == 0 ) {
582 76         141 push @$tags, $newtag;
583 76         151 return;
584             }
585              
586 76         129 local $a = $newtag;
587              
588             # Two more special cases - it's quite likely we're either inserting an
589             # 'everywhere' tag, or appending one to the end. Check the endpoints first
590 76         109 local $b;
591              
592 76         121 $b = $tags->[0];
593 76 100       136 if( _cmp_tags() <= 0 ) {
594 14         41 unshift @$tags, $newtag;
595 14         29 return;
596             }
597              
598 62         110 $b = $tags->[-1];
599 62 100       119 if( _cmp_tags() >= 0 ) {
600 60         103 push @$tags, $newtag;
601 60         124 return;
602             }
603              
604 2         5 my $range_start = 0;
605 2         7 my $range_end = $#$tags;
606              
607 2         5 my $inspos;
608              
609 2         8 while( $range_end > $range_start ) {
610 2         11 my $i = int( ( $range_start + $range_end ) / 2 );
611              
612 2         5 $b = $tags->[$i];
613 2         4 my $cmp = _cmp_tags;
614              
615 2 50       12 if( $cmp > 0 ) {
    0          
616 2         6 $range_start = $i + 1;
617             }
618             elsif( $cmp < 0 ) {
619 0         0 $range_end = $i; # open interval
620             }
621             else {
622 0         0 $inspos = $i;
623 0         0 last;
624             }
625              
626 2 50       8 if( $range_start == $range_end ) {
627 2         4 $inspos = $range_start;
628 2         4 last;
629             }
630             }
631              
632 2 50       9 $inspos = $range_end unless defined $inspos;
633              
634 2 50       6 $inspos = 0 if $inspos < 0;
635 2 50       8 $inspos = @$tags if $inspos > @$tags;
636              
637 2         9 splice @$tags, $inspos, 0, $newtag;
638              
639 2         5 $self->_assert_sorted if DEBUG;
640             }
641              
642             =head2 apply_tag
643              
644             $st->apply_tag( $start, $len, $name, $value )
645              
646             Apply the named tag value to the given extent. The tag will start on the
647             character at the C<$start> index, and continue for the next C<$len>
648             characters.
649              
650             If C<$start> is given as -1, the tag will be considered to start "before" the
651             actual string. If C<$len> is given as -1, the tag will be considered to
652             end "after" end of the actual string. These special limits are used by
653             C when deciding whether to move a tag boundary. The start of any
654             tag that starts "before" the string is never moved, even if more text is
655             inserted at the beginning. Similarly, a tag which ends "after" the end of the
656             string, will continue to the end even if more text is appended.
657              
658             This method returns the C<$st> object.
659              
660             $st->apply_tag( $e, $name, $value )
661              
662             Alternatively, an existing L object can be passed as
663             the first argument instead of two integers. The new tag will apply at the
664             given extent.
665              
666             =cut
667              
668             sub apply_tag
669             {
670 149     149 1 7049 my $self = shift;
671 149         221 my ( $start, $end );
672 149         216 my $flags = 0;
673              
674 149 100       418 if( blessed $_[0] ) {
675 25         42 my $e = shift;
676 25         108 $start = $e->start;
677 25         74 $end = $e->end;
678              
679 25 100       59 $flags |= FLAG_ANCHOR_BEFORE if $e->anchor_before;
680 25 100       70 $flags |= FLAG_ANCHOR_AFTER if $e->anchor_after;
681             }
682             else {
683 124         182 $start = shift;
684 124         173 my $len = shift;
685              
686 124         234 my $strlen = $self->length;
687              
688 124 100       291 if( $start < 0 ) {
689 25         36 $start = 0;
690 25         47 $flags |= FLAG_ANCHOR_BEFORE;
691             }
692              
693 124 100       253 if( $len == -1 ) {
694 30         46 $end = $strlen;
695 30         46 $flags |= FLAG_ANCHOR_AFTER;
696             }
697             else {
698 94         143 $end = $start + $len;
699 94 100       228 $end = $strlen if $end > $strlen;
700             }
701             }
702              
703 149         329 my ( $name, $value ) = @_;
704              
705 149         389 $self->_insert_tag( $start, $end, $name, $value, $flags );
706              
707 149         354 return $self;
708             }
709              
710             sub _remove_tag
711             {
712 9     9   14 my $self = shift;
713 9         13 my $keepends = shift;
714 9         11 my ( $start, $end );
715              
716 9 100       26 if( blessed $_[0] ) {
717 5         9 my $e = shift;
718 5         11 $start = $e->start;
719 5         10 $end = $e->end;
720             }
721             else {
722 4         6 $start = shift;
723 4         6 $end = $start + shift;
724             }
725              
726 9         17 my ( $name ) = @_;
727              
728 9 100       24 if( my $t = $self->{iterating} ) {
729 5         9 my ( $ts, $te, $tn ) = @$t;
730 5 50 33     25 if( $start == $ts and $end == $te and $name eq $tn ) {
      33        
731 5         8 $t->[4] |= FLAG_DELETED;
732 5         10 return;
733             }
734             }
735              
736 4         5 my $tags = $self->{tags};
737              
738 4         8 my $have_added = 0;
739              
740             # Can't foreach() because we modify $i
741 4         11 for( my $i = 0; $i < @$tags; $i++ ) {
742 8         10 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  8         18  
743              
744 8 100       18 next if $te <= $start;
745 7 100       13 last if $ts >= $end;
746              
747 4 50       9 next if $tn ne $name;
748              
749 4 100 100     18 if( $keepends and $end < $te ) {
750 2         6 $self->_insert_tag( $end, $te, $tn, $tv, $tf & ~(FLAG_ANCHOR_BEFORE|FLAG_ITERATING) );
751 2         4 $have_added = 1;
752             }
753              
754 4 50       9 if( $tf & FLAG_ITERATING ) {
755 0         0 die "ARGH encountered FLAG_ITERATING while walking the list of tags during ->_remove_tag";
756             }
757              
758 4         9 splice @$tags, $i, 1;
759              
760 4 100 100     25 if( $keepends and $ts < $start ) {
761 1         4 $self->_insert_tag( $ts, $start, $tn, $tv, $tf & ~(FLAG_ANCHOR_AFTER|FLAG_ITERATING) );
762 1         3 $have_added = 1;
763             }
764             else {
765 3         17 $i--;
766             }
767             }
768              
769 4         7 if( DEBUG && $have_added ) {
770             $self->_assert_sorted;
771             }
772              
773 4         14 return $self;
774             }
775              
776             =head2 unapply_tag
777              
778             $st->unapply_tag( $start, $len, $name )
779              
780             Unapply the named tag value from the given extent. If the tag extends beyond
781             this extent, then any partial fragment of the tag will be left in the string.
782              
783             This method returns the C<$st> object.
784              
785             $st->unapply_tag( $e, $name )
786              
787             Alternatively, an existing L object can be passed as
788             the first argument instead of two integers.
789              
790             =cut
791              
792             sub unapply_tag
793             {
794 3     3 1 6 my $self = shift;
795 3         9 return $self->_remove_tag( 1, @_ );
796             }
797              
798             =head2 delete_tag
799              
800             $st->delete_tag( $start, $len, $name )
801              
802             Delete the named tag within the given extent. Entire tags are removed, even if
803             they extend beyond this extent.
804              
805             This method returns the C<$st> object.
806              
807             $st->delete_tag( $e, $name )
808              
809             Alternatively, an existing L object can be passed as
810             the first argument instead of two integers.
811              
812             =cut
813              
814             sub delete_tag
815             {
816 6     6 1 7624 my $self = shift;
817 6         10 return $self->_remove_tag( 0, @_ );
818             }
819              
820             =head2 delete_all_tag
821              
822             $st->delete_all_tag( $name )
823              
824             I
825              
826             Deletes every tag with the given name. This is more efficient than calling
827             C to list the tags then C on each one individually
828             in the case of a simple name match.
829              
830             This method returns the C<$st> object.
831              
832             =cut
833              
834             sub delete_all_tag
835             {
836 1     1 1 8 my $self = shift;
837 1         5 my ( $name ) = @_;
838              
839 1         2 my $tags = $self->{tags};
840              
841 1         5 for( my $i = 0; $i < @$tags; $i++ ) {
842 3         16 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  3         12  
843              
844 3 100       10 next if $tn ne $name;
845              
846 2         4 splice @$tags, $i, 1, ();
847 2         6 $i--;
848             }
849              
850 1         13 return $self;
851             }
852              
853             =head2 merge_tags
854              
855             $st->merge_tags( $eqsub )
856              
857             Merge neighbouring or overlapping tags of the same name and equal values.
858              
859             For each pair of tags of the same name that apply on neighbouring or
860             overlapping extents, the C<$eqsub> callback is called, as
861              
862             $equal = $eqsub->( $name, $value_a, $value_b )
863              
864             If this function returns true then the tags are merged.
865              
866             The equallity test function is free to perform any comparison of the values
867             that may be relevant to the application; for example it may deeply compare
868             referred structures and check for equivalence in some application-defined
869             manner. In this case, the first tag of a pair is retained, the second is
870             deleted. This may be relevant if the tag value is a reference to some object.
871              
872             =cut
873              
874             sub merge_tags
875             {
876 8     8 1 8765 my $self = shift;
877 8         15 my ( $eqsub ) = @_;
878              
879 8         14 my $tags = $self->{tags};
880              
881             # Can't foreach() because we modify @$tags
882 8         22 OUTER: for( my $i = 0; $i < @$tags; $i++ ) {
883 13         25 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  13         28  
884              
885 13         40 for( my $j = $i+1; $j < @$tags; $j++ ) {
886 11         14 my ( $t2s, $t2e, $t2n, $t2v, $t2f ) = @{ $tags->[$j] };
  11         24  
887              
888 11 100       22 last if $t2s > $te;
889 10 50       22 next unless $t2s <= $te;
890 10 100       27 next unless $t2n eq $tn;
891              
892 7 100       22 last unless $eqsub->( $tn, $tv, $t2v );
893              
894             # Need to delete the tag at $j, extend the end of the tag at $i, and
895             # possibly move $i later
896 6         32 splice @$tags, $j, 1, ();
897 6         13 $j--;
898              
899 6         10 $te = $tags->[$i][1] = $t2e;
900              
901 6 100       17 $tags->[$i][4] |= FLAG_ANCHOR_AFTER if $t2f & FLAG_ANCHOR_AFTER;
902              
903 6         12 local $a = $tags->[$i];
904              
905 6 100 100     37 if( local $b = $tags->[$i+1] and _cmp_tags() > 0 ) {
906 1         2 my $newpos = $i+1;
907 1   33     6 while( local $b = $tags->[$newpos ] and _cmp_tags() <= 0 ) {
908 0         0 $newpos++;
909             }
910              
911 1         3 splice @$tags, $newpos, 0, splice @$tags, $i, 1, ();
912              
913 1         3 redo OUTER;
914             }
915             }
916             }
917             }
918              
919             =head2 iter_extents
920              
921             $st->iter_extents( $callback, %opts )
922              
923             Iterate the tags stored in the string. For each tag, the CODE reference in
924             C<$callback> is invoked once, being passed a L object
925             that represents the extent of the tag.
926              
927             $callback->( $extent, $tagname, $tagvalue )
928              
929             Options passed in C<%opts> may include:
930              
931             =over 4
932              
933             =item start => INT
934              
935             Start at the given position; defaults to 0.
936              
937             =item end => INT
938              
939             End after the given position; defaults to end of string. This option overrides
940             C.
941              
942             =item len => INT
943              
944             End after the given length beyond the start position; defaults to end of
945             string. This option only applies if C is not given.
946              
947             =item only => ARRAY
948              
949             Select only the tags named in the given ARRAY reference.
950              
951             =item except => ARRAY
952              
953             Select all the tags except those named in the given ARRAY reference.
954              
955             =back
956              
957             I it is safe to call C from within the
958             callback function to remove the tag currently being iterated on.
959              
960             $str->iter_extents( sub {
961             my ( $e, $n, $v ) = @_;
962             $str->delete_tag( $e, $n ) if $n =~ m/^tmp_/;
963             } );
964              
965             Apart from this scenario, the tags in the string should not otherwise be added
966             or removed while the iteration is occurring.
967              
968             =cut
969              
970             sub iter_extents
971             {
972 77     77 1 162 my $self = shift;
973 77         150 my ( $callback, %opts ) = @_;
974              
975             my $start = exists $opts{start} ? $opts{start} :
976 77 100       196 0;
977              
978             my $end = exists $opts{end} ? $opts{end} :
979             exists $opts{len} ? $start + $opts{len} :
980 77 50       279 $self->length + 1; # so as to include zerolen at end
    100          
981              
982 77 100       187 my $only = exists $opts{only} ? { map { $_ => 1 } @{ $opts{only} } } :
  1         6  
  1         3  
983             undef;
984              
985 77 100       175 my $except = exists $opts{except} ? { map { $_ => 1 } @{ $opts{except} } } :
  1         4  
  1         6  
986             undef;
987              
988 77         141 my $tags = $self->{tags};
989              
990 77         287 for ( my $i = 0; $i < @$tags; $i++ ) {
991 102         178 my $t = $tags->[$i];
992 102         236 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
993              
994 102 100       222 next if $te < $start;
995 101 100       191 last if $ts >= $end;
996              
997 100 100 100     255 next if $only and !$only->{$tn};
998 98 100 100     200 next if $except and $except->{$tn};
999              
1000 97         166 $t->[4] |= FLAG_ITERATING;
1001 97         243 local $self->{iterating} = $t;
1002              
1003 97         260 $callback->( $self->_mkextent( $ts, $te, $tf ), $tn, $tv );
1004              
1005 97         3569 $t->[4] &= ~FLAG_ITERATING;
1006              
1007 97 100       512 if( $t->[4] & FLAG_DELETED ) {
1008 5         8 splice @$tags, $i, 1, ();
1009 5         16 $i--;
1010             }
1011             }
1012             }
1013              
1014             =head2 iter_tags
1015              
1016             $st->iter_tags( $callback, %opts )
1017              
1018             Iterate the tags stored in the string. For each tag, the CODE reference in
1019             C<$callback> is invoked once, being passed the start point and length of the
1020             tag.
1021              
1022             $callback->( $start, $length, $tagname, $tagvalue )
1023              
1024             Options passed in C<%opts> are the same as for C.
1025              
1026             =cut
1027              
1028             sub iter_tags
1029             {
1030 25     25 1 11788 my $self = shift;
1031 25         59 my ( $callback, %opts ) = @_;
1032              
1033             $self->iter_extents(
1034             sub {
1035 36     36   67 my ( $e, $tn, $tv ) = @_;
1036 36         101 $callback->( $e->start, $e->length, $tn, $tv );
1037             },
1038 25         135 %opts
1039             );
1040             }
1041              
1042             =head2 iter_extents_nooverlap
1043              
1044             $st->iter_extents_nooverlap( $callback, %opts )
1045              
1046             Iterate non-overlapping extents of tags stored in the string. The CODE
1047             reference in C<$callback> is invoked for each extent in the string where no
1048             tags change. The entire set of tags active in that extent is given to the
1049             callback. Because the extent covers possibly-multiple tags, it will not define
1050             the C and C flags.
1051              
1052             $callback->( $extent, %tags )
1053              
1054             The callback will be invoked over the entire length of the string, including
1055             any extents with no tags applied.
1056              
1057             Options may be passed in C<%opts> to control the range of the string iterated
1058             over, in the same way as the C method.
1059              
1060             If the C or C filters are applied, then only the tags that
1061             survive filtering will be present in the C<%tags> hash. Tags that are excluded
1062             by the filtering will not be present, nor will their bounds be used to split
1063             the string into extents.
1064              
1065             =cut
1066              
1067             sub iter_extents_nooverlap
1068             {
1069 34     34 1 63 my $self = shift;
1070 34         67 my ( $callback, %opts ) = @_;
1071              
1072             my $start = exists $opts{start} ? $opts{start} :
1073 34 100       96 0;
1074              
1075             my $end = exists $opts{end} ? $opts{end} :
1076             exists $opts{len} ? $start + $opts{len} :
1077 34 50       172 $self->length;
    100          
1078              
1079 34 100       82 my $only = exists $opts{only} ? { map { $_ => 1 } @{ $opts{only} } } :
  1         5  
  1         3  
1080             undef;
1081              
1082 34 100       76 my $except = exists $opts{except} ? { map { $_ => 1 } @{ $opts{except} } } :
  1         5  
  1         3  
1083             undef;
1084              
1085 34         52 my $tags = $self->{tags};
1086              
1087 34         58 my @active; # ARRAY of [ $ts, $te, $tn, $tv ]
1088 34         51 my $pos = $start;
1089              
1090 34         78 foreach my $t ( @$tags ) {
1091 76         179 my ( $ts, $te, $tn, $tv ) = @$t;
1092              
1093 76 100       151 next if $te < $start;
1094 74 100       141 last if $ts > $end;
1095              
1096 72 100 100     153 next if $only and !$only->{$tn};
1097 70 100 100     163 next if $except and $except->{$tn};
1098              
1099 69         152 while( $pos < $ts ) {
1100 43         74 my %activetags;
1101             my %tagends;
1102 43         65 my $rangeend = $ts;
1103              
1104 43         91 foreach ( @active ) {
1105 42         80 my ( undef, $e, $n, $v ) = @$_;
1106              
1107 42 100       93 $e < $rangeend and $rangeend = $e;
1108 42 100 66     105 next if $tagends{$n} and $tagends{$n} < $e;
1109              
1110 41         69 $activetags{$n} = $v;
1111 41         98 $tagends{$n} = $e;
1112             }
1113              
1114 43         110 $callback->( $self->_mkextent( $pos, $rangeend, 0 ), %activetags );
1115              
1116 43         463 $pos = $rangeend;
1117 43         85 @active = grep { $_->[1] > $pos } @active;
  42         181  
1118             }
1119              
1120 69         213 push @active, [ $ts, $te, $tn, $tv ];
1121             }
1122              
1123 34         72 while( $pos < $end ) {
1124 53         123 my %activetags;
1125             my %tagends;
1126 53         75 my $rangeend = $end;
1127              
1128 53         101 foreach ( @active ) {
1129 72         133 my ( undef, $e, $n, $v ) = @$_;
1130              
1131 72 100       141 $e < $rangeend and $rangeend = $e;
1132 72 100 100     159 next if $tagends{$n} and $tagends{$n} < $e;
1133              
1134 71         114 $activetags{$n} = $v;
1135 71         133 $tagends{$n} = $e;
1136             }
1137              
1138 53         140 $callback->( $self->_mkextent( $pos, $rangeend, 0 ), %activetags );
1139              
1140 53         1285 $pos = $rangeend;
1141 53         135 @active = grep { $_->[1] > $pos } @active;
  72         262  
1142             }
1143              
1144             # We might have zero-length tags active at the very end of the range
1145 34 100       196 if( my @zerolen = grep { $_->[0] == $pos and $_->[1] == $pos } @active ) {
  3 100       23  
1146 1         4 my %activetags;
1147 1         5 foreach ( @active ) {
1148 1         4 my ( undef, undef, $n, $v ) = @$_;
1149              
1150 1         4 $activetags{$n} = $v;
1151             }
1152              
1153 1         5 $callback->( $self->_mkextent( $pos, $pos, 0 ), %activetags );
1154             }
1155             }
1156              
1157             =head2 iter_tags_nooverlap
1158              
1159             $st->iter_tags_nooverlap( $callback, %opts )
1160              
1161             Iterate extents of the string using C, but passing
1162             the start and length of each extent to the callback instead of the extent
1163             object.
1164              
1165             $callback->( $start, $length, %tags )
1166              
1167             Options may be passed in C<%opts> to control the range of the string iterated
1168             over, in the same way as the C method.
1169              
1170             =cut
1171              
1172             sub iter_tags_nooverlap
1173             {
1174 28     28 1 8840 my $self = shift;
1175 28         69 my ( $callback, %opts ) = @_;
1176              
1177             $self->iter_extents_nooverlap(
1178             sub {
1179 80     80   220 my ( $e, %tags ) = @_;
1180 80         206 $callback->( $e->start, $e->length, %tags );
1181             },
1182 28         226 %opts
1183             );
1184             }
1185              
1186             =head2 iter_substr_nooverlap
1187              
1188             $st->iter_substr_nooverlap( $callback, %opts )
1189              
1190             Iterate extents of the string using C, but passing the
1191             substring of data instead of the extent object.
1192              
1193             $callback->( $substr, %tags )
1194              
1195             Options may be passed in C<%opts> to control the range of the string iterated
1196             over, in the same way as the C method.
1197              
1198             =cut
1199              
1200             sub iter_substr_nooverlap
1201             {
1202 6     6 1 6209 my $self = shift;
1203 6         18 my ( $callback, %opts ) = @_;
1204              
1205             $self->iter_extents_nooverlap(
1206             sub {
1207 17     17   39 my ( $e, %tags ) = @_;
1208 17         61 $callback->( $e->plain_substr, %tags );
1209             },
1210 6         42 %opts,
1211             );
1212             }
1213              
1214             =head2 tagnames
1215              
1216             @names = $st->tagnames
1217              
1218             Returns the set of tag names used in the string, in no particular order.
1219              
1220             =cut
1221              
1222             sub tagnames
1223             {
1224 14     14 1 73 my $self = shift;
1225              
1226 14         90 my $tags = $self->{tags};
1227              
1228 14         25 my %tags;
1229 14         31 foreach my $t ( @$tags ) {
1230 15         40 $tags{$t->[2]}++;
1231             }
1232              
1233 14         135 keys %tags;
1234             }
1235              
1236             =head2 get_tags_at
1237              
1238             $tags = $st->get_tags_at( $pos )
1239              
1240             Returns a HASH reference of all the tag values active at the given position.
1241              
1242             =cut
1243              
1244             sub get_tags_at
1245             {
1246 11     11 1 1891 my $self = shift;
1247 11         21 my ( $pos ) = @_;
1248              
1249 11         18 my $tags = $self->{tags};
1250              
1251 11         16 my %tags;
1252              
1253             # TODO: turn this into a binary search
1254 11         26 foreach my $t ( @$tags ) {
1255 14         30 my ( $ts, $te, $tn, $tv ) = @$t;
1256              
1257 14 100       35 last if $ts > $pos;
1258 11 100       23 next if $te <= $pos;
1259              
1260 10         29 $tags{$tn} = $tv;
1261             }
1262              
1263 11         65 return \%tags;
1264             }
1265              
1266             =head2 get_tag_at
1267              
1268             $value = $st->get_tag_at( $pos, $name )
1269              
1270             Returns the value of the named tag at the given position, or C if the
1271             tag is not applied there.
1272              
1273             =cut
1274              
1275             sub get_tag_at
1276             {
1277 6     6 1 23 my $self = shift;
1278 6         32 my ( $pos, $name ) = @_;
1279              
1280 6         14 my $tags = $self->{tags};
1281              
1282 6         12 my $value;
1283              
1284 6         45 foreach my $t ( @$tags ) {
1285 15         38 my ( $ts, $te, $tn, $tv ) = @$t;
1286              
1287 15 100       54 last if $ts > $pos;
1288 11 100       30 next if $te <= $pos;
1289              
1290 8 100       53 $value = $tv if $tn eq $name;
1291             }
1292              
1293 6         34 return $value;
1294             }
1295              
1296             =head2 get_tag_extent
1297              
1298             $extent = $st->get_tag_extent( $pos, $name )
1299              
1300             If the named tag applies to the given position, returns a
1301             L object to represent the extent of the tag at that
1302             position. If it does not, C is returned. If an extent is returned it
1303             will define the C and C flags if appropriate.
1304              
1305             =cut
1306              
1307             sub get_tag_extent
1308             {
1309 10     10 1 31 my $self = shift;
1310 10         24 my ( $pos, $name ) = @_;
1311              
1312 10         24 my $tags = $self->{tags};
1313              
1314 10         22 my ( $start, $end, $flags );
1315              
1316 10         31 foreach my $t ( @$tags ) {
1317 14         35 my ( $ts, $te, $tn, undef, $tf ) = @$t;
1318              
1319 14 100       46 last if $ts > $pos;
1320 13 100       35 next if $te <= $pos;
1321              
1322 12 100       31 next unless $tn eq $name;
1323              
1324 9         17 $start = $ts;
1325 9         15 $end = $te;
1326 9         18 $flags = $tf;
1327             }
1328              
1329 10 100       26 if( defined $start ) {
1330 9         36 return $self->_mkextent( $start, $end, $flags );
1331             }
1332             else {
1333 1         4 return undef;
1334             }
1335             }
1336              
1337             =head2 get_tag_missing_extent
1338              
1339             $extent = $st->get_tag_missing_extent( $pos, $name )
1340              
1341             If the named tag does not apply at the given position, returns the extent of
1342             the string around that position that does not have the tag. If it does exist,
1343             C is returned. If an extent is returned it will not define the
1344             C and C flags, as these do not make sense for the
1345             range in which a tag is absent.
1346              
1347             =cut
1348              
1349             sub get_tag_missing_extent
1350             {
1351 3     3 1 481 my $self = shift;
1352 3         8 my ( $pos, $name ) = @_;
1353              
1354 3         6 my $tags = $self->{tags};
1355              
1356 3         4 my $start = 0;
1357              
1358 3         7 foreach my $t ( @$tags ) {
1359 6         12 my ( $ts, $te, $tn ) = @$t;
1360              
1361 6 100       17 next unless $tn eq $name;
1362              
1363 3 100 100     14 if( $ts <= $pos and $te > $pos ) {
1364 1         4 return undef;
1365             }
1366              
1367 2 100       7 if( $ts > $pos ) {
1368 1         4 return $self->_mkextent( $start, $ts, 0 );
1369             }
1370              
1371 1         5 $start = $te;
1372             }
1373              
1374 1         3 return $self->_mkextent( $start, $self->length, 0 );
1375             }
1376              
1377             =head2 set_substr
1378              
1379             $st->set_substr( $start, $len, $newstr )
1380              
1381             Modifies a extent of the underlying plain string to that given. The extents of
1382             tags in the string are adjusted to cope with the modified region, and the
1383             adjustment in length.
1384              
1385             Tags entirely before the replaced extent remain unchanged.
1386              
1387             Tags entirely within the replaced extent are deleted.
1388              
1389             Tags entirely after the replaced extent are moved by appropriate amount to
1390             ensure they still apply to the same characters as before.
1391              
1392             Tags that start before and end after the extent remain, and have their lengths
1393             suitably adjusted.
1394              
1395             Tags that span just the start or end of the extent, but not both, are
1396             truncated, so as to remove the part of the tag applied on the modified extent
1397             but preserving that applied outside.
1398              
1399             If C<$newstr> is a C object, then its tags will be applied to
1400             C<$st> as appropriate. Edge-anchored tags in C<$newstr> will not be extended
1401             through C<$st>, though they will apply as edge-anchored if they now sit at the
1402             edge of the new string.
1403              
1404             =cut
1405              
1406             sub set_substr
1407             {
1408 56     56 1 15451 my $self = shift;
1409 56         140 my ( $start, $len, $new ) = @_;
1410              
1411 56         145 my $limit = $self->length;
1412              
1413 56 50       215 $start = $limit if $start > $limit;
1414 56 50       147 $len = ( $limit - $start ) if $len > ( $limit - $start );
1415              
1416 56         218 CORE::substr( $self->{str}, $start, $len ) = $new;
1417              
1418 56         100 my $oldend = $start + $len;
1419 56         98 my $newend = $start + CORE::length( $new );
1420              
1421 56         116 my $delta = $newend - $oldend;
1422             # Positions after $oldend have now moved up $delta places
1423              
1424 56         97 my $tags = $self->{tags};
1425              
1426 56         107 my $i = 0;
1427              
1428 56         169 for( ; $i < @$tags; $i++ ) {
1429             # In this loop we'll handle tags that start before the deleted section
1430              
1431 42         93 my $t = $tags->[$i];
1432 42         100 my ( $ts, $te, undef, undef, $tf ) = @$t;
1433              
1434 42 100 100     190 last if $ts >= $start and not( $tf & FLAG_ANCHOR_BEFORE );
1435              
1436             # Two cases:
1437             # A: Tag spans entirely outside deleted section - stretch/compress it
1438             # We may have to collapse it to nothing, so delete it
1439             # B: Tag starts before but ends within deleted section - truncate it
1440             # Plus a case we don't care about
1441             # Tag starts and ends entirely before the deleted section - ignore it
1442              
1443 30 100 100     220 if( $te > $oldend or
    100 100        
1444             ( $te == $oldend and $tf & FLAG_ANCHOR_AFTER ) ) {
1445             # Case A
1446 16         31 $t->[1] += $delta;
1447              
1448 16 50       65 if( $t->[0] == $t->[1] ) {
1449 0         0 splice @$tags, $i, 1, ();
1450 0         0 $i--;
1451 0         0 next;
1452             }
1453             }
1454             elsif( $te > $start ) {
1455             # Case B
1456 1         6 $t->[1] = $start;
1457             }
1458             }
1459              
1460 56         171 for( ; $i < @$tags; $i++ ) {
1461 13         55 my $t = $tags->[$i];
1462 13         51 my ( $ts, $te ) = @$t;
1463              
1464             # In this loop we'll handle tags that start within the deleted section
1465 13 100       63 last if $ts >= $oldend;
1466              
1467             # Two cases
1468             # C: Tag contained entirely within deleted section - delete it
1469             # D: Tag starts within but ends after the deleted section - truncate it
1470              
1471 3 100       10 if( $te <= $oldend ) {
1472             # Case C
1473 2         5 splice @$tags, $i, 1;
1474 2         5 $i--;
1475 2         7 next;
1476             }
1477             else {
1478             # Case D
1479 1         1 $t->[0] = $newend;
1480 1         3 $t->[1] += $delta;
1481             }
1482             }
1483              
1484 56         151 for( ; $i < @$tags; $i++ ) {
1485 12         38 my $t = $tags->[$i];
1486 12         34 my ( $ts, $te, undef, undef, $tf ) = @$t;
1487              
1488             # In this loop we'll handle tags that start after the deleted section
1489              
1490             # One case
1491             # E: Tag starts and ends after the deleted section - move it
1492 12 100       50 $t->[0] += $delta unless $tf & FLAG_ANCHOR_BEFORE;
1493 12         23 $t->[1] += $delta;
1494              
1495             # If we've not moved the start (because it was FLAG_ANCHOR_BEFORE), we
1496             # might now have an ordering constraint violation. Better fix it.
1497 12         23 local $b = $t;
1498 12         47 foreach my $new_i ( reverse 0 .. $i-1 ) {
1499 7         14 local $a = $tags->[$new_i];
1500              
1501 7 100       17 last if _cmp_tags() <= 0;
1502              
1503 1         6 splice @$tags, $new_i, 0, splice @$tags, $i, 1, ();
1504              
1505 1         3 last;
1506             }
1507             }
1508              
1509 56 100       120 if( is_string_tagged( $new ) ) {
1510 21         74 my $atstart = $start == 0;
1511 21         56 my $atend = $newend == $self->length;
1512              
1513             $new->iter_extents( sub {
1514 21     21   58 my ( $e, $tn, $tv ) = @_;
1515 21 50 66     157 $self->apply_tag(
    100 100        
1516             ( $atstart && $e->anchor_before ) ? -1 : $start + $e->start,
1517             ( $atend && $e->anchor_after ) ? -1 : $e->length,
1518             $tn, $tv );
1519 21         175 } );
1520             }
1521              
1522 56         168 $self->_assert_sorted if DEBUG;
1523              
1524 56         206 return $self;
1525             }
1526              
1527             =head2 insert
1528              
1529             $st->insert( $start, $newstr )
1530              
1531             Insert the given string at the given position. A shortcut around
1532             C.
1533              
1534             If C<$newstr> is a C object, then its tags will be applied to
1535             C<$st> as appropriate. If C<$start> is 0, any before-anchored tags in will
1536             become before-anchored in C<$st>.
1537              
1538             =cut
1539              
1540             sub insert
1541             {
1542 15     15 1 7654 my $self = shift;
1543 15         45 my ( $at, $new ) = @_;
1544 15         82 $self->set_substr( $at, 0, $new );
1545             }
1546              
1547             =head2 append
1548              
1549             $st->append( $newstr )
1550              
1551             $st .= $newstr
1552              
1553             Append to the underlying plain string. A shortcut around C.
1554              
1555             If C<$newstr> is a C object, then its tags will be applied to
1556             C<$st> as appropriate. Any after-anchored tags in will become after-anchored
1557             in C<$st>.
1558              
1559             =cut
1560              
1561 20     20   84549 use overload '.=' => 'append';
  20         57  
  20         140  
1562              
1563             sub append
1564             {
1565 64     64 1 1316 my $self = shift;
1566 64         172 my ( $new ) = @_;
1567              
1568 64 100       165 return $self->set_substr( $self->length, 0, $new ) if is_string_tagged( $new );
1569              
1570             # Optimised version
1571 46         159 $self->{str} .= $new;
1572              
1573 46         138 my $newend = $self->length;
1574              
1575 46         98 my $tags = $self->{tags};
1576              
1577 46         86 my $i = 0;
1578              
1579             # Adjust boundaries of ANCHOR_AFTER tags
1580 46         136 for( ; $i < @$tags; $i++ ) {
1581 59         98 my $t = $tags->[$i];
1582 59 100       179 $t->[1] = $newend if $t->[4] & FLAG_ANCHOR_AFTER;
1583             }
1584              
1585 46         150 return $self;
1586             }
1587              
1588             =head2 append_tagged
1589              
1590             $st->append_tagged( $newstr, %tags )
1591              
1592             Append to the underlying plain string, and apply the given tags to the
1593             newly-inserted extent.
1594              
1595             Returns C<$st> itself so that the method may be easily chained.
1596              
1597             =cut
1598              
1599             sub append_tagged
1600             {
1601 18     18 1 39 my $self = shift;
1602 18         52 my ( $new, %tags ) = @_;
1603              
1604 18         40 my $start = $self->length;
1605 18         31 my $len = CORE::length( $new );
1606              
1607 18         58 $self->append( $new );
1608 18         67 $self->apply_tag( $start, $len, $_, $tags{$_} ) for keys %tags;
1609              
1610 18         88 return $self;
1611             }
1612              
1613             =head2 concat
1614              
1615             $ret = $st->concat( $other )
1616              
1617             $ret = $st . $other
1618              
1619             Returns a new C containing the two strings concatenated
1620             together, preserving any tags present. This method overloads normal string
1621             concatenation operator, so expressions involving C values
1622             retain their tags.
1623              
1624             This method or operator tries to respect subclassing; preferring to return a
1625             new object of a subclass if either argument or operand is a subclass of
1626             C. If they are both subclasses, it will prefer the type of the
1627             invocant or first operand.
1628              
1629             =cut
1630              
1631 20     20   7152 use overload '.' => 'concat';
  20         88  
  20         136  
1632              
1633             sub concat
1634             {
1635 17     17 1 7674 my $self = shift;
1636 17         49 my ( $other, $swap ) = @_;
1637              
1638             # Try to find the "higher" subclass
1639 17 100 100     122 my $class = ( ref $self eq __PACKAGE__ and is_string_tagged( $other ) )
1640             ? ref $other : ref $self;
1641              
1642 17         71 my $ret = $class->new( $self );
1643 17 100       102 return $ret->insert( 0, $other ) if $swap;
1644 6         23 return $ret->append( $other );
1645             }
1646              
1647             =head2 matches
1648              
1649             @subs = $st->matches( $regexp )
1650              
1651             Returns a list of substrings (as C instances) for every
1652             non-overlapping match of the given C<$regexp>.
1653              
1654             This could be used, for example, to build a formatted string from a formatted
1655             template containing variable expansions:
1656              
1657             my $template = ...
1658             my %vars = ...
1659              
1660             my $ret = String::Tagged->new;
1661             foreach my $m ( $template->matches( qr/\$\w+|[^$]+/ ) ) {
1662             if( $m =~ m/^\$(\w+)$/ ) {
1663             $ret->append_tagged( $vars{$1}, %{ $m->get_tags_at( 0 ) } );
1664             }
1665             else {
1666             $ret->append( $m );
1667             }
1668             }
1669              
1670             This iterates segments of the template containing variables expansions
1671             starting with a C<$> symbol, and replaces them with values from the C<%vars>
1672             hash, careful to preserve all the formatting tags from the original template
1673             string.
1674              
1675             =cut
1676              
1677             sub matches
1678             {
1679 1     1 1 9 my $self = shift;
1680 1         3 my ( $re ) = @_;
1681              
1682 1         4 my $plain = $self->str;
1683              
1684 1         3 my @ret;
1685 1         12 while( $plain =~ m/$re/g ) {
1686 5         29 push @ret, $self->substr( $-[0], $+[0] - $-[0] );
1687             }
1688              
1689 1         6 return @ret;
1690             }
1691              
1692             =head2 match_extents
1693              
1694             @extents = $st->match_extents( $regexp )
1695              
1696             I
1697              
1698             Returns a list of extent objects for every non-overlapping match of the given
1699             C<$regexp>. This is similar to L, except that the results are
1700             returned as extent objects instead of substrings, allowing access to the
1701             position information as well.
1702              
1703             If using the result of this method to find regions of a string to modify,
1704             remember that any length alterations will not update positions in later extent
1705             objects. However, since the extents are non-overlapping and in position order,
1706             this can be handled by iterating them in reverse order so that the
1707             modifications done first are later in the string.
1708              
1709             foreach my $e ( reverse $st->match_extents( $pattern ) ) {
1710             $st->set_substr( $e->start, $e->length, $replacement );
1711             }
1712              
1713             =cut
1714              
1715             sub match_extents
1716             {
1717 2     2 1 10059 my $self = shift;
1718 2         3 my ( $re ) = @_;
1719              
1720 2         6 my $plain = $self->str;
1721              
1722 2         4 my @ret;
1723 2         18 while( $plain =~ m/$re/g ) {
1724 9         29 push @ret, $self->_mkextent( $-[0], $+[0], 0 );
1725             }
1726              
1727 2         10 return @ret;
1728             }
1729              
1730             =head2 split
1731              
1732             @parts = $st->split( $regexp, $limit )
1733              
1734             Returns a list of substrings by applying the regexp to the string content;
1735             similar to the core perl C function. If C<$limit> is supplied, the
1736             method will stop at that number of elements, returning the entire remainder of
1737             the input string as the final element. If the C<$regexp> contains a capture
1738             group then the content of the first one will be added to the return list as
1739             well.
1740              
1741             =cut
1742              
1743             sub split
1744             {
1745 4     4 1 40 my $self = shift;
1746 4         8 my ( $re, $limit ) = @_;
1747              
1748 4         11 my $plain = $self->str;
1749              
1750 4         7 my $prev = 0;
1751 4         6 my @ret;
1752 4         55 while( $plain =~ m/$re/g ) {
1753 5         26 push @ret, $self->substr( $prev, $-[0]-$prev );
1754 5 100       90 push @ret, $self->substr( $-[1], $+[1]-$-[1] ) if @- > 1;
1755              
1756 5         16 $prev = $+[0];
1757              
1758 5 100 66     36 last if defined $limit and @ret == $limit-1;
1759             }
1760              
1761 4 100       13 if( CORE::length $plain > $prev ) {
1762 3         9 push @ret, $self->substr( $prev, CORE::length( $plain ) - $prev );
1763             }
1764              
1765 4         28 return @ret;
1766             }
1767              
1768             =head2 sprintf
1769              
1770             $ret = $st->sprintf( @args )
1771              
1772             I
1773              
1774             Returns a new string by using the given instance as the format string for a
1775             L constructor call. The returned instance will be of the same
1776             class as the invocant.
1777              
1778             =cut
1779              
1780             sub sprintf
1781             {
1782 1     1 1 2 my $self = shift;
1783              
1784 1         4 return ( ref $self )->from_sprintf( $self, @_ );
1785             }
1786              
1787             =head2 debug_sprintf
1788              
1789             $ret = $st->debug_sprintf
1790              
1791             Returns a representation of the string data and all the tags, suitable for
1792             debug printing or other similar use. This is a format such as is given in the
1793             DESCRIPTION section above.
1794              
1795             The output will consist of a number of lines, the first containing the plain
1796             underlying string, then one line per tag. The line shows the extent of the tag
1797             given by C<[---]> markers, or a C<|> in the special case of a tag covering
1798             only a single character. Special markings of C> and C> indicate
1799             tags which are "before" or "after" anchored.
1800              
1801             For example:
1802              
1803             Hello, world
1804             [---] word => 1
1805             <[----------]> everywhere => 1
1806             | space => 1
1807              
1808             =cut
1809              
1810             sub debug_sprintf
1811             {
1812 5     5 1 631 my $self = shift;
1813              
1814 5         13 my $str = $self->str;
1815 5         12 my $len = CORE::length( $str );
1816              
1817 5         8 my $maxnamelen = 0;
1818              
1819 5         19 my $ret = " " . ( $str =~ s/\n/./gr ) . "\n";
1820              
1821             $self->iter_tags( sub {
1822 7     7   19 my ( undef, undef, $name, undef ) = @_;
1823 7 100       21 CORE::length( $name ) > $maxnamelen and $maxnamelen = CORE::length( $name );
1824 5         36 } );
1825              
1826 5         34 foreach my $t ( @{ $self->{tags} } ) {
  5         15  
1827 7         19 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
1828              
1829 7 100       19 $ret .= ( $tf & FLAG_ANCHOR_BEFORE ) ? " <" : " ";
1830              
1831 7         22 $ret .= " " x $ts;
1832              
1833 7         13 my $tl = $te - $ts;
1834              
1835 7 100       19 if( $tl == 0 ) {
    100          
1836 1         6 $ret =~ s/ $/>
1837 1         3 $te++; # account for extra printed width
1838             }
1839             elsif( $tl == 1 ) {
1840 2         3 $ret .= "|";
1841             }
1842             else {
1843 4         10 $ret .= "[" . ( "-" x ( $tl - 2 ) ) . "]";
1844             }
1845              
1846 7         13 $ret .= " " x ( $len - $te );
1847              
1848 7 100       15 $ret .= ( $tf & FLAG_ANCHOR_AFTER ) ? "> " : " ";
1849              
1850 7         40 $ret .= CORE::sprintf "%-*s => %s\n", $maxnamelen, $tn, $tv;
1851             }
1852              
1853 5         36 return $ret;
1854             }
1855              
1856             =head1 TODO
1857              
1858             =over 4
1859              
1860             =item *
1861              
1862             There are likely variations on the rules for C that could equally
1863             apply to some uses of tagged strings. Consider whether the behaviour of
1864             modification is chosen per-method, per-tag, or per-string.
1865              
1866             =item *
1867              
1868             Consider how to implement a clone from one tag format to another which wants
1869             to merge multiple different source tags together into a single new one.
1870              
1871             =back
1872              
1873             =head1 AUTHOR
1874              
1875             Paul Evans
1876              
1877             =cut
1878              
1879             0x55AA;