File Coverage

blib/lib/Data/Random/Structure/UTF8.pm
Criterion Covered Total %
statement 96 108 88.8
branch 44 58 75.8
condition 13 15 86.6
subroutine 15 15 100.0
pod 6 6 100.0
total 174 202 86.1


line stmt bran cond sub pod time code
1             package Data::Random::Structure::UTF8;
2              
3 4     4   319017 use 5.8.0;
  4         44  
4 4     4   27 use strict;
  4         8  
  4         106  
5 4     4   20 use warnings;
  4         6  
  4         212  
6              
7             our $VERSION='0.04';
8              
9 4     4   1814 use parent 'Data::Random::Structure';
  4         1216  
  4         22  
10              
11 4     4   69394 use Scalar::Util qw( looks_like_number );
  4         9  
  4         4994  
12              
13             sub new {
14 5     5 1 4031 my $class = shift;
15 5         26 my %options = @_;
16 5         10 my $only_unicode = 0;
17 5 100       20 if( exists $options{'only-unicode'} ){
18 3 50       10 if( defined $options{'only-unicode'} ){
19 3         7 $only_unicode = $options{'only-unicode'}
20             }
21             # do not pass our options to parent it may get confused and croak
22 3         5 delete $options{'only-unicode'}
23             }
24 5         39 my $self = $class->SUPER::new(%options);
25             # at this point our _init() will be called via parent's
26             # constructor. Our _init() will call parent's _init()
27 5         34 $self->only_unicode($only_unicode);
28 5         32 return $self
29             }
30             sub _reset {
31 12     12   18 my $self = shift;
32             # we are interfering with the internals of the parent... not good
33 12         22 $#{$self->{_types}} = -1;
  12         34  
34 12         23 $#{$self->{_scalar_types}} = -1;
  12         32  
35             }
36             sub _init {
37 12     12   108 my $self = shift;
38 12         36 $self->_reset();
39 12         48 $self->SUPER::_init(@_);
40 12         124 push @{$self->{_scalar_types}}, 'string-UTF8'
  12         34  
41             }
42             sub only_unicode {
43 14     14 1 3639 my $self = $_[0];
44 14         28 my $m = $_[1];
45 14 100       68 return $self->{'_only-unicode'} unless defined $m;
46 7         24 $self->_init();
47 7         19 $self->{'_only-unicode'} = $m;
48 7 100       39 if( $m == 1 ){
    100          
49             # delete just the 'string' type
50             # we will get various types but the strings will
51             # be exclusively unicode
52 5         12 my @idx = grep { $self->{'_scalar_types'}->[$_] eq 'string' }
53 1         3 reverse 0 .. $#{$self->{_scalar_types}}
  1         4  
54             ;
55 1         3 splice(@{$self->{_scalar_types}}, $_, 1) for @idx;
  1         3  
56             } elsif( $m > 1 ){
57             # delete ALL the _scalar_types and leave just our unicode string
58             # we will get only unicode strings no other scalar type
59 2         4 $#{$self->{_scalar_types}} = -1;
  2         6  
60 2         4 push @{$self->{_scalar_types}}, 'string-UTF8'
  2         5  
61             }
62 7         12 return $m
63             }
64             sub random_char_UTF8 {
65             # the crucial part borrowed from The Effective Perler:
66             # https://www.effectiveperlprogramming.com/2018/08/find-the-new-emojis-in-perls-unicode-support/
67             # my $achar;
68             # for(my $trials=100;$trials-->0;){
69             # $achar = chr(int(rand(0x10FFF+1)));
70             # return $achar if $achar =~ /\p{Present_In: 8.0}/;
71             # }
72              
73             # just greek and coptic no holes
74 17672     17672 1 44984 return chr(0x03B0+int(rand(0x03F0-0x03B0)));
75              
76 0         0 my $arand = rand();
77 0 0       0 if( $arand < 0.2 ){
    0          
    0          
    0          
78 0         0 return chr(0x03B0+int(rand(0x03F0-0x03B0)))
79             } elsif( $arand < 0.4 ){
80 0         0 return chr(0x0400+int(rand(0x040F-0x0400)))
81             } elsif( $arand < 0.6 ){
82 0         0 return chr(0x13A0+int(rand(0x13EF-0x13A0)))
83             } elsif( $arand < 0.8 ){
84 0         0 return chr(0x1200+int(rand(0x137F-0x1200)))
85             }
86 0         0 return chr(0xA980+int(rand(0xA9DF-0xA980)))
87             }
88             sub random_chars_UTF8 {
89 8836     8836 1 19449 my %options = @_;
90 8836 50       15586 my $minl = defined($options{'min'}) ? $options{'min'} : 6;
91 8836 50       13944 my $maxl = defined($options{'max'}) ? $options{'max'} : 32;
92 8836         11656 my $ret = "";
93 8836         23765 for(1..($minl+int(rand($maxl-$minl)))){
94 17672         25972 $ret .= random_char_UTF8()
95             }
96 8836         31525 return $ret;
97             }
98             # override's parent's.
99             # first call parent's namesake and if it fails because it
100             # is decided to generate UTF8 something, it will default to
101             # this method which must deal with all the extenstions we introduced
102             # in our own _init()
103             # CAVEAT: it relies on parent croaking the message
104             # "I don't know how to generate $type\n"
105             # if that chanegs (in parent) then we will no longer be able to deduce
106             # $type and have to change this program.
107             # if that happens please file a bug.
108             # unfortunately our parent class does not allow for input params...
109             sub generate_scalar {
110 18439     18439 1 93956 my $self = shift;
111 18439         22500 my $rc = eval { $self->SUPER::generate_scalar(@_) };
  18439         33908  
112 18439 100 66     1729447 if( $@ || ! defined($rc) ){
113 8836 50       50714 if( $@ !~ /how to generate (.+?)\R/ ){
114 0         0 warn "something changed in parent class and can not parse this message any more, please file a bug: '$@'";
115 0         0 return scalar(random_chars_UTF8(min=>2,max=>2));
116             }
117 8836         17874 my $type = $1;
118 8836 50       16837 if( $type eq 'string-UTF8' ){
119 8836         16259 return scalar(random_chars_UTF8(min=>2,max=>2));
120             } else {
121 0         0 warn "child: I don't know how to generate $type, this is a bug, please file a bug and mention this: $@\n";
122             # but don't die
123 0         0 return scalar(random_chars_UTF8(min=>2,max=>2));
124             }
125             }
126 9603         25513 return $rc
127             }
128             sub check_content_recursively {
129 1836     1836 1 17044 my $looking_for = $_[1]; # a hashref of types to look-for, required
130 1836         2097 my $bitparams = 0;
131 1836 100 66     6105 $bitparams |= 1 if exists($looking_for->{'numbers'}) && ($looking_for->{'numbers'}==1);
132 1836 100 100     4081 $bitparams |= 2 if exists($looking_for->{'strings-unicode'}) && ($looking_for->{'strings-unicode'}==1);
133 1836 100 100     4032 $bitparams |= 4 if exists($looking_for->{'strings-plain'}) && ($looking_for->{'strings-plain'}==1);
134 1836 100 100     3791 $bitparams |= (2+4) if exists($looking_for->{'strings'}) && ($looking_for->{'strings'}==1);
135 1836         3065 return _check_content_recursively($_[0], $bitparams);
136             }
137             # returns 1 if we are looking for it and it was found
138             # returns 0 if what we were looking for was not found.
139             # 'looking_for' can be more than one things.
140             # it is a bit string, 1st bit if set looks for numbers,
141             # 2nd bit, if set, looks for unicode strings,
142             # 3rd bit, if set, looks for non-unicode strings (plain)
143             sub _check_content_recursively {
144 22264     22264   25610 my $inp = $_[0];
145             # NUMBER,UNICODE_STRING,NON_UNICODE_STRING
146 22264         22151 my $looking_for = $_[1];
147 22264         24804 my $aref = ref($inp);
148 22264         23852 my ($r, $v);
149 22264 100       30770 if( $aref eq '' ){
    100          
    50          
150 19811 100       36542 if( looks_like_number($inp) ){
151 5074 100       7456 return 1 if $looking_for & 1; # a number
152 4678         6638 return 0;
153             }
154 14737 100       18060 if( _has_utf8($inp) ){
155 13943 100       21829 return 1 if $looking_for & 2; # unicode string
156 13355         20792 return 0;
157             }
158 794 100       1740 return 1 if $looking_for & 4; # plain string
159 512         828 return 0;
160             } elsif( $aref eq 'ARRAY' ){
161 1251         1876 for my $v (@$inp){
162 7748         9817 $r = _check_content_recursively($v, $looking_for);
163 7748 100       13713 return 1 if $r;
164             }
165             } elsif( $aref eq 'HASH' ){
166 1202         6072 for my $k (sort keys %$inp){
167 6526         8606 $r = _check_content_recursively($k, $looking_for);
168 6526 100       10508 return 1 if $r;
169 6154         8530 $r = _check_content_recursively($inp->{$k}, $looking_for);
170 6154 100       11663 return 1 if $r;
171             }
172 0         0 } else { die "don't know how to deal with this ref '$aref'" }
173             }
174 14737     14737   39238 sub _has_utf8 { return $_[0] =~ /[^\x00-\x7f]/ }
175             # this does not work for unicode strings
176             # from https://www.perlmonks.org/?node_id=958679
177             # and https://www.perlmonks.org/?node_id=791677
178             #sub isnum ($) {
179             # return 0 if $_[0] eq '';
180             # $_[0] & ~$_[0] ? 0 : 1
181             #}
182             1;
183              
184             =pod
185              
186             =encoding utf8
187              
188             =head1 NAME
189              
190             Data::Random::Structure::UTF8 - Produce nested data structures with unicode keys, values, elements.
191              
192             =head1 VERSION
193              
194             Version 0.04
195              
196             =head1 SYNOPSIS
197              
198             This module produces random, arbitrarily deep and long,
199             nested Perl data structures with unicode content for the
200             keys, values and/or array elements. Content can be forced
201             to be exclusively strings and exclusively unicode. Or
202             the strings can be unicode. Or anything goes, mixed
203             unicode and non-unicode strings as well as integers, floats, etc.
204              
205             This is an object-oriented module
206             which inherits from
207             L and extends its functionality by
208             providing for unicode keys and values for hashtables and
209             unicode content for array elements or scalars, randomly mixed with the
210             usual repertoire of L, which is
211             non-unicode strings,
212             numerical, boolean values and the assorted entourage to the court
213             of Emperor Computer, post-Turing.
214              
215             For example, it produces these:
216              
217             =over 4
218              
219             =item unicode scalars: e.g. C<"αβγ">,
220              
221             =item mixed arrays: e.g. C<["αβγ", "123", "xyz"]>
222              
223             =item hashtables with some/all keys and/or values as unicode: e.g.
224             C<{"αβγ" => "123", "xyz" => "αβγ"}>
225              
226             =item exclusive unicode arrays or hashtables: e.g. C<["αβγ", "χψζ"]>
227              
228             =back
229              
230             This is accomplised by adding an extra
231             type C (invisible to the user) and the
232             respective generator method. All these are invisible to the user
233             which will get the old functionality plus some (or maybe none
234             because this is a random process which does not eliminate non-unicode
235             strings, at the moment) unicode strings.
236              
237             use Data::Random::Structure::UTF8;
238              
239             my $randomiser = Data::Random::Structure::UTF8->new(
240             'max_depth' => 5,
241             'max_elements' => 20,
242             # all the strings produced (keys, values, elements)
243             # will be unicode strings
244             'only-unicode' => 1,
245             # all the strings produced (keys, values, elements)
246             # will be a mixture of unicode and non-unicode
247             # this is the default behaviour
248             #'only-unicode' => 0,
249             # only unicode strings will be produced for (keys, values, elements),
250             # there will be no numbers, no bool, only unicode strings
251             #'only-unicode' => 2,
252             );
253             my $perl_var = $randomiser->generate() or die;
254             print pp($perl_var);
255              
256             # which prints the usual escape mess of Dump and Dumper
257             [
258             "\x{7D5A}\x{4EC1}",
259             "\x{E6E2}\x{75A4}",
260             329076,
261             0.255759160148987,
262             [
263             "TEb97qJt",
264             1,
265             "_ow|J\@~=6%*N;52?W3Y\$S1",
266             {
267             "x{75A4}x{75A4}" => 123,
268             "123" => "\x{7D5A}\x{4EC1}",
269             "xyz" => [1, 2, "\x{7D5A}\x{4EC1}"],
270             },
271             ],
272              
273             # can control the scalar type (for keys, values, items) on the fly
274             # this produces unicode strings in addition to
275             # Data::Random::Structure's usual repertoire:
276             # non-unicode-string, numbers, bool, integer, float, etc.
277             # (see there for the list)
278             $randomiser->only_unicode(0); # the default: anything plus unicode strings
279             print $randomiser->only_unicode();
280              
281             # this produces unicode strings in addition to
282             # Data::Random::Structure's usual repertoire:
283             # numbers, bool, integer, float, etc.
284             # (see there for the list)
285             # EXCEPT non-unicode-strings, (all strings will be unicode)
286             $randomiser->only_unicode(1);
287             print $randomiser->only_unicode();
288              
289             # this produces unicode strings ONLY
290             # Data::Random::Structure's usual repertoire does not apply
291             # there will be no numbers, no bool, no integer, no float, no nothing
292             $randomiser->only_unicode(2);
293             print $randomiser->only_unicode();
294              
295             =head1 METHODS
296              
297             This is an object oriented module which has exactly the same API as
298             L.
299              
300             =head2 C
301              
302             Constructor. In addition to L API, it
303             takes parameter C<< 'only-unicode' >> with a valid value of 0, 1 or 2.
304             Default is 0.
305              
306             =over 4
307              
308             =item 0 : keys, values, elements of the produced data structure will be
309             a mixture of unicode strings, plus L's full
310             repertoire which includes non-unicode strings, integers, floats etc.
311              
312             =item 1 : keys, values, elements of the produced data structure will be
313             a mixture of unicode strings, plus L's full
314             repertoire except non-unicode strings. That is, all strings will be
315             unicode. But there will possibly be integers etc.
316              
317             =item 2 : keys, values, elements of the produced data structure will be
318             only unicode strings. Nothing of L's
319             repertoire applies. Only unicode strings, no integers, no nothing.
320              
321             =back
322              
323             Controlling the scalar data types can also be done on the fly, after
324             the object has been created using L
325             method.
326              
327             Additionally, L's API reports that
328             the constructor takes 2 optional arguments, C and C.
329             See L for up-to-date, official information.
330              
331             =head2 C
332              
333             Controls what scalar types to be included in the nested
334             data structures generated. With no parameters it returns back
335             the current setting. Otherwise, valid input parameters and their
336             meanings are listed in L
337              
338             =head2 C
339              
340             Generate a nested data structure according to the specification
341             set in the constructor. See L for
342             all options. This method is not overriden by this module.
343              
344             It returns the Perl data structure as a reference.
345              
346             =head2 C
347              
348             Generate a scalar which may contain unicode content.
349             See L for
350             all options. This method is overriden by this module but
351             calls the parent's too.
352              
353             It returns a Perl string.
354              
355             =head2 C
356              
357             Generate an array with random, possibly unicode, content.
358             See L for
359             all options. This method is not overriden by this module.
360              
361             It returns the Perl array as a reference.
362              
363             =head2 C
364              
365             Generate an array with random, possibly unicode, content.
366             See L for
367             all options. This method is not overriden by this module.
368              
369             It returns the Perl array as a reference.
370              
371             =head2 C
372              
373             Return a random unicode character, guaranteed to be valid.
374             This is a very simple method which selects characters
375             from some pre-set code pages (Greek, Cyrillic, Cherokee,
376             Ethiopic, Javanese) with equal probability.
377             These pages and ranges were selected so that there are
378             no "holes" between them which would produce an invalid
379             character. Therefore, not all characters from the
380             particular code page will be produced.
381              
382             Returns a random unicode character guaranteed to be valid.
383              
384             =head2 C
385              
386             my $ret = random_chars_UTF8($optional_paramshash)
387              
388             Arguments:
389              
390             =over 4
391              
392             =item * C<$optional_paramshash> : can contain
393              
394             =over 4
395              
396             =item C<'min'> sets the minimum length of the random sequence to be returned, default is 6
397              
398             =item C<'max'> sets the maximum length of the random sequence to be returned, default is 32
399              
400             =back
401              
402             =back
403              
404             Return a random unicode-only string optionally specifying
405             minimum and maximum length. See L
406             for the range of characters it returns. The returned string
407             is unicode and is guaranteed all its characters are valid.
408              
409             =head2 C
410              
411             my $ret = check_content_recursively($perl_var, $paramshashref)
412              
413             Arguments:
414              
415             =over 4
416              
417             =item * C<$perl_var> : a Perl variable containing an arbitrarily nested data structure
418              
419             =item * C<$paramshashref> : can contain one or more of the following keys:
420              
421             =over 4
422              
423             =item C<'numbers'> set it to 1 to look for numbers (possibly among other things).
424             If set to 1 and a number C<123> or C<"123"> is found, this sub returns 1.
425              
426             =item C<'strings-unicode'> set it to 1 to look for unicode strings (possibly among other things).
427             The definition of "unicode string" is that at least one its characters is unicode.
428             If set to 1 and a "unicode string" is found, this sub returns 1.
429              
430             =item C<'strings-plain'> set it to 1 to look for plain strings (possibly among other things).
431             The definition of "plain string" is that none of its characters is unicode.
432             If set to 1 and a "plain string" is found, this sub returns 1.
433              
434             =item C<'strings'> set it to 1 to look for plain or unicode strings (possibly among other things).
435             If set to 1 and a "plain string" or "unicode string" is found, this sub returns 1. Basically,
436             it returns 1 when a string is found (as opposed to a "number").
437              
438             =back
439              
440             =back
441              
442             Return value: 1 or 0 depending what was looking for was found.
443              
444             This is not an object-oriented method. It is called thously:
445              
446             if( Data::Random::Structure::UTF8::check_content_recursively(
447             {'abc'=>123, 'xyz'=>[1,2,3]},
448             {
449             'numbers' => 1,
450             }
451             ) ){ print "data structure contains numbers\n" }
452              
453             CAVEAT: as its name suggests, this is a recursive function. Beware
454             of extremely deep data structures. Deep, not long. If you do get
455             C<<"Deep recursion..." warnings>>, and you do insist to go ahead,
456             this will remove the warnings (but are you sure?):
457             {
458             no warnings 'recursion';
459             if( Data::Random::Structure::UTF8::check_content_recursively(
460             {'abc'=>123, 'xyz'=>[1,2,3]},
461             {
462             'numbers' => 1,
463             }
464             ) ){ print "data structure contains numbers\n" }
465             }
466              
467             =head1 SEE ALSO
468              
469             =over 4
470              
471             =item The parent class L.
472              
473             =item L for stringifying possibly-unicode Perl data structures.
474              
475             =back
476              
477             =head1 AUTHOR
478              
479             Andreas Hadjiprocopis, C<< >>
480              
481             =head1 BUGS
482              
483             Please report any bugs or feature requests to C, or through
484             the web interface at L. I will be notified, and then you'll
485             automatically be notified of progress on your bug as I make changes.
486              
487             =head1 CAVEATS
488              
489             There are two issues users should know about.
490              
491             The first issue is that the unicode produced can make
492             L to complain with
493              
494             Operation "lc" returns its argument for UTF-16 surrogate U+DA4B at /usr/local/share/perl5/Data/Dump.pm line 302.
495              
496             This, I have found, can be fixed with the following workaround (from L, thank you):
497              
498             # Suppress `Operation "lc" returns its argument for UTF-16 surrogate 0xNNNN` warning
499             # for the `lc()` call below; use 'utf8' instead of a more appropriate 'surrogate' pragma
500             # since the latter is not available in until Perl 5.14
501             no warnings 'utf8';
502              
503             The second issue is that this class inherits from L
504             and relies on it complaining about not being able to handle certain types
505             which are our own extensions (the C extension). We have
506             no way to know that except from catching its C'ing and parsing it
507             with the following code
508              
509             my $rc = eval { $self->SUPER::generate_scalar(@_) };
510             if( $@ || ! defined($rc) ){
511             # parent doesn't know what to do, can we handle this?
512             if( $@ !~ /how to generate (.+?)\R/ ){ ... ... }
513             else { print "type is $1" }
514             ...
515              
516             in order to extract the C which can not be handled
517             and handle it ourselves. So whenever the parent class (L)
518             changes its C song, we will have to adopt this code
519             accordingly (in L).
520             For the moment, I have placed a catch-all, fall-back condition
521             to handle this but it will be called for all kind of types
522             and not only the types we have added.
523              
524             So, this issue is not going to make the module die but may make it
525             to skew the random results in favour of unicode strings (which
526             is the fallback, default action when can't parse the type).
527              
528             =head1 SUPPORT
529              
530             You can find documentation for this module with the perldoc command.
531              
532             perldoc Data::Random::Structure::UTF8
533              
534              
535             You can also look for information at:
536              
537             =over 4
538              
539             =item * RT: CPAN's request tracker (report bugs here)
540              
541             L
542              
543             =item * AnnoCPAN: Annotated CPAN documentation
544              
545             L
546              
547             =item * CPAN Ratings
548              
549             L
550              
551             =item * Search CPAN
552              
553             L
554              
555             =back
556              
557             =head1 SEE ALSO
558              
559             =over 4
560              
561             =item L
562              
563             =back
564              
565             =head1 ACKNOWLEDGEMENTS
566              
567             Mark Allen who created L which is our parent class.
568              
569             =head1 DEDICATIONS AND HUGS
570              
571             !Almaz!
572              
573             =head1 LICENSE AND COPYRIGHT
574              
575             This software is Copyright (c) 2020 by Andreas Hadjiprocopis.
576              
577             This is free software, licensed under:
578              
579             The Artistic License 2.0 (GPL Compatible)
580              
581             =cut