File Coverage

blib/lib/String/Copyright.pm
Criterion Covered Total %
statement 78 88 88.6
branch 26 38 68.4
condition 25 40 62.5
subroutine 17 19 89.4
pod 0 2 0.0
total 146 187 78.0


line stmt bran cond sub pod time code
1 7     7   1647564 use 5.008001;
  7         84  
2 7     7   42 use strict;
  7         16  
  7         141  
3 7     7   31 use warnings;
  7         23  
  7         160  
4 7     7   34 use utf8;
  7         13  
  7         37  
5 7     7   220 use re (qw/eval/);
  7         15  
  7         748  
6              
7             my $CAN_RE2;
8              
9             BEGIN {
10 7     7   31 eval { require re::engine::RE2 };
  7         995  
11 7 50       783 $CAN_RE2 = $@ ? '' : 1;
12             }
13              
14             package String::Copyright;
15              
16             =encoding UTF-8
17              
18             =head1 NAME
19              
20             String::Copyright - Representation of text-based copyright statements
21              
22             =head1 VERSION
23              
24             Version 0.003014
25              
26             =cut
27              
28             our $VERSION = '0.003014';
29              
30             # Dependencies
31 7     7   3260 use parent 'Exporter::Tiny';
  7         2748  
  7         38  
32 7     7   28633 use Carp ();
  7         17  
  7         142  
33 7     7   4098 use Set::IntSpan;
  7         72437  
  7         545  
34              
35             our @EXPORT = qw/copyright/;
36              
37             use constant {
38 7         593 PLAINTEXT => 0,
39             BLOCKS => 1,
40             FORMAT => 2,
41 7     7   61 };
  7         17  
42              
43             use overload (
44 7         52 q{""} => '_compose',
45             fallback => 1,
46 7     7   46 );
  7         15  
47              
48             =head1 SYNOPSIS
49              
50             use String::Copyright;
51              
52             my $copyright = copyright(<<'END');
53             copr. © 1999,2000 Foo Barbaz and Acme Corp.
54             Copyright (c) 2001,2004 Foo (work address)
55             Copyright 2003, Foo B. and friends
56             © 2000, 2002 Foo Barbaz
57             END
58              
59             print $copyright;
60              
61             # Copyright 1999-2000 Foo Barbaz and Acme Corp.
62             # Copyright 2000, 2002 Foo Barbaz and Acme Corp.
63             # Copyright 2001, 2004 Foo (work address)
64             # Copyright 2003 Foo B. and friends
65              
66             =head1 DESCRIPTION
67              
68             L identifies copyright statements in a string
69             and serializes them in a normalized format.
70              
71             =head1 OPTIONS
72              
73             Options can be set as an argument to the 'use' statement.
74              
75             =head2 threshold, threshold_before, threshold_after
76              
77             use String::Copyright { threshold_after => 5 };
78              
79             Stop parsing after this many lines without copyright information,
80             before or after having found any copyright information at all.
81             C sets both C and C.
82              
83             By default unset: All lines are parsed.
84              
85             =head2 format( \&sub )
86              
87             use String::Copyright { format => \&GNU_style } };
88              
89             sub GNU_style {
90             my ( $years, $owners ) = @_;
91              
92             return 'Copyright (C) ' . join ' ', $years || '', $owners || '';
93             }
94              
95             =head1 FUNCTIONS
96              
97             Exports one function: C.
98             This module uses L to export functions,
99             which allows for flexible import options;
100             see the L documentation for details.
101              
102             =cut
103              
104             # OR'ed strings have regular variable name and are already grouped
105             # AND'ed strings have name ending in underscore: must be grouped if repeated
106             my $blank = '[ \t]';
107             my $blank_or_break_ = "$blank*\\n?$blank*";
108             my $dash = '[-˗‐‑‒–—―⁃−﹣-]';
109             my $colons_ = "$blank?:{1,2}";
110             my $strictlabel = 'SPDX-FileCopyrightText:';
111             my $label = '(?i:copyright(?:-holders?)?\b|copr\.)';
112             my $sign = '[©⒞Ⓒⓒ🄒🄫🅒]';
113             my $nroff_sign_ = '\\\\[(]co';
114             my $pseudo_sign_ = '[({][Cc][})]';
115             my $vague_sign_ = '-[Cc]-';
116             my $broken_sign_ = "\\?$blank*";
117              
118             # high-bit © noise, caused by misparsing UTF-8 as latin1
119             # except \xAE (latin1 ©), \xAE (MacRoman ©), \xE2 (latin1 © lowercased after misparse)
120             my $nonsign_ = '[\x80-\xAB\xAD-\xC1\xC3-\xE1\xE3-\xFF]\xA9';
121             my $nonidentifier_
122             = "(?:no |_|$dash)copyright|copyright-[^h]|(?:Digital Millennium|U.S.|US|United States) Copyright Act|\\b(?:for|we) copyright\\b";
123              
124             # this should cause *no* false positives, and stop-chars therefore
125             # exclude e.g. email address building blocks; tested against the code
126             # corpus at https://codesearch.debian.net/ (tricky: its RE2 engine lacks
127             # support for negative groups) using searches like these:
128             # (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\w
129             # (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\b[-_@]
130             # (?im)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)[^ $]
131             my $identifier_action
132             = '(?i:apply|applied|applies|assigned|generated|transfer|transferred)';
133             my $identifier_thing_
134             = '(?i:block|claim|date|disclaimer|holder|info|information|interest|law|license|notice|owner|ownership|permission|sign|statement|string|symbol|tag|text)s?';
135             my $identifier_misc
136             = "(?i:and|are|at|eq|for|if|in|is|of|on|or|,${blank}patent|this|to|the (?:library|software),|treaty)";
137             my $identifier_chatter
138             = "(?:$identifier_action|$identifier_thing_|$identifier_misc)";
139             my $the_notname
140             = '(?i:concrete|fault|first|immediately|least|min\/max|one|outer|previous|ratio|sum|user)';
141             my $the_sentence_
142             = "(?:\\w+$blank+){1,10}(?i:are|can(?:not)?|in|is|must|was)";
143             my $pseudosign_chatter_
144             = "(?:(?:the$blank+(?:$the_notname|$the_sentence_)|all begin|there|you must)\\b|,? \\(?\\w\\))";
145             my $chatter
146             = "(?im:$nonsign_|$nonidentifier_|copyright$blank_or_break_$identifier_chatter(?:\\z|@\\W|[^a-zA-Z0-9@_-])|$blank*$pseudo_sign_(?:$blank_or_break_)+$pseudosign_chatter_)";
147             my $nonyears_ = '\W?(?i:year|19[xy]{2}|[xy]{4})\W?';
148              
149             my $year_ = '\b[0-9]{4}\b';
150             my $comma_spacy = "(?:$blank*,$blank_or_break_|$blank_or_break_,?$blank*)";
151             my $dash_spacy_ = "$blank*$dash(?:$blank_or_break_)*";
152              
153             my $colon_or_dash = "(?:$colons_$blank_or_break_|$blank?$dash\{1,2}$blank)";
154             my $delimiter = "(?:$colon_or_dash|$comma_spacy)";
155              
156             my $vague_year_ = "(?:$dash$blank?)?[0-9]{1,5}";
157             my $owner_intro_
158             = "(?:$colon_or_dash|$pseudo_sign_$blank?|\\bby$blank_or_break_)";
159             my $owner_prefix = '[(*<@\[{]';
160             my $owner_initial = '[^\s!"#$%&\'()*+,./:;<=>?@[\\\\\]^_`{|}~-]';
161              
162             my $signs
163             = "(?m:$strictlabel$blank*|(?:$label|$sign|$nroff_sign_|(?:^|$blank)$pseudo_sign_)(?:$colon_or_dash?$blank*(?:$label|$sign|$pseudo_sign_))*)";
164             my $yearspan_ = "$year_(?:$dash_spacy_$year_)?";
165             my $years_ = "$yearspan_(?:$comma_spacy$yearspan_)*";
166             my $owners_
167             = "(?:$vague_year_|$owner_prefix*$owner_initial\\S*)(?:$blank*\\S+)*";
168              
169             # compile regexps in isolation to limit use of RE2 engine
170             my ($dash_spacy_re, $owner_intro_A_re, $boilerplate_X_re,
171             $signs_and_more_re
172             );
173             {
174 7 50   7   11544 BEGIN { re::engine::RE2->import( -strict => 1 ) if ($CAN_RE2) }
175             $dash_spacy_re = qr/$dash_spacy_/;
176             $owner_intro_A_re = qr/^$owner_intro_/;
177             $boilerplate_X_re
178             = qr/(?i)${comma_spacy}All$blank+Rights$blank+Reserved[.!]?.*/;
179             $signs_and_more_re
180             = qr/$chatter|$signs(?:$blank$vague_sign_)?$delimiter(?:$broken_sign_)?(?:$nonyears_|((?:$years_$delimiter)?(?:(?:$owner_intro_)?$owners_)?))|\n/;
181             }
182              
183             sub _generate_copyright
184             {
185 13     13   2130 my ( $class, $name, $args, $globals ) = @_;
186              
187             return sub {
188 185     185   47561 my $copyright = shift;
189              
190             Carp::croak("String::Copyright strings require defined parts")
191 185 50       440 unless 1 + @_ == grep {defined} $copyright, @_;
  185         748  
192              
193             # String::Copyright objects are effectively immutable and can be reused
194 185 50 33     857 if ( !@_ && ref($copyright) eq __PACKAGE__ ) {
195 0         0 return $copyright;
196             }
197              
198             # stringify objects
199 185         383 $copyright = "$copyright";
200              
201             # TODO: also parse @_ - but each separately!
202 185         291 my @block;
203 185         292 my $skipped = 0;
204 185         4436 while ( $copyright =~ /$signs_and_more_re/g ) {
205 268         735 my $owners = $1;
206 268 100 66     1028 if ( $globals->{threshold_before} || $globals->{threshold} ) {
207             last
208             if (
209             !@block
210             and !length $owners
211             and ++$skipped >= (
212             $globals->{threshold_before} || $globals->{threshold}
213             )
214 24 100 100     89 );
      33        
      100        
215             }
216 266 100 66     850 if ( $globals->{threshold_after} || $globals->{threshold} ) {
217              
218             # "after" detects end of _current_ line so is skewed by one
219             last
220             if (
221             @block
222             and !length $owners
223             and ++$skipped >= 1 + (
224             $globals->{threshold_after} || $globals->{threshold}
225             )
226 14 100 100     66 );
      33        
      100        
227             }
228 264 100       2031 next if ( !length $owners );
229 121         178 $skipped = 0;
230              
231 121         179 my $years;
232 121         1206 my @span = $owners =~ /\G($yearspan_)(?:$comma_spacy|\Z)/gm;
233 121 100       327 if (@span) {
234 47         116 $owners = $';
235              
236             # deduplicate
237 47         76 my @ranges;
238 47         103 for (@span) {
239 74         342 my ( $y1, $y2 ) = split /$dash_spacy_re/;
240 74 100       213 if ( !$y2 ) {
    100          
241 59         153 push @ranges, $y1;
242             }
243             elsif ( $y1 > $y2 ) {
244 1         8 push @ranges, [ $y2, $y1 ];
245             }
246             else {
247 14         57 push @ranges, [ $y1, $y2 ];
248             }
249             }
250              
251             # normalize
252             $years = join ', ',
253 47 100       233 map { $_->[0] == $_->[1] ? $_->[0] : "$_->[0]-$_->[1]" }
  51         5268  
254             Set::IntSpan->new( \@ranges )->spans;
255             }
256 121 100       410 if ($owners) {
257 115         561 $owners =~ s/$owner_intro_A_re//;
258 115         275 $owners =~ s/\s{2,}/ /g;
259 115         365 $owners =~ s/$owner_intro_A_re//;
260 115         352 $owners =~ s/$boilerplate_X_re//g;
261             }
262              
263             # split owner into owner_id and owner
264              
265 121   100     1022 push @block, [ $years || undef, $owners || undef ];
      100        
266             }
267              
268             # TODO: save $skipped_lines to indicate how dirty parsing was
269              
270 185         414 my $ext_format = $globals->{format};
271             my $format
272             = $globals->{format}
273 110     110   298 ? sub { $ext_format->( $_->[0], $_->[1] ) }
274 185 100 33 25   937 : sub { join ' ', '©', $_->[0] || (), $_->[1] || () };
  25   33     138  
275              
276 185         1756 bless [ $copyright, \@block, $format ], __PACKAGE__;
277             }
278 13         118 }
279              
280             sub new
281             {
282 0     0 0 0 my ( $self, @data ) = @_;
283             Carp::croak("String::Copyright require defined, positive-length parts")
284 0 0       0 unless 1 + @_ == grep { defined && length } @data;
  0 0       0  
285              
286             # String::Copyright objects are simply stripped of their string part
287 0 0 0     0 if ( !@_ && ref($self) eq __PACKAGE__ ) {
288 0         0 return bless [ undef, $data[1] ], __PACKAGE__;
289             }
290              
291             # FIXME: properly validate data
292             Carp::croak("String::Copyright blocks must be an array of strings")
293 0 0       0 unless @_ == grep { ref eq 'ARRAY' } @data;
  0         0  
294              
295 0         0 bless [ undef, \@data ], __PACKAGE__;
296             }
297              
298             sub _compose
299             {
300 199     199   46648 my $format = $_[0]->[FORMAT];
301 199         334 join "\n", map {&$format} @{ $_[0]->[BLOCKS] };
  135         272  
  199         743  
302             }
303              
304 0     0 0   sub is_normalized { !defined $_[0]->[PLAINTEXT] }
305              
306             =head1 SEE ALSO
307              
308             =over 4
309              
310             =item *
311              
312             L
313              
314             =item *
315              
316             L
317              
318             =back
319              
320             =head1 BUGS/CAVEATS/etc
321              
322             L operates on strings, not bytes.
323             Data encoded as UTF-8, Latin1 or other formats
324             need to be decoded to strings before use.
325              
326             Only ASCII characters and B<©> (copyright sign) are directly processed.
327              
328             If copyright sign is not detected
329             or accents or multi-byte characters display wrong,
330             then most likely the data was not decoded into a string.
331              
332             If ranges or lists of years are not tidied,
333             then maybe it contained non-ASCII whitespace or digits.
334              
335             =head1 AUTHOR
336              
337             Jonas Smedegaard C<< >>
338              
339             =head1 COPYRIGHT AND LICENSE
340              
341             This program is based on the script "licensecheck" from the KDE SDK,
342             originally introduced by Stefan Westerfeld C<< >>.
343              
344             Copyright © 2007, 2008 Adam D. Barratt
345              
346             Copyright © 2005-2012, 2016, 2018, 2020-2021 Jonas Smedegaard
347              
348             Copyright © 2018, 2020-2021 Purism SPC
349              
350             This program is free software:
351             you can redistribute it and/or modify it
352             under the terms of the GNU Affero General Public License
353             as published by the Free Software Foundation,
354             either version 3, or (at your option) any later version.
355              
356             This program is distributed in the hope that it will be useful,
357             but WITHOUT ANY WARRANTY;
358             without even the implied warranty
359             of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
360             See the GNU Affero General Public License for more details.
361              
362             You should have received a copy
363             of the GNU Affero General Public License along with this program.
364             If not, see .
365              
366             =cut
367              
368             1;