File Coverage

blib/lib/WebFetch/Input/SiteNews.pm
Criterion Covered Total %
statement 200 213 93.9
branch 69 102 67.6
condition 20 36 55.5
subroutine 22 23 95.6
pod 1 9 11.1
total 312 383 81.4


line stmt bran cond sub pod time code
1             # WebFetch::Input::SiteNews
2             # ABSTRACT: download and save SiteNews headlines from a local file
3             #
4             # Copyright (c) 1998-2022 Ian Kluft. This program is free software; you can
5             # redistribute it and/or modify it under the terms of the GNU General Public
6             # License Version 3. See https://www.gnu.org/licenses/gpl-3.0-standalone.html
7              
8             # pragmas to silence some warnings from Perl::Critic
9             ## no critic (Modules::RequireExplicitPackage)
10             # This solves a catch-22 where parts of Perl::Critic want both package and use-strict to be first
11 3     3   2077 use strict;
  3         13  
  3         98  
12 3     3   15 use warnings;
  3         17  
  3         76  
13 3     3   14 use utf8;
  3         7  
  3         22  
14             ## use critic (Modules::RequireExplicitPackage)
15              
16             package WebFetch::Input::SiteNews;
17             $WebFetch::Input::SiteNews::VERSION = '0.15.8';
18 3     3   185 use base "WebFetch";
  3         6  
  3         285  
19              
20 3     3   19 use Carp;
  3         6  
  3         204  
21 3     3   23 use Readonly;
  3         16  
  3         186  
22 3     3   28 use Scalar::Util qw(reftype);
  3         8  
  3         146  
23 3     3   20 use DateTime;
  3         13  
  3         97  
24 3     3   18 use DateTime::Format::ISO8601;
  3         15  
  3         8802  
25              
26             # set defaults
27             my ( $cat_priorities, $now );
28             Readonly::Array my @Options => ( "short_path|short=s", "long_path|long=s", );
29             Readonly::Scalar my $Usage => "--short short-output-file --long long-output-file";
30              
31             # configuration parameters
32             Readonly::Scalar my $num_links => 5;
33             Readonly::Array my @dt_keys => qw(locale time_zone);
34              
35             # functions for access to internal data for testing
36             ## no critic (Subroutines::ProhibitUnusedPrivateSubroutines)
37             sub _config_params
38             {
39 4     4   47 return { Options => \@Options, Usage => $Usage, num_links => $num_links };
40             }
41 0     0   0 sub _cat_priorities { return $cat_priorities; }
42             ## critic (Subroutines::ProhibitUnusedPrivateSubroutines)
43              
44             # no user-servicable parts beyond this point
45              
46             # register capabilities with WebFetch
47             __PACKAGE__->module_register( _config_params(), "cmdline", "input:sitenews", "output:sitenews_long" );
48              
49             # constants for state names
50 299     299 0 602 sub initial_state { return 0; }
51 184     184 0 410 sub attr_state { return 1; }
52 234     234 0 559 sub text_state { return 2; }
53              
54             # fetch() is the WebFetch API call point to run this module
55             sub fetch
56             {
57 2     2 1 15 my ($self) = @_;
58              
59             # set parameters for WebFetch routines
60 2 50       8 if ( not defined $self->{num_links} ) {
61 2         8 $self->set_param( "num_links", WebFetch->config("num_links") );
62             }
63 2         7 $self->set_param( "para", 1 );
64              
65             # set up Webfetch Embedding API data
66 2         6 $self->{actions} = {};
67 2         11 $self->data->add_fields( "date", "title", "priority", "expired", "position", "label", "url", "category", "text" );
68              
69             # defined which fields match to which "well-known field names"
70 2         8 $self->data->add_wk_names(
71             "title" => "title",
72             "url" => "url",
73             "date" => "date",
74             "summary" => "text",
75             "category" => "category"
76             );
77              
78             # process the links
79              
80             # get local time for various date comparisons
81 2         4 my %dt_opts;
82 2 50 33     7 if ( exists $self->{datetime_settings} and reftype( $self->{datetime_settings} ) eq "HASH" ) {
83 0         0 %dt_opts = %{ $self->{datetime_settings} }; # get locale and time_zone settings if available
  0         0  
84             }
85 2 50       7 if ( exists $self->{testing_faketime} ) {
86              
87             # use a pre-specified timestamp for testing purposes so news elements are same age as expected result
88 2         23 $now = DateTime::Format::ISO8601->parse_datetime( $self->{testing_faketime}, %dt_opts );
89 2 50       1780 if ( exists $dt_opts{locale} ) {
90 0         0 $now->set_locale( $dt_opts{locale} );
91             }
92 2 50       8 if ( exists $dt_opts{time_zone} ) {
93 0         0 $now->set_time_zone( $dt_opts{time_zone} );
94             }
95             } else {
96 0         0 $now = DateTime->now(%dt_opts);
97             }
98              
99             # parse data file(s)
100 2         5 my @sources;
101 2 50 33     9 if ( ( exists $self->{sources} ) and ( ref $self->{sources} eq "ARRAY" ) ) {
102 0         0 @sources = @{ $self->{sources} };
  0         0  
103             }
104 2 50       6 if ( exists $self->{source} ) {
105 2         7 push @sources, $self->{source};
106             }
107 2         5 foreach my $source (@sources) {
108 2         7 $self->parse_input($source);
109             }
110              
111             # set parameters for the short news format
112 2 50       9 if ( defined $self->{short_path} ) {
113              
114             # create the HTML actions list
115 2         9 $self->{actions}{html} = [];
116              
117             # create the HTML-generation parameters
118 2         5 my $params = {};
119 2         5 $params = {};
120             $params->{sort_func} = sub {
121 48     48   77 my ( $a, $b ) = @_;
122              
123             # sort/compare news entries for the short display
124             # sorting priority:
125             # expiration status first (expired items last)
126             # priority second (category/age combo)
127             # label third (chronological order)
128              
129             # check expirations first
130 48         88 my $exp_fnum = $self->fname2fnum("expired");
131 48 50 33     108 ( $a->[$exp_fnum] and not $b->[$exp_fnum] ) and return 1;
132 48 50 33     188 ( not $a->[$exp_fnum] and $b->[$exp_fnum] ) and return -1;
133              
134             # compare priority - posting category w/ age penalty
135 48         104 my $pri_fnum = $self->fname2fnum("priority");
136 48 100       120 if ( $a->[$pri_fnum] != $b->[$pri_fnum] ) {
137 18         58 return $a->[$pri_fnum] <=> $b->[$pri_fnum];
138             }
139              
140             # otherwise sort by label (chronological order)
141 30         62 my $lbl_fnum = $self->fname2fnum("label");
142 30         76 return $a->[$lbl_fnum] cmp $b->[$lbl_fnum];
143 2         10 };
144             $params->{filter_func} = sub {
145              
146             # filter: skip expired items
147 33     33   70 my $exp_fnum = $self->fname2fnum("expired");
148 33         95 return not $_[$exp_fnum];
149 2         9 };
150             $params->{format_func} = sub {
151              
152             # generate HTML text
153 10     10   25 my $txt_fnum = $self->fname2fnum("text");
154 10         28 my $pri_fnum = $self->fname2fnum("priority");
155 10         52 return $_[$txt_fnum] . "\n<!--- priority " . $_[$pri_fnum] . " --->";
156 2         11 };
157              
158             # put parameters for fmt_handler_html() on the html list
159 2         4 push @{ $self->{actions}{html} }, [ $self->{short_path}, $params ];
  2         8  
160             }
161              
162             # set parameters for the long news format
163 2 50       6 if ( defined $self->{long_path} ) {
164              
165             # create the SiteNews-specific action list
166             # It will use WebFetch::Input::SiteNews::fmt_handler_sitenews_long()
167             # which is defined in this file
168 2         6 $self->{actions}{sitenews_long} = [];
169              
170             # put parameters for fmt_handler_sitenews_long() on the list
171 2         3 push @{ $self->{actions}{sitenews_long} }, [ $self->{long_path} ];
  2         5  
172             }
173 2         7 return;
174             }
175              
176             # inner portion of input parsing - called by parse_input()
177             # read SiteNews text file and parse it into news items to return to caller
178             sub parse_input_inner
179             {
180 2     2 0 5 my ( $self, $news_data_fd ) = @_;
181 2         5 my @news_items;
182 2         4 my $position = 0;
183 2         6 my $state = initial_state; # before first entry
184 2         6 my ($current);
185 2         9 $cat_priorities = {}; # priorities for sorting
186 2         84 while (<$news_data_fd>) {
187 233         358 chomp;
188 233 100       518 /^\s*\#/x and next; # skip comments
189 228 100       623 /^\s*$/x and next; # skip blank lines
190              
191 192 100       419 if (/^[^\s]/x) {
192              
193             # found attribute line
194 107 100       206 if ( $state == initial_state ) {
195 10 100       46 if (/^categories:\s*(.*)/x) {
    100          
196 2         15 my @cats = split( ' ', $1 );
197 2         5 my ($i);
198 2         6 $cat_priorities->{"default"} = 999;
199 2         9 for ( $i = 0 ; $i <= $#cats ; $i++ ) {
200 8         45 $cat_priorities->{ $cats[$i] } = $i + 1;
201             }
202 2         17 next;
203             } elsif (/^(\w+):\s*(.*)/x) {
204 6         22 $self->set_param( $1, $2 );
205             }
206             }
207 105 100 100     168 if ( $state == initial_state or $state == text_state ) {
    50          
208              
209             # found first attribute of a new entry
210 39 100       154 if (/^([^=]+)=(.*)/x) {
211 33         53 $current = {};
212 33         81 $current->{position} = $position++;
213 33         87 $current->{$1} = $2;
214 33         59 push( @news_items, $current );
215 33         48 $state = attr_state;
216             }
217             } elsif ( $state == attr_state ) {
218              
219             # found a followup attribute
220 66 50       200 if (/^([^=]+)=(.*)/x) {
221 66         349 $current->{$1} = $2;
222             }
223             }
224             } else {
225              
226             # found text line
227 85 50 66     127 if ( $state == initial_state ) {
    50          
228              
229             # cannot accept text before any attributes
230 0         0 next;
231             } elsif ( $state == attr_state or $state == text_state ) {
232 85 100       146 if ( defined $current->{text} ) {
233 52         126 $current->{text} .= "\n$_";
234             } else {
235 33         65 $current->{text} = $_;
236             }
237 85         131 $state = text_state;
238             }
239             }
240             }
241 2         16 return @news_items;
242             }
243              
244             # parse input file
245             sub parse_input
246             {
247 2     2 0 9 my ( $self, $input ) = @_;
248              
249             # parse data file
250 2         5 my $news_data;
251 2 50       123 if ( not open( $news_data, "<", $input ) ) {
252 0         0 croak "$0: failed to open $input: $!\n";
253             }
254 2         14 my @news_items = $self->parse_input_inner($news_data);
255 2         38 close $news_data;
256              
257             # translate parsed news into the WebFetch Embedding API data table
258 2         7 my ( %label_hash, $pos );
259 2         5 $pos = 0;
260 2         7 foreach my $item (@news_items) {
261              
262             # collect fields for the data record
263 33 50       93 my $title = ( defined $item->{title} ) ? $item->{title} : "";
264 33 50       102 my $posted = ( defined $item->{posted} ) ? $item->{posted} : "";
265 33 50       69 my $category = ( defined $item->{category} ) ? $item->{category} : "";
266 33 50       127 my $text = ( defined $item->{text} ) ? $item->{text} : "";
267             my $url_prefix =
268 33 50       104 ( defined $self->{url_prefix} ) ? $self->{url_prefix} : "";
269              
270             # timestamp processing using optional locale and time_zone from WebFetch object's datetime_settings hash
271             # This is usually set by SiteNews file's global settings at the top
272 33         60 my ( %dt_opts, $time_str, $anchor_time );
273 33 50 33     192 if ( exists $self->{datetime_settings} and reftype $self->{datetime_settings} eq "HASH" ) {
274 33         71 %dt_opts = %{ $self->{datetime_settings} }; # get locale and time_zone settings if available
  33         126  
275             }
276 33 50       79 if ($posted) {
277 33         83 my $date_ref = WebFetch::parse_date( \%dt_opts, $posted );
278 33 100       94 if ( defined $date_ref ) {
279 8 100       77 my $dt = ( ref $date_ref eq "DateTime" ) ? $date_ref : DateTime->new( @$date_ref, %dt_opts );
280 8         16687 $time_str = WebFetch::gen_timestamp( \%dt_opts, $dt );
281 8         9272 $anchor_time = WebFetch::anchor_timestr( \%dt_opts, $dt );
282             }
283             }
284 33 100 66     312 if ( not defined $time_str or not defined $anchor_time ) {
285 25         47 $time_str = "undated";
286 25         31 $anchor_time = "0000-undated";
287             }
288              
289             # generate an intra-page link label
290 33         55 my ( $label, $count );
291 33         46 $count = 0;
292 33   66     210 while ( ( $label = $anchor_time . "-" . sprintf( "%03d", $count ) )
293             and defined $label_hash{$label} )
294             {
295 300         1043 $count++;
296             }
297 33         159 $label_hash{$label} = 1;
298              
299             # generate data record for output
300 33         100 $self->data->add_record(
301             $time_str, $title, $self->priority($item),
302             expired($item), $pos, $label, $url_prefix . "#" . $label,
303             $category, $text
304             );
305 33         99 $pos++;
306             }
307 2         38 return;
308             }
309              
310             # format handler function specific to this module's long-news output format
311             sub fmt_handler_sitenews_long
312             {
313             # TODO move sort block to separate function and remove the no-critic exemption
314             ## no critic ( BuiltinFunctions::RequireSimpleSortBlock )
315 2     2 0 11 my ($self) = @_;
316              
317             # sort events for long display
318             my @long_news = sort {
319              
320             # sort news entries for long display
321             # sorting priority:
322             # date first
323             # category/priority second
324             # reverse file order last
325              
326             # sort by date
327 37         81 my $lbl_fnum = $self->fname2fnum("label");
328 37         84 my ( $a_date, $b_date ) = ( $a->[$lbl_fnum], $b->[$lbl_fnum] );
329 37         165 $a_date =~ s/-.*//x;
330 37         95 $b_date =~ s/-.*//x;
331 37 100       81 if ( $a_date ne $b_date ) {
332 7         27 return $b_date cmp $a_date;
333             }
334              
335             # sort by priority (within same date)
336 30         69 my $pri_fnum = $self->fname2fnum("priority");
337 30 100       79 if ( $a->[$pri_fnum] != $b->[$pri_fnum] ) {
338 1         20 return $a->[$pri_fnum] <=> $b->[$pri_fnum];
339             }
340              
341             # sort by chronological order (within same date and priority)
342 29         76 return $a->[$lbl_fnum] cmp $b->[$lbl_fnum];
343 2         7 } @{ $self->{data}{records} };
  2         23  
344              
345             # process the links for the long list
346 2         9 my ( @long_text, $prev, $url_prefix, $i );
347             $url_prefix =
348             ( defined $self->{url_prefix} )
349             ? $self->{url_prefix}
350 2 50       14 : "";
351 2         14 $prev = undef;
352 2         7 push @long_text, "<dl>";
353 2         8 my $lbl_fnum = $self->fname2fnum("label");
354 2         14 my $date_fnum = $self->fname2fnum("date");
355 2         26 my $title_fnum = $self->fname2fnum("title");
356 2         16 my $txt_fnum = $self->fname2fnum("text");
357 2         10 my $exp_fnum = $self->fname2fnum("expired");
358 2         9 my $pri_fnum = $self->fname2fnum("priority");
359              
360 2         19 for ( $i = 0 ; $i <= $#long_news ; $i++ ) {
361 33         60 my $news = $long_news[$i];
362 33 100 100     107 if ( ( !defined $prev->[$date_fnum] )
363             or $prev->[$date_fnum] ne $news->[$date_fnum] )
364             {
365 9         32 push @long_text, "<dt>" . $news->[$date_fnum];
366 9         16 push @long_text, "<dd>";
367             }
368 33 50       128 push @long_text,
369             "<a name=\""
370             . $news->[$lbl_fnum] . "\">"
371             . $news->[$title_fnum]
372             . "</a>\n"
373             . $news->[$txt_fnum]
374             . "<!--- priority: "
375             . $news->[$pri_fnum]
376             . ( $news->[$exp_fnum] ? " expired" : "" ) . " --->";
377 33         50 push @long_text, "<p>";
378 33         64 $prev = $news;
379             }
380 2         4 push @long_text, "</dl>";
381              
382             # store it for later save to disk
383 2         67 $self->html_savable( $self->{long_path}, join( "\n", @long_text ) . "\n" );
384 2         14 return;
385             }
386              
387             #
388             # utility functions
389             #
390              
391             # function to detect if a news entry is expired
392             sub expired
393             {
394 33     33 0 62 my ($entry) = @_;
395 33 50       229 return 0 if not exists $entry->{expires};
396 0 0       0 return 0 if not defined $entry->{expires};
397 0         0 return $entry->{expires} < $now;
398             }
399              
400             # function to compute a priority value which decays with age
401             sub priority
402             {
403 33     33 0 67 my ( $self, $entry ) = @_;
404              
405 33 50       83 return 999 if not exists $entry->{posted};
406 33 50       70 return 999 if not defined $entry->{posted};
407              
408 33         87 my $date_ref = WebFetch::parse_date( $entry->{posted} );
409 33 100       119 return 999 if not defined $date_ref;
410              
411 8         16 my %dt_opts;
412 8 50 33     51 if ( exists $self->{datetime_settings} and reftype( $self->{datetime_settings} ) eq "HASH" ) {
413 8         23 %dt_opts = %{ $self->{datetime_settings} }; # get locale and time_zone settings if available
  8         35  
414             }
415 8 100       29 my $dt = ( ref $date_ref eq "DateTime" ) ? $date_ref : DateTime->new( @$date_ref, %dt_opts );
416 8         709 my $age = ( $now->subtract_datetime($dt) )->delta_days();
417 8         2555 my $bonus = 0;
418              
419 8 50       22 if ( $age <= 2 ) {
420 0         0 $bonus -= 2 - $age;
421             }
422 8 100 66     47 if ( ( defined $entry->{category} )
423             and ( defined $cat_priorities->{ $entry->{category} } ) )
424             {
425             my $cat_pri =
426             ( exists $cat_priorities->{ $entry->{category} } )
427             ? $cat_priorities->{ $entry->{category} }
428 7 50       23 : 0;
429 7         52 return $cat_pri + $age * 0.025 + $bonus;
430             } else {
431 1         12 return $cat_priorities->{"default"} + $age * 0.025 + $bonus;
432             }
433             }
434              
435             1;
436              
437             =pod
438              
439             =encoding UTF-8
440              
441             =head1 NAME
442              
443             WebFetch::Input::SiteNews - download and save SiteNews headlines from a local file
444              
445             =head1 VERSION
446              
447             version 0.15.8
448              
449             =head1 SYNOPSIS
450              
451             In perl scripts:
452              
453             C<use WebFetch::Input::SiteNews;>
454              
455             From the command line:
456              
457             C<perl -w -MWebFetch::Input::SiteNews -e "&fetch_main" -- --dir directory
458             --source news-file --short short-form-output-file
459             --long long-form-output-file>
460              
461             =head1 DESCRIPTION
462              
463             This module gets the current headlines from a site-local file.
464              
465             The I<--source> parameter specifies a file name which contains news to be
466             posted. See L<"FILE FORMAT"> below for details on contents to put in the
467             file. I<--source> may be specified more than once, allowing a single news
468             output to come from more than one input. For example, one file could be
469             manually maintained in CVS or RCS and another could be entered from a
470             web form.
471              
472             After this runs, the file C<site_news.html> will be created or replaced.
473             If there already was a C<site_news.html> file, it will be moved to
474             C<Osite_news.html>.
475              
476             =head1 FILE FORMAT
477              
478             The WebFetch::Input::SiteNews data format is used to set up news for the
479             local web site and allow other sites to import it via WebFetch.
480             The file is plain text containing comments and the news items.
481              
482             There are three forms of outputs generated from these news files.
483              
484             The I<"short news" output> is a small number (5 by default)
485             of HTML text and links used for display in a small news window.
486             This list takes into account expiration dates and
487             priorities to pick which news entries are displayed and in what order.
488              
489             The I<"long news" output> lists all the news entries chronologically.
490             It does not take expiration or priority into account.
491             It is intended for a comprehensive site news list.
492              
493             The I<export modes> make news items available in formats other web sites
494             can retrieve to post news about your site. They are chronological
495             listings that omit expired items.
496             They do not take priorities into account.
497              
498             =over 4
499              
500             =item global parameters
501              
502             Lines coming before the first news item can set global parameters.
503              
504             =over 4
505              
506             =item categories
507              
508             A line before the first news item beginning with "categories:" contains a
509             whitespace-delimited list of news category names in order from highest
510             to lowest priority.
511             These priority names are used by the news item attributes and
512             then for sorting "short news" list items.
513              
514             =back
515              
516             =item data lines
517              
518             Non-blank non-indented non-comment lines are I<data lines>.
519             Each data line contains a I<name=value> pair.
520             Each group of consecutive data lines is followed by an arbitrary number
521             of indented lines which contain HTML text for the news entry.
522              
523             The recognized attributes are as follows:
524              
525             =over 4
526              
527             =item category
528              
529             used for prioritization, values are set by the categories global parameter
530             (required)
531              
532             =item posted
533              
534             date posted, format is a numerical date YYYYMMDD (required)
535              
536             =item expires
537              
538             expiration date, format is a numerical date YYYYMMDD (optional)
539              
540             =item title
541              
542             shorter title for use in news exports to other sites,
543             otherwise the whole news text will be used (optional)
544              
545             =back
546              
547             =item text lines
548              
549             Intended lines are HTML text for the news item.
550              
551             =item comments
552              
553             Comments are lines beginning with "#".
554             They are ignored so they can be used for human-readable information.
555              
556             =back
557              
558             Note that the "short news" list has some modifications to
559             priorities based on the age of the news item,
560             so that the short list will favor newer items when
561             they're the same priority.
562             There is a sorting "priority bonus" for items less than a
563             day old, which increases their priority by two priority levels.
564             Day-old news items get a bonus of one priority level.
565             All news items also "decay" in priority slightly every day,
566             dropping a whole priority level every 40 days.
567              
568             =head1 SEE ALSO
569              
570             L<WebFetch>
571             L<https://github.com/ikluft/WebFetch>
572              
573             =head1 BUGS AND LIMITATIONS
574              
575             Please report bugs via GitHub at L<https://github.com/ikluft/WebFetch/issues>
576              
577             Patches and enhancements may be submitted via a pull request at L<https://github.com/ikluft/WebFetch/pulls>
578              
579             =head1 AUTHOR
580              
581             Ian Kluft <https://github.com/ikluft>
582              
583             =head1 COPYRIGHT AND LICENSE
584              
585             This software is Copyright (c) 1998-2023 by Ian Kluft.
586              
587             This is free software, licensed under:
588              
589             The GNU General Public License, Version 3, June 2007
590              
591             =cut
592              
593             __END__
594             # POD docs follow
595