File Coverage

blib/lib/WebFetch/Input/SiteNews.pm
Criterion Covered Total %
statement 200 213 93.9
branch 69 102 67.6
condition 20 36 55.5
subroutine 22 23 95.6
pod 1 9 11.1
total 312 383 81.4


line stmt bran cond sub pod time code
1             # WebFetch::Input::SiteNews
2             # ABSTRACT: download and save SiteNews headlines from a local file
3             #
4             # Copyright (c) 1998-2022 Ian Kluft. This program is free software; you can
5             # redistribute it and/or modify it under the terms of the GNU General Public
6             # License Version 3. See https://www.gnu.org/licenses/gpl-3.0-standalone.html
7              
8             # pragmas to silence some warnings from Perl::Critic
9             ## no critic (Modules::RequireExplicitPackage)
10             # This solves a catch-22 where parts of Perl::Critic want both package and use-strict to be first
11 3     3   2082 use strict;
  3         9  
  3         100  
12 3     3   15 use warnings;
  3         6  
  3         87  
13 3     3   15 use utf8;
  3         7  
  3         29  
14             ## use critic (Modules::RequireExplicitPackage)
15              
16             package WebFetch::Input::SiteNews;
17             $WebFetch::Input::SiteNews::VERSION = '0.15.9';
18 3     3   181 use base "WebFetch";
  3         11  
  3         322  
19              
20 3     3   19 use Carp;
  3         6  
  3         208  
21 3     3   33 use Readonly;
  3         13  
  3         196  
22 3     3   21 use Scalar::Util qw(reftype);
  3         5  
  3         168  
23 3     3   17 use DateTime;
  3         14  
  3         93  
24 3     3   25 use DateTime::Format::ISO8601;
  3         19  
  3         8740  
25              
26             # set defaults
27             my ( $cat_priorities, $now );
28             Readonly::Array my @Options => ( "short_path|short=s", "long_path|long=s", );
29             Readonly::Scalar my $Usage => "--short short-output-file --long long-output-file";
30              
31             # configuration parameters
32             Readonly::Scalar my $num_links => 5;
33             Readonly::Array my @dt_keys => qw(locale time_zone);
34              
35             # functions for access to internal data for testing
36             ## no critic (Subroutines::ProhibitUnusedPrivateSubroutines)
37             sub _config_params
38             {
39 4     4   44 return { Options => \@Options, Usage => $Usage, num_links => $num_links };
40             }
41 0     0   0 sub _cat_priorities { return $cat_priorities; }
42             ## critic (Subroutines::ProhibitUnusedPrivateSubroutines)
43              
44             # no user-servicable parts beyond this point
45              
46             # register capabilities with WebFetch
47             __PACKAGE__->module_register( _config_params(), "cmdline", "input:sitenews", "output:sitenews_long" );
48              
49             # constants for state names
50 299     299 0 585 sub initial_state { return 0; }
51 184     184 0 426 sub attr_state { return 1; }
52 234     234 0 587 sub text_state { return 2; }
53              
54             # fetch() is the WebFetch API call point to run this module
55             sub fetch
56             {
57 2     2 1 6 my ($self) = @_;
58              
59             # set parameters for WebFetch routines
60 2 50       7 if ( not defined $self->{num_links} ) {
61 2         9 $self->set_param( "num_links", WebFetch->config("num_links") );
62             }
63 2         7 $self->set_param( "para", 1 );
64              
65             # set up Webfetch Embedding API data
66 2         4 $self->{actions} = {};
67 2         14 $self->data->add_fields( "date", "title", "priority", "expired", "position", "label", "url", "category", "text" );
68              
69             # defined which fields match to which "well-known field names"
70 2         11 $self->data->add_wk_names(
71             "title" => "title",
72             "url" => "url",
73             "date" => "date",
74             "summary" => "text",
75             "category" => "category"
76             );
77              
78             # process the links
79              
80             # get local time for various date comparisons
81 2         4 my %dt_opts;
82 2 50 33     10 if ( exists $self->{datetime_settings} and reftype( $self->{datetime_settings} ) eq "HASH" ) {
83 0         0 %dt_opts = %{ $self->{datetime_settings} }; # get locale and time_zone settings if available
  0         0  
84             }
85 2 50       6 if ( exists $self->{testing_faketime} ) {
86              
87             # use a pre-specified timestamp for testing purposes so news elements are same age as expected result
88 2         24 $now = DateTime::Format::ISO8601->parse_datetime( $self->{testing_faketime}, %dt_opts );
89 2 50       1729 if ( exists $dt_opts{locale} ) {
90 0         0 $now->set_locale( $dt_opts{locale} );
91             }
92 2 50       8 if ( exists $dt_opts{time_zone} ) {
93 0         0 $now->set_time_zone( $dt_opts{time_zone} );
94             }
95             } else {
96 0         0 $now = DateTime->now(%dt_opts);
97             }
98              
99             # parse data file(s)
100 2         4 my @sources;
101 2 50 33     14 if ( ( exists $self->{sources} ) and ( ref $self->{sources} eq "ARRAY" ) ) {
102 0         0 @sources = @{ $self->{sources} };
  0         0  
103             }
104 2 50       8 if ( exists $self->{source} ) {
105 2         7 push @sources, $self->{source};
106             }
107 2         7 foreach my $source (@sources) {
108 2         6 $self->parse_input($source);
109             }
110              
111             # set parameters for the short news format
112 2 50       10 if ( defined $self->{short_path} ) {
113              
114             # create the HTML actions list
115 2         10 $self->{actions}{html} = [];
116              
117             # create the HTML-generation parameters
118 2         5 my $params = {};
119 2         6 $params = {};
120             $params->{sort_func} = sub {
121 48     48   79 my ( $a, $b ) = @_;
122              
123             # sort/compare news entries for the short display
124             # sorting priority:
125             # expiration status first (expired items last)
126             # priority second (category/age combo)
127             # label third (chronological order)
128              
129             # check expirations first
130 48         117 my $exp_fnum = $self->fname2fnum("expired");
131 48 50 33     111 ( $a->[$exp_fnum] and not $b->[$exp_fnum] ) and return 1;
132 48 50 33     158 ( not $a->[$exp_fnum] and $b->[$exp_fnum] ) and return -1;
133              
134             # compare priority - posting category w/ age penalty
135 48         97 my $pri_fnum = $self->fname2fnum("priority");
136 48 100       117 if ( $a->[$pri_fnum] != $b->[$pri_fnum] ) {
137 18         48 return $a->[$pri_fnum] <=> $b->[$pri_fnum];
138             }
139              
140             # otherwise sort by label (chronological order)
141 30         71 my $lbl_fnum = $self->fname2fnum("label");
142 30         84 return $a->[$lbl_fnum] cmp $b->[$lbl_fnum];
143 2         15 };
144             $params->{filter_func} = sub {
145              
146             # filter: skip expired items
147 33     33   64 my $exp_fnum = $self->fname2fnum("expired");
148 33         84 return not $_[$exp_fnum];
149 2         9 };
150             $params->{format_func} = sub {
151              
152             # generate HTML text
153 10     10   22 my $txt_fnum = $self->fname2fnum("text");
154 10         28 my $pri_fnum = $self->fname2fnum("priority");
155 10         65 return $_[$txt_fnum] . "\n<!--- priority " . $_[$pri_fnum] . " --->";
156 2         8 };
157              
158             # put parameters for fmt_handler_html() on the html list
159 2         4 push @{ $self->{actions}{html} }, [ $self->{short_path}, $params ];
  2         9  
160             }
161              
162             # set parameters for the long news format
163 2 50       7 if ( defined $self->{long_path} ) {
164              
165             # create the SiteNews-specific action list
166             # It will use WebFetch::Input::SiteNews::fmt_handler_sitenews_long()
167             # which is defined in this file
168 2         7 $self->{actions}{sitenews_long} = [];
169              
170             # put parameters for fmt_handler_sitenews_long() on the list
171 2         4 push @{ $self->{actions}{sitenews_long} }, [ $self->{long_path} ];
  2         7  
172             }
173 2         6 return;
174             }
175              
176             # inner portion of input parsing - called by parse_input()
177             # read SiteNews text file and parse it into news items to return to caller
178             sub parse_input_inner
179             {
180 2     2 0 8 my ( $self, $news_data_fd ) = @_;
181 2         3 my @news_items;
182 2         6 my $position = 0;
183 2         8 my $state = initial_state; # before first entry
184 2         4 my ($current);
185 2         10 $cat_priorities = {}; # priorities for sorting
186 2         79 while (<$news_data_fd>) {
187 233         380 chomp;
188 233 100       462 /^\s*\#/x and next; # skip comments
189 228 100       722 /^\s*$/x and next; # skip blank lines
190              
191 192 100       423 if (/^[^\s]/x) {
192              
193             # found attribute line
194 107 100       175 if ( $state == initial_state ) {
195 10 100       55 if (/^categories:\s*(.*)/x) {
    100          
196 2         22 my @cats = split( ' ', $1 );
197 2         6 my ($i);
198 2         11 $cat_priorities->{"default"} = 999;
199 2         11 for ( $i = 0 ; $i <= $#cats ; $i++ ) {
200 8         37 $cat_priorities->{ $cats[$i] } = $i + 1;
201             }
202 2         18 next;
203             } elsif (/^(\w+):\s*(.*)/x) {
204 6         22 $self->set_param( $1, $2 );
205             }
206             }
207 105 100 100     165 if ( $state == initial_state or $state == text_state ) {
    50          
208              
209             # found first attribute of a new entry
210 39 100       174 if (/^([^=]+)=(.*)/x) {
211 33         54 $current = {};
212 33         91 $current->{position} = $position++;
213 33         89 $current->{$1} = $2;
214 33         61 push( @news_items, $current );
215 33         58 $state = attr_state;
216             }
217             } elsif ( $state == attr_state ) {
218              
219             # found a followup attribute
220 66 50       202 if (/^([^=]+)=(.*)/x) {
221 66         304 $current->{$1} = $2;
222             }
223             }
224             } else {
225              
226             # found text line
227 85 50 66     132 if ( $state == initial_state ) {
    50          
228              
229             # cannot accept text before any attributes
230 0         0 next;
231             } elsif ( $state == attr_state or $state == text_state ) {
232 85 100       140 if ( defined $current->{text} ) {
233 52         125 $current->{text} .= "\n$_";
234             } else {
235 33         69 $current->{text} = $_;
236             }
237 85         124 $state = text_state;
238             }
239             }
240             }
241 2         16 return @news_items;
242             }
243              
244             # parse input file
245             sub parse_input
246             {
247 2     2 0 9 my ( $self, $input ) = @_;
248              
249             # parse data file
250 2         4 my $news_data;
251 2 50       153 if ( not open( $news_data, "<", $input ) ) {
252 0         0 croak "$0: failed to open $input: $!\n";
253             }
254 2         13 my @news_items = $self->parse_input_inner($news_data);
255 2         26 close $news_data;
256              
257             # translate parsed news into the WebFetch Embedding API data table
258 2         7 my ( %label_hash, $pos );
259 2         7 $pos = 0;
260 2         5 foreach my $item (@news_items) {
261              
262             # collect fields for the data record
263 33 50       100 my $title = ( defined $item->{title} ) ? $item->{title} : "";
264 33 50       90 my $posted = ( defined $item->{posted} ) ? $item->{posted} : "";
265 33 50       70 my $category = ( defined $item->{category} ) ? $item->{category} : "";
266 33 50       116 my $text = ( defined $item->{text} ) ? $item->{text} : "";
267             my $url_prefix =
268 33 50       75 ( defined $self->{url_prefix} ) ? $self->{url_prefix} : "";
269              
270             # timestamp processing using optional locale and time_zone from WebFetch object's datetime_settings hash
271             # This is usually set by SiteNews file's global settings at the top
272 33         49 my ( %dt_opts, $time_str, $anchor_time );
273 33 50 33     182 if ( exists $self->{datetime_settings} and reftype $self->{datetime_settings} eq "HASH" ) {
274 33         52 %dt_opts = %{ $self->{datetime_settings} }; # get locale and time_zone settings if available
  33         119  
275             }
276 33 50       76 if ($posted) {
277 33         94 my $date_ref = WebFetch::parse_date( \%dt_opts, $posted );
278 33 100       111 if ( defined $date_ref ) {
279 8 100       64 my $dt = ( ref $date_ref eq "DateTime" ) ? $date_ref : DateTime->new( @$date_ref, %dt_opts );
280 8         16388 $time_str = WebFetch::gen_timestamp( \%dt_opts, $dt );
281 8         8521 $anchor_time = WebFetch::anchor_timestr( \%dt_opts, $dt );
282             }
283             }
284 33 100 66     346 if ( not defined $time_str or not defined $anchor_time ) {
285 25         34 $time_str = "undated";
286 25         41 $anchor_time = "0000-undated";
287             }
288              
289             # generate an intra-page link label
290 33         49 my ( $label, $count );
291 33         52 $count = 0;
292 33   66     222 while ( ( $label = $anchor_time . "-" . sprintf( "%03d", $count ) )
293             and defined $label_hash{$label} )
294             {
295 300         983 $count++;
296             }
297 33         113 $label_hash{$label} = 1;
298              
299             # generate data record for output
300 33         121 $self->data->add_record(
301             $time_str, $title, $self->priority($item),
302             expired($item), $pos, $label, $url_prefix . "#" . $label,
303             $category, $text
304             );
305 33         102 $pos++;
306             }
307 2         39 return;
308             }
309              
310             # format handler function specific to this module's long-news output format
311             sub fmt_handler_sitenews_long
312             {
313             # TODO move sort block to separate function and remove the no-critic exemption
314             ## no critic ( BuiltinFunctions::RequireSimpleSortBlock )
315 2     2 0 5 my ($self) = @_;
316              
317             # sort events for long display
318             my @long_news = sort {
319              
320             # sort news entries for long display
321             # sorting priority:
322             # date first
323             # category/priority second
324             # reverse file order last
325              
326             # sort by date
327 37         81 my $lbl_fnum = $self->fname2fnum("label");
328 37         84 my ( $a_date, $b_date ) = ( $a->[$lbl_fnum], $b->[$lbl_fnum] );
329 37         134 $a_date =~ s/-.*//x;
330 37         101 $b_date =~ s/-.*//x;
331 37 100       88 if ( $a_date ne $b_date ) {
332 7         17 return $b_date cmp $a_date;
333             }
334              
335             # sort by priority (within same date)
336 30         66 my $pri_fnum = $self->fname2fnum("priority");
337 30 100       71 if ( $a->[$pri_fnum] != $b->[$pri_fnum] ) {
338 1         5 return $a->[$pri_fnum] <=> $b->[$pri_fnum];
339             }
340              
341             # sort by chronological order (within same date and priority)
342 29         81 return $a->[$lbl_fnum] cmp $b->[$lbl_fnum];
343 2         4 } @{ $self->{data}{records} };
  2         19  
344              
345             # process the links for the long list
346 2         5 my ( @long_text, $prev, $url_prefix, $i );
347             $url_prefix =
348             ( defined $self->{url_prefix} )
349             ? $self->{url_prefix}
350 2 50       8 : "";
351 2         7 $prev = undef;
352 2         8 push @long_text, "<dl>";
353 2         7 my $lbl_fnum = $self->fname2fnum("label");
354 2         7 my $date_fnum = $self->fname2fnum("date");
355 2         6 my $title_fnum = $self->fname2fnum("title");
356 2         7 my $txt_fnum = $self->fname2fnum("text");
357 2         13 my $exp_fnum = $self->fname2fnum("expired");
358 2         7 my $pri_fnum = $self->fname2fnum("priority");
359              
360 2         16 for ( $i = 0 ; $i <= $#long_news ; $i++ ) {
361 33         48 my $news = $long_news[$i];
362 33 100 100     95 if ( ( !defined $prev->[$date_fnum] )
363             or $prev->[$date_fnum] ne $news->[$date_fnum] )
364             {
365 9         22 push @long_text, "<dt>" . $news->[$date_fnum];
366 9         17 push @long_text, "<dd>";
367             }
368 33 50       185 push @long_text,
369             "<a name=\""
370             . $news->[$lbl_fnum] . "\">"
371             . $news->[$title_fnum]
372             . "</a>\n"
373             . $news->[$txt_fnum]
374             . "<!--- priority: "
375             . $news->[$pri_fnum]
376             . ( $news->[$exp_fnum] ? " expired" : "" ) . " --->";
377 33         55 push @long_text, "<p>";
378 33         69 $prev = $news;
379             }
380 2         11 push @long_text, "</dl>";
381              
382             # store it for later save to disk
383 2         165 $self->html_savable( $self->{long_path}, join( "\n", @long_text ) . "\n" );
384 2         18 return;
385             }
386              
387             #
388             # utility functions
389             #
390              
391             # function to detect if a news entry is expired
392             sub expired
393             {
394 33     33 0 64 my ($entry) = @_;
395 33 50       233 return 0 if not exists $entry->{expires};
396 0 0       0 return 0 if not defined $entry->{expires};
397 0         0 return $entry->{expires} < $now;
398             }
399              
400             # function to compute a priority value which decays with age
401             sub priority
402             {
403 33     33 0 67 my ( $self, $entry ) = @_;
404              
405 33 50       79 return 999 if not exists $entry->{posted};
406 33 50       65 return 999 if not defined $entry->{posted};
407              
408 33         81 my $date_ref = WebFetch::parse_date( $entry->{posted} );
409 33 100       116 return 999 if not defined $date_ref;
410              
411 8         12 my %dt_opts;
412 8 50 33     56 if ( exists $self->{datetime_settings} and reftype( $self->{datetime_settings} ) eq "HASH" ) {
413 8         19 %dt_opts = %{ $self->{datetime_settings} }; # get locale and time_zone settings if available
  8         28  
414             }
415 8 100       32 my $dt = ( ref $date_ref eq "DateTime" ) ? $date_ref : DateTime->new( @$date_ref, %dt_opts );
416 8         724 my $age = ( $now->subtract_datetime($dt) )->delta_days();
417 8         2576 my $bonus = 0;
418              
419 8 50       23 if ( $age <= 2 ) {
420 0         0 $bonus -= 2 - $age;
421             }
422 8 100 66     46 if ( ( defined $entry->{category} )
423             and ( defined $cat_priorities->{ $entry->{category} } ) )
424             {
425             my $cat_pri =
426             ( exists $cat_priorities->{ $entry->{category} } )
427             ? $cat_priorities->{ $entry->{category} }
428 7 50       22 : 0;
429 7         51 return $cat_pri + $age * 0.025 + $bonus;
430             } else {
431 1         11 return $cat_priorities->{"default"} + $age * 0.025 + $bonus;
432             }
433             }
434              
435             1;
436              
437             =pod
438              
439             =encoding UTF-8
440              
441             =head1 NAME
442              
443             WebFetch::Input::SiteNews - download and save SiteNews headlines from a local file
444              
445             =head1 VERSION
446              
447             version 0.15.9
448              
449             =head1 SYNOPSIS
450              
451             In perl scripts:
452              
453             C<use WebFetch::Input::SiteNews;>
454              
455             From the command line:
456              
457             C<perl -w -MWebFetch::Input::SiteNews -e "&fetch_main" -- --dir directory
458             --source news-file --short short-form-output-file
459             --long long-form-output-file>
460              
461             =head1 DESCRIPTION
462              
463             This module gets the current headlines from a site-local file.
464              
465             The I<--source> parameter specifies a file name which contains news to be
466             posted. See L<"FILE FORMAT"> below for details on contents to put in the
467             file. I<--source> may be specified more than once, allowing a single news
468             output to come from more than one input. For example, one file could be
469             manually maintained in CVS or RCS and another could be entered from a
470             web form.
471              
472             After this runs, the file C<site_news.html> will be created or replaced.
473             If there already was a C<site_news.html> file, it will be moved to
474             C<Osite_news.html>.
475              
476             =head1 FILE FORMAT
477              
478             The WebFetch::Input::SiteNews data format is used to set up news for the
479             local web site and allow other sites to import it via WebFetch.
480             The file is plain text containing comments and the news items.
481              
482             There are three forms of outputs generated from these news files.
483              
484             The I<"short news" output> is a small number (5 by default)
485             of HTML text and links used for display in a small news window.
486             This list takes into account expiration dates and
487             priorities to pick which news entries are displayed and in what order.
488              
489             The I<"long news" output> lists all the news entries chronologically.
490             It does not take expiration or priority into account.
491             It is intended for a comprehensive site news list.
492              
493             The I<export modes> make news items available in formats other web sites
494             can retrieve to post news about your site. They are chronological
495             listings that omit expired items.
496             They do not take priorities into account.
497              
498             =over 4
499              
500             =item global parameters
501              
502             Lines coming before the first news item can set global parameters.
503              
504             =over 4
505              
506             =item categories
507              
508             A line before the first news item beginning with "categories:" contains a
509             whitespace-delimited list of news category names in order from highest
510             to lowest priority.
511             These priority names are used by the news item attributes and
512             then for sorting "short news" list items.
513              
514             =back
515              
516             =item data lines
517              
518             Non-blank non-indented non-comment lines are I<data lines>.
519             Each data line contains a I<name=value> pair.
520             Each group of consecutive data lines is followed by an arbitrary number
521             of indented lines which contain HTML text for the news entry.
522              
523             The recognized attributes are as follows:
524              
525             =over 4
526              
527             =item category
528              
529             used for prioritization, values are set by the categories global parameter
530             (required)
531              
532             =item posted
533              
534             date posted, format is a numerical date YYYYMMDD (required)
535              
536             =item expires
537              
538             expiration date, format is a numerical date YYYYMMDD (optional)
539              
540             =item title
541              
542             shorter title for use in news exports to other sites,
543             otherwise the whole news text will be used (optional)
544              
545             =back
546              
547             =item text lines
548              
549             Intended lines are HTML text for the news item.
550              
551             =item comments
552              
553             Comments are lines beginning with "#".
554             They are ignored so they can be used for human-readable information.
555              
556             =back
557              
558             Note that the "short news" list has some modifications to
559             priorities based on the age of the news item,
560             so that the short list will favor newer items when
561             they're the same priority.
562             There is a sorting "priority bonus" for items less than a
563             day old, which increases their priority by two priority levels.
564             Day-old news items get a bonus of one priority level.
565             All news items also "decay" in priority slightly every day,
566             dropping a whole priority level every 40 days.
567              
568             =head1 SEE ALSO
569              
570             L<WebFetch>
571             L<https://github.com/ikluft/WebFetch>
572              
573             =head1 BUGS AND LIMITATIONS
574              
575             Please report bugs via GitHub at L<https://github.com/ikluft/WebFetch/issues>
576              
577             Patches and enhancements may be submitted via a pull request at L<https://github.com/ikluft/WebFetch/pulls>
578              
579             =head1 AUTHOR
580              
581             Ian Kluft <https://github.com/ikluft>
582              
583             =head1 COPYRIGHT AND LICENSE
584              
585             This software is Copyright (c) 1998-2023 by Ian Kluft.
586              
587             This is free software, licensed under:
588              
589             The GNU General Public License, Version 3, June 2007
590              
591             =cut
592              
593             __END__
594             # POD docs follow
595