File Coverage

blib/lib/Catmandu/Importer/Text.pm
Criterion Covered Total %
statement 15 15 100.0
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 20 20 100.0


line stmt bran cond sub pod time code
1              
2             use Catmandu::Sane;
3 3     3   893  
  3         7  
  3         20  
4             our $VERSION = '1.2018';
5              
6             use Moo;
7 3     3   21 use namespace::clean;
  3         5  
  3         15  
8 3     3   900  
  3         6  
  3         16  
9             with 'Catmandu::Importer';
10              
11             has pattern => (
12             is => 'ro',
13             coerce => sub {
14             $_[0] =~ /\n/m ? qr{$_[0]}x : qr{$_[0]};
15             },
16             );
17              
18             has split => (
19             is => 'ro',
20             coerce => sub {
21             length $_[0] == 1 ? quotemeta($_[0]) : qr{$_[0]};
22             }
23             );
24              
25             my ($self) = @_;
26             sub {
27             state $pattern = $self->pattern;
28             state $split = $self->split;
29             state $count = 0;
30             state $line;
31              
32             while (defined($line = $self->fh->getline)) {
33             chomp $line;
34             next if $pattern and $line !~ $pattern;
35              
36             my $data = {_id => ++$count};
37              
38             if (@+ < 2) { # no capturing groups
39             $data->{text} = $line;
40             }
41             elsif (%+) { # named capturing groups
42             $data->{match} = {%+};
43 1     1   869 }
  1         314  
  1         46  
44             else { # numbered capturing groups
45             no strict 'refs';
46             $data->{match} = [map {$$_} 1 .. @+ - 1];
47 3     3   1150 }
  3         6  
  3         437  
48              
49             if ($split) {
50             $data->{text} = [split $split, $line];
51             }
52              
53             return $data;
54             }
55              
56             return;
57             };
58             }
59              
60             1;
61              
62              
63             =pod
64              
65             =head1 NAME
66              
67             Catmandu::Importer::Text - Package that imports textual data
68              
69             =head1 SYNOPSIS
70              
71             # From the command line
72              
73             # separate fields by whitespace sequences just like awk
74             catmandu convert Text --split '\s+'
75              
76             # import all lines starting with '#', omitting this character
77             catmandu convert Text --pattern '^#(.*)'
78              
79             # In a Perl script
80              
81             use Catmandu;
82              
83             my $importer = Catmandu->importer('Text' , file => "/foo/bar.txt" );
84              
85             # print all lines with line number
86             $importer->each(sub {
87             my $item = $_[0];
88             printf "%d: %s\n" , $item->{_id} , $item->{text};
89             });
90              
91             =head1 DESCRIPTION
92              
93             This package reads textual input line by line. Each line is
94             imported as an item with its line number in field C<_id> and text content in field
95             C<text>. Line separators are not included. Lines can further be split by
96             character or pattern and a regular expression can be specified to only import
97             selected lines and to translate pattern groups to fields.
98              
99             =head1 CONFIGURATION
100              
101             =over
102              
103             =item file
104              
105             Read input from a local file given by its path. Alternatively a scalar
106             reference can be passed to read from a string.
107              
108             =item fh
109              
110             Read input from an L<IO::Handle>. If not specified, L<Catmandu::Util::io> is used to
111             create the input stream from the C<file> argument or by using STDIN.
112              
113             =item encoding
114              
115             Binmode of the input stream C<fh>. Set to C<:utf8> by default.
116              
117             =item fix
118              
119             An ARRAY of one or more fixes or file scripts to be applied to imported items.
120              
121             =item split
122              
123             Single character or regular expression (as string with at least two characters),
124             to split each line. Resulting parts are imported in field C<text> as array.
125              
126             =item pattern
127              
128             Regular expression, given as string, to only import matching lines.
129             Whitespace in patterns is ignored or must be escaped if the pattern consists of
130             multiple lines. If the pattern contains capturing groups, captured values are
131             imported in field C<match> instead of C<text>.
132              
133             For instance dates in C<YYYY-MM-DD> format can be imported as named fields with
134              
135             (?<year>\d\d\d\d)-(?<month>\d\d)-(?<day>\d\d)
136              
137             or as array with
138              
139             (\d\d\d\d)- # year
140             (\d\d)- # month
141             (\d\d) # day
142              
143             =back
144              
145             =head1 METHODS
146              
147             Every L<Catmandu::Importer> is a L<Catmandu::Iterable> with all its methods
148             inherited.
149              
150             =head1 SEE ALSO
151              
152             L<Catmandu::Exporter::Text>
153              
154             L<Catmandu::Fix::parse_text>
155              
156             Unix tools L<awk|https://en.wikipedia.org/wiki/AWK> and
157             L<sed|https://en.wikipedia.org/wiki/Sed>
158              
159             =cut