File Coverage

blib/lib/Catmandu/Importer/Text.pm
Criterion Covered Total %
statement 15 15 100.0
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 20 20 100.0


line stmt bran cond sub pod time code
1             package Catmandu::Importer::Text;
2              
3 3     3   1256 use Catmandu::Sane;
  3         7  
  3         24  
4              
5             our $VERSION = '1.2020';
6              
7 3     3   22 use Moo;
  3         6  
  3         19  
8 3     3   1198 use namespace::clean;
  3         7  
  3         26  
9              
10             with 'Catmandu::Importer';
11              
# Optional regular expression used to select which input lines are imported.
# The constructor value is compiled with qr{}. A value that contains a
# newline is compiled with the /x modifier, so a pattern written across
# several lines may contain insignificant whitespace and comments.
has pattern => (
    is     => 'ro',
    coerce => sub {
        my ($source) = @_;

        if ($source =~ /\n/m) {
            return qr{$source}x;
        }

        return qr{$source};
    },
);
18              
# Optional separator used to split every imported line into parts.
# A value of exactly one character is taken literally (escaped with
# quotemeta and kept as a string); any other value is compiled as a
# regular expression.
has split => (
    is     => 'ro',
    coerce => sub {
        my ($separator) = @_;

        return quotemeta($separator) if length($separator) == 1;
        return qr{$separator};
    }
);
25              
# Build the iterator for this importer.
#
# Returns a code reference. Every call reads lines from $self->fh until one
# is accepted and returns a hash reference describing it; once the handle is
# exhausted the closure returns nothing (undef in scalar context).
#
# Fields of a returned item:
#   _id   - running count of the accepted lines, starting at 1
#   text  - the chomped line, when no capture groups apply; replaced by an
#           array reference of parts when a split separator is configured
#   match - hash reference of named captures, or array reference of numbered
#           captures, when the configured pattern contains capture groups
sub generator {
    my ($self) = @_;

    # Both attributes are read-only, so they are looked up once per generator.
    my $pattern = $self->pattern;
    my $split   = $self->split;
    my $count   = 0;

    return sub {
        while (defined(my $line = $self->fh->getline)) {
            chomp $line;

            # The match must stay at this block level: capture variables are
            # scoped to the enclosing block and are read further down.
            next if $pattern and $line !~ $pattern;

            my $data = {_id => ++$count};

            # Without a pattern no match happened here, so @+ and %+ would
            # only reflect some unrelated earlier match; never consult them.
            if (!$pattern or @+ < 2) {    # no capturing groups
                $data->{text} = $line;
            }
            elsif (%+) {                  # named capturing groups
                $data->{match} = {%+};
            }
            else {                        # numbered capturing groups
                no strict 'refs';         # $$_ resolves to $1, $2, ...
                $data->{match} = [map {$$_} 1 .. $#+];
            }

            # Test definedness, not truth: a one-character separator "0"
            # is coerced to the false string "0" but is still a separator.
            if (defined $split) {
                $data->{text} = [split $split, $line];
            }

            return $data;
        }

        return;
    };
}
61              
62             1;
63              
64             __END__
65              
66             =pod
67              
68             =head1 NAME
69              
70             Catmandu::Importer::Text - Package that imports textual data
71              
72             =head1 SYNOPSIS
73              
74             # From the command line
75              
76             # separate fields by whitespace sequences just like awk
77             catmandu convert Text --split '\s+'
78              
79             # import all lines starting with '#', omitting this character
80             catmandu convert Text --pattern '^#(.*)'
81              
82             # In a Perl script
83              
84             use Catmandu;
85              
86             my $importer = Catmandu->importer('Text' , file => "/foo/bar.txt" );
87              
88             # print all lines with line number
89             $importer->each(sub {
90             my $item = $_[0];
91             printf "%d: %s" , $item->{_id} , $item->{text};
92             });
93              
94             =head1 DESCRIPTION
95              
96             This package reads textual input line by line. Each line is
97             imported as item with line number in field C<_id> and text content in field
98             C<text>. Line separators are not included. Lines can further be split by
99             character or pattern and a regular expression can be specified to only import
100             selected lines and to translate pattern groups to fields.
101              
102             =head1 CONFIGURATION
103              
104             =over
105              
106             =item file
107              
108             Read input from a local file given by its path. Alternatively a scalar
109             reference can be passed to read from a string.
110              
111             =item fh
112              
113             Read input from an L<IO::Handle>. If not specified, L<Catmandu::Util::io> is used to
114             create the input stream from the C<file> argument or by using STDIN.
115              
116             =item encoding
117              
118             Binmode of the input stream C<fh>. Set to C<:utf8> by default.
119              
120             =item fix
121              
122             An ARRAY of one or more fixes or file scripts to be applied to imported items.
123              
124             =item split
125              
126             Single character or regular expression (as string with at least two characters),
127             to split each line. Resulting parts are imported in field C<text> as array.
128              
129             =item pattern
130              
131             Regular expression, given as string, to only import matching lines.
132             Whitespace in patterns is ignored or must be escaped if the pattern consists of
133             multiple lines. If the pattern contains capturing groups, captured values are
134             imported in field C<match> instead of C<text>.
135              
136             For instance dates in C<YYYY-MM-DD> format can be imported as named fields with
137              
138             (?<year>\d\d\d\d)-(?<month>\d\d)-(?<day>\d\d)
139              
140             or as array with
141              
142             (\d\d\d\d)- # year
143             (\d\d)- # month
144             (\d\d) # day
145              
146             =back
147              
148             =head1 METHODS
149              
150             Every L<Catmandu::Importer> is a L<Catmandu::Iterable> with all its methods
151             inherited.
152              
153             =head1 SEE ALSO
154              
155             L<Catmandu::Exporter::Text>
156              
157             L<Catmandu::Fix::parse_text>
158              
159             Unix tools L<awk|https://en.wikipedia.org/wiki/AWK> and
160             L<sed|https://en.wikipedia.org/wiki/Sed>
161              
162             =cut