File Coverage

blib/lib/Catmandu/Importer/Text.pm

Criterion	Covered	Total	%
statement	15	15	100.0
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	20	20	100.0

line	stmt	sub	time	code
1
2				use Catmandu::Sane;
3	3	3	893
	3		7
	3		20
4				our $VERSION = '1.2018';
5
6				use Moo;
7	3	3	21	use namespace::clean;
	3		5
	3		15
8	3	3	900
	3		6
	3		16
9				with 'Catmandu::Importer';
10
11				has pattern => (
12				is => 'ro',
13				coerce => sub {
14				$_[0] =~ /\n/m ? qr{$_[0]}x : qr{$_[0]};
15				},
16				);
17
18				has split => (
19				is => 'ro',
20				coerce => sub {
21				length $_[0] == 1 ? quotemeta($_[0]) : qr{$_[0]};
22				}
23				);
24
25				my ($self) = @_;
26				sub {
27				state $pattern = $self->pattern;
28				state $split = $self->split;
29				state $count = 0;
30				state $line;
31
32				while (defined($line = $self->fh->getline)) {
33				chomp $line;
34				next if $pattern and $line !~ $pattern;
35
36				my $data = {_id => ++$count};
37
38				if (@+ < 2) { # no capturing groups
39				$data->{text} = $line;
40				}
41				elsif (%+) { # named capturing groups
42				$data->{match} = {%+};
43	1	1	869	}
	1		314
	1		46
44				else { # numbered capturing groups
45				no strict 'refs';
46				$data->{match} = [map {$$_} 1 .. @+ - 1];
47	3	3	1150	}
	3		6
	3		437
48
49				if ($split) {
50				$data->{text} = [split $split, $line];
51				}
52
53				return $data;
54				}
55
56				return;
57				};
58				}
59
60				1;
61
62
63				=pod
64
65				=head1 NAME
66
67				Catmandu::Importer::Text - Package that imports textual data
68
69				=head1 SYNOPSIS
70
71				# From the command line
72
73				# separate fields by whitespace sequences just like awk
74				catmandu convert Text --split '\s+'
75
76				# import all lines starting with '#', omitting this character
77				catmandu convert Text --pattern '^#(.*)'
78
79				# In a Perl script
80
81				use Catmandu;
82
83				my $importer = Catmandu->importer('Text' , file => "/foo/bar.txt" );
84
85				# print all lines with line number
86				$importer->each(sub {
87				my $item = $_[0];
88				printf "%d: %s" , $item->{_id} , $item->{text};
89				});
90
91				=head1 DESCRIPTION
92
93				This package reads textual input line by line. Each line is
94				imported as an item with its line number in field C<_id> and its text content in field
95				C<text>. Line separators are not included. Lines can further be split by
96				character or pattern and a regular expression can be specified to only import
97				selected lines and to translate pattern groups to fields.
98
99				=head1 CONFIGURATION
100
101				=over
102
103				=item file
104
105				Read input from a local file given by its path. Alternatively a scalar
106				reference can be passed to read from a string.
107
108				=item fh
109
110				Read input from an L<IO::Handle>. If not specified, L<Catmandu::Util::io> is used to
111				create the input stream from the C<file> argument or by using STDIN.
112
113				=item encoding
114
115				Binmode of the input stream C<fh>. Set to C<:utf8> by default.
116
117				=item fix
118
119				An ARRAY of one or more fixes or file scripts to be applied to imported items.
120
121				=item split
122
123				Single character or regular expression (as a string with at least two characters),
124				to split each line. Resulting parts are imported in field C<text> as array.
125
126				=item pattern
127
128				Regular expression, given as string, to only import matching lines.
129				Whitespace in a pattern is ignored (or must be escaped) if the pattern consists of
130				multiple lines. If the pattern contains capturing groups, captured values are
131				imported in field C<match> instead of C<text>.
132
133				For instance dates in C<YYYY-MM-DD> format can be imported as named fields with
134
135				(?<year>\d\d\d\d)-(?<month>\d\d)-(?<day>\d\d)
136
137				or as array with
138
139				(\d\d\d\d)- # year
140				(\d\d)- # month
141				(\d\d) # day
142
143				=back
144
145				=head1 METHODS
146
147				Every L<Catmandu::Importer> is a L<Catmandu::Iterable> with all its methods
148				inherited.
149
150				=head1 SEE ALSO
151
152				L<Catmandu::Exporter::Text>
153
154				L<Catmandu::Fix::parse_text>
155
156				Unix tools L<awk|https://en.wikipedia.org/wiki/AWK> and
157				L<sed|https://en.wikipedia.org/wiki/Sed>
158
159				=cut