File Coverage

blib/lib/Treex/Block/Read/BaseTextReader.pm
Criterion Covered Total %
statement 16 26 61.5
branch 3 10 30.0
condition 0 3 0.0
subroutine 5 5 100.0
pod 1 2 50.0
total 25 46 54.3


line stmt bran cond sub pod time code
1             package Treex::Block::Read::BaseTextReader;
2             $Treex::Block::Read::BaseTextReader::VERSION = '2.20210102';
3 2     2   1275 use Moose;
  2         5  
  2         14  
4 2     2   13562 use Treex::Core::Common;
  2         13  
  2         15  
5             extends 'Treex::Block::Read::BaseReader';
6             #use File::Slurp 9999;
7 2     2   11612 use PerlIO::via::gzip;
  2         7  
  2         859  
8              
9             # By default read from STDIN
10             has '+from' => (
11             default => '-',
12             handles => [qw(current_filename current_filehandle file_number _set_file_number next_filehandle)],
13             );
14              
15             has lines_per_doc => ( isa => 'Int', is => 'ro', default => 0 );
16             has merge_files => ( isa => 'Bool', is => 'ro', default => 0 );
17             has encoding => ( isa => 'Str', is => 'ro', default => 'utf8' );
18              
19             sub BUILD {
20 8     8 0 96 my ($self) = @_;
21 8 50       288 if ( $self->lines_per_doc ) {
22 0         0 $self->set_is_one_doc_per_file(0);
23             }
24 8 50       298 if ($self->encoding ne 'utf8'){
25 0         0 $self->from->set_encoding($self->encoding);
26             }
27 8         29 return;
28             }
29              
30             sub next_document_text {
31 2     2 1 5 my ($self) = @_;
32 2 50       107 if ( $self->is_one_doc_per_file ) {
33 2         67 return $self->from->next_file_text();
34             }
35              
36 0           my $text = '';
37             LINE:
38 0           for my $line ( 1 .. $self->lines_per_doc ) {
39 0           $line = $self->from->next_line();
40 0 0         if (!defined $line){
41 0 0 0       return if $text eq '' && !$self->from->has_next_file();
42 0           last LINE;
43             }
44            
45 0           $text .= $line;
46             }
47 0           return $text;
48             }
49              
50             1;
51              
52             __END__
53              
54             =pod
55              
56             =encoding utf-8
57              
58             =for Pod::Coverage BUILD
59              
60             =head1 NAME
61              
62             Treex::Block::Read::BaseTextReader - abstract ancestor for document readers
63              
64             =head1 VERSION
65              
66             version 2.20210102
67              
68             =head1 DESCRIPTION
69              
70             This class serves as a common ancestor for document readers
71             that take a C<from> parameter holding a space- or comma-separated list of filenames
72             and load their documents from plain text files.
73             It is designed to implement the L<Treex::Core::DocumentReader> interface.
74              
75             In derived classes you need to define the C<next_document> method,
76             and you can use the C<next_document_text> and C<new_document> methods.
77              
78             =head1 ATTRIBUTES
79              
80             =over
81              
82             =item language (required)
83              
84             =item lines_per_doc
85              
86             The number of lines per document, if you want to split one file into several documents.
87             The default is 0, which means don't split.
88              
89             =item merge_files
90              
91             Merge the content of all files (specified in the C<from> attribute) into one stream.
92             Useful in combination with C<lines_per_doc> to get equally-sized documents
93             even from non-equally-sized files.
94              
95             =item encoding
96              
97             The encoding of the input files, e.g. C<utf8> (the default) or C<cp1250>.
98              
99             =back
100              
101             =head1 METHODS
102              
103             =over
104              
105             =item next_document_text
106              
107             Returns the text of the next document as a string: the whole content of the next file (specified in the C<from> attribute) or, if C<lines_per_doc> is set, at most that many lines.
108              
109             =item next_filehandle
110              
111             Helper method - you can use this instead of C<next_document_text>
112             if you don't want to load the whole text into memory
113             (but want to do e.g. SAX-like parsing instead).
114              
115             =back
116              
117             =head1 SEE
118              
119             L<Treex::Block::Read::BaseReader>
120             L<Treex::Block::Read::Text>
121              
122             =head1 AUTHOR
123              
124             Martin Popel
125              
126             =head1 COPYRIGHT AND LICENSE
127              
128             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
129              
130             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.