File Coverage

blib/lib/Lingua/DxExtractor.pm
Criterion Covered Total %
statement 17 114 14.9
branch 0 42 0.0
condition 0 18 0.0
subroutine 6 11 54.5
pod 0 5 0.0
total 23 190 12.1


line stmt bran cond sub pod time code
1             package Lingua::DxExtractor;
2              
3 1     1   13106 use 5.008008;
  1         3  
4 1     1   6 use strict;
  1         2  
  1         28  
5 1     1   4 use warnings;
  1         7  
  1         41  
6              
7             our $VERSION = '2.3';
8              
9 1     1   401 use Text::Sentence qw( split_sentences );
  1         325  
  1         52  
10 1     1   417 use Lingua::NegEx qw( negation_scope );
  1         794  
  1         69  
11              
12              
13             use Class::MakeMethods (
14 1         6 'Template::Hash:array' => [
15             'target_phrases', 'skip_phrases',
16             'absolute_present_phrases', 'absolute_negative_phrases',
17             ],
18             'Template::Hash:scalar' => [
19             'orig_text', 'final_answer', 'ambiguous', 'start_phrase',
20             ],
21             'Template::Hash:hash' => [
22             'target_sentence', 'negex_debug',
23             ],
24 1     1   531 );
  1         1716  
25              
26             ######################################################################
27              
28             sub new {
29 0     0 0   my $callee = shift;
30 0   0       my $package = ref $callee || $callee;
31 0           my $self = shift;
32 0           bless $self, $package;
33 0 0         die unless $self->target_phrases;
34 0           return $self;
35             }
36              
37             sub process_text {
38 0     0 0   my ($self,$text) = @_;
39 0           $self->orig_text( $text );
40 0           $self->examine_text;
41 0           return $self->final_answer;
42             }
43              
44             sub examine_text {
45 0     0 0   my $self = shift;
46 0           my $text = $self->orig_text;
47 0 0         return if ! $text;
48            
49 0           my $start_phrase = $self->start_phrase;
50 0 0 0       if ( $start_phrase and $text =~ /$start_phrase(.*)\Z/ix ) {
51 0           $text = $1;
52             }
53 0           $text =~ s/\s+/ /gxms;
54             # treat colon ':' like a period '.'
55 0           $text =~ s/:/./g;
56            
57 0           my @sentences = split_sentences( $text );
58 0           foreach my $line ( @sentences ) {
59            
60 0 0         next if scalar grep { $line =~ /\b$_\b/i } @{$self->skip_phrases};
  0            
  0            
61 0 0         next unless grep { $line =~ /\b$_\b/i } @{$self->target_phrases};
  0            
  0            
62            
63 0           $self->target_sentence->{ $line } = 'present';
64 0           my $n_scope = negation_scope( $line );
65              
66 0 0         if ( $n_scope ) {
67 0           $self->negex_debug->{ $line } = @$n_scope[0] . ' - ' . @$n_scope[1];
68 0           my @words;
69 0           foreach ( split /\s/xms, $line ) {
70 0           s/\W//xms;
71 0           push @words, $_;
72             }
73 0           foreach my $c ( @$n_scope[0] .. @$n_scope[1] ) {
74 0           my @match = grep { $words[ $c ] =~ /$_/ixms } @{$self->target_phrases};
  0            
  0            
75            
76 0 0         if ( scalar @match ) {
77 0           $self->target_sentence->{ $line } = 'absent';
78 0           last;
79             }
80             }
81             }
82             }
83            
84 0 0         if ( scalar keys %{$self->target_sentence} ) {
  0 0          
85 0           my %final_answer;
86 0           while ( my($sentence,$answer) = each %{$self->target_sentence} ) {
  0            
87 0           $final_answer{ $answer }++;
88 0           $self->final_answer( $answer );
89             }
90 0 0         if ( scalar keys %final_answer > 1 ) {
91 0           $self->ambiguous( 1 );
92 0   0       $final_answer{ 'absent' } ||= 0;
93 0   0       $final_answer{ 'present' } ||= 0;
94              
95 0 0         if ( $final_answer{ 'absent' } > $final_answer{ 'present' } ) {
    0          
96 0           $self->final_answer( 'absent' );
97             } elsif ( $final_answer{ 'present' } > $final_answer{ 'absent' } ) {
98 0           $self->final_answer( 'present' );
99             } else {
100             # There are an equal number of absent/present findings - defaulting to present
101 0           $self->final_answer( 'present' );
102             }
103             }
104              
105 0           } elsif ( ! scalar keys %{$self->target_sentence} ) {
106 0           $self->final_answer( 'absent' );
107             }
108            
109 0 0 0       if ( grep { $text =~ /$_/i } @{$self->absolute_present_phrases} and $self->final_answer eq 'absent' ) {
  0            
  0            
110 0           $self->final_answer( 'present' );
111 0           $self->ambiguous( 2 );
112             }
113 0 0 0       if ( grep { $text =~ /$_/i } @{$self->absolute_negative_phrases} and $self->final_answer eq 'present' ) {
  0            
  0            
114 0           $self->final_answer( 'absent' );
115 0           $self->ambiguous( 3 );
116             }
117             }
118              
119              
120             sub debug {
121 0     0 0   my $self = shift;
122 0           my $out = "Target Phrases(" . (join ', ', map { qq{'$_'} } @{$self->target_phrases}) . ")\r\n\r\n";
  0            
  0            
123 0 0         $out .= "Skip Phrases(" . (join ', ', map { qq{'$_'} } @{$self->skip_phrases}) . ")\r\n\r\n" if $self->skip_phrases;
  0            
  0            
124 0 0         $out .= "Absolute Present Phrases(" . (join ', ', map { qq{'$_'} } @{$self->absolute_present_phrases}) . ")\r\n\r\n" if $self->absolute_present_phrases;
  0            
  0            
125 0 0         $out .= "Absolute Negative Phrases(" . (join ', ', map { qq{'$_'} } @{$self->absolute_negative_phrases}) . ")\r\n\r\n" if $self->absolute_negative_phrases;
  0            
  0            
126 0 0         $out .= "Start Phrase( '" . $self->start_phrase . "' )\r\n\r\n" if $self->start_phrase;
127            
128 0           $out .= "Sentences with a target phrase match:\r\n";
129 0           my $count = 1;
130 0           while ( my($sentence,$answer) = each %{$self->target_sentence} ) {
  0            
131 0           $out .= "$count) $sentence -- $answer. ";
132 0           $count++;
133 0   0       $out .= "NegEx: " . ($self->negex_debug->{ $sentence } || 'None') . "\r\n";
134             }
135 0 0         $out .= "\r\nAmbiguous: " . ($self->ambiguous == 1 ? 'Yes' : ( $self->ambiguous == 2 ? 'Absolute Present Phrase was present but the answer was going to be absent.' : ( $self->ambiguous == 3 ? 'Absolute Negative Phrase was present but the answer was going to be present.' : 'No' ) ) );
    0          
    0          
136 0           $out .= "\r\nFinal Answer: " . $self->final_answer . "\r\n";
137 0           return $out;
138             }
139              
140             sub reset {
141 0     0 0   my $self = shift;
142 0           $self->orig_text( '' );
143 0           $self->target_sentence( {} );
144 0           $self->final_answer( '' );
145 0           $self->ambiguous( '' );
146             }
147              
148             1;
149              
150             =head1 NAME
151              
152             Lingua::DxExtractor - Perl extension to extract the presence or absence of a clinical condition from radiology reports.
153              
154             =head1 SYNOPSIS
155              
156             use Lingua::DxExtractor;
157              
158             $extractor = Lingua::DxExtractor->new( {
159             target_phrases => [ qw( embolus embolism emboli defect pe clot clots ) ],
160             skip_phrases => [ qw( history indication technique nondiagnostic ) ],
161             absolute_present_phrases => [ ( 'This is definitely a PE', 'absolutely positive for pe' ) ],
162             absolute_negative_phrases => [ ( 'there is no way this is a pe', 'no clots seen at all' ) ],
163             start_phrase => 'Impression:',
164             } );
165            
166             $text = <
167             Indication: To rule out pulmonary embolism. Findings: There is no evidence of vascular filling defect to the subsegmental level...
168             END
169              
170             $final_answer = $extractor->process_text( $text ); # 'absent' or 'present'
171             $is_final_answer_ambiguous = $extractor->ambiguous; # 1 or 0
172             $debug = $extractor->debug;
173              
174             $original_text = $extractor->orig_text;
175             $final_answer = $extractor->final_answer;
176             $ambiguous = $extractor->ambiguous;
177              
178             $extractor->clear; # clears orig_text, final_answer, target_sentence and ambiguous
179              
180             =head1 DESCRIPTION
181              
182             A tool to be used to look for the presence or absence of a clinical condition as reported in radiology reports. The extractor reports a 'final answer', 'absent' or 'present', as well as reports whether this answer is 'ambiguous' or not.
183              
184             The 'use case' for this is when performing a research project with a large number of records and you need to identify a subset based on a diagnostic entity, you can use this tool to reduce the number of charts that have to be manually examined. In this 'use case' I wanted to keep the sensitivity as high as possible in order to not miss real cases.
185              
186             The radiographic reports don't require textual preprocessing however clearly the selection of target_phrases and skip_phrases requires reading through reports to get a sense of what vocabulary is being used in the particular dataset that is being evaluated.
187              
188             Negated terms are identified using Lingua::NegEx which is a perl implementation of Wendy Chapman's NegEx algorithm.
189              
190             =head2 GETTING STARTED
191              
192             Create a new extractor object with your extraction rules:
193              
194             target_phrases( \@words );
195              
196             This is a list of phrases that describe the clinical entity in question. All forms of the entity in question need to explicitly stated since the package is currently not using lemmatization or stemming. This is the only required parameter for the extractor object.
197              
198             skip_phrases( \@skip );
199              
200             This is a list of phrases that can be used to eliminate sentences in the text that might confuse the extractor. For example most radiographic reports start with a brief description of the indication for the test. This statement may state the clinical entity in question but does not mean it is present in the study (ie. Indication: to rule out pulmonary embolism).
201              
202             absolute_negative_phrases( \@absolute_negative_assertions );
203            
204             This is a list of phrases which if present in the text mean the condition is certainly not there and all ambiguity checking can be skipped.
205            
206             absolute_present_phrases( \@absolute_positive_assertions );
207              
208             This is a list of phrases which if present in the text mean the condition is certainly there and all ambiguity checking can be skipped.
209              
210             start_phrase( $start_phrase );
211              
212             A phrase if present in the text which indicates where to focus the search.
213              
214             =head2 Analysis
215              
216             Once defined, the extractor object you created can be used to analyze target text. The analysis consists of:
217              
218             1. If there is a start phrase defined, eliminate all text for analysis prior to the start phrase.
219              
220             2. Change all colons ':' into periods '.' to treat them as sentence breaks
221              
222             3. Split text into sentences using Text::Sentence
223              
224             4. Examine each sentence for the presence of any skip phrases and if found, ignore the sentence.
225              
226             5. Examine each sentence for the presence of any target phrases and if found evaluate for negation using Lingua:Negex.
227             if no negation found, mark this sentence as 'present'
228             if the target phrase is negated then mark the sentence as 'absent'
229              
230             6. Go through all the flagged sentences and see if there is any discrepancy -- if so set the ambiguous flag. If there are more sentences that indicate absent than those that indicate present then mark the final answer as absent and vice versa. If there are an equal number of absent and present phrases mark the final answer as present.
231              
232             7. The possible values for ambiguous: 1 = there were some positive and some absent sentences; 2 = there was a match on an absolute positive phrase but the answer was going to be absent had this absolute phrase not been indicated; 3 = there was a match on an absolute negative phrase but the answer was going to be present had this absolute phrase not been indicated;
233              
234             =head2 EXPORT
235              
236             None by default.
237              
238             =head1 SEE ALSO
239              
240             This module depends on:
241              
242             Lingua::NegEx
243              
244             Text::Sentence
245              
246             Class::MakeMethods
247              
248             =head1 To Do
249              
250             Add lemmatization or stemming to target_phrases so you don't have to explicitly write out all forms of words
251              
252             =head1 AUTHOR
253              
254             Eduardo Iturrate, ed@iturrate.comE
255              
256             =head1 COPYRIGHT AND LICENSE
257              
258             Copyright (C) 2016 by Eduardo Iturrate
259              
260             This library is free software; you can redistribute it and/or modify
261             it under the same terms as Perl itself, either Perl version 5.8.8 or,
262             at your option, any later version of Perl 5 you may have available.
263              
264             =cut