File Coverage

blib/lib/Lingua/DxExtractor.pm
Criterion Covered Total %
statement 17 104 16.3
branch 0 30 0.0
condition 0 16 0.0
subroutine 6 11 54.5
pod 0 5 0.0
total 23 166 13.8


line stmt bran cond sub pod time code
1             package Lingua::DxExtractor;
2              
3 1     1   13368 use 5.008008;
  1         4  
4 1     1   4 use strict;
  1         1  
  1         15  
5 1     1   3 use warnings;
  1         8  
  1         39  
6              
7             our $VERSION = '2.22';
8              
9 1     1   443 use Text::Sentence qw( split_sentences );
  1         321  
  1         62  
10 1     1   383 use Lingua::NegEx qw( negation_scope );
  1         770  
  1         69  
11              
12              
13             use Class::MakeMethods (
14 1         6 'Template::Hash:array' => [
15             'target_phrases', 'skip_phrases',
16             'absolute_present_phrases', 'absolute_negative_phrases',
17             ],
18             'Template::Hash:scalar' => [
19             'orig_text', 'final_answer', 'ambiguous', 'start_phrase',
20             ],
21             'Template::Hash:hash' => [
22             'target_sentence', 'negex_debug',
23             ],
24 1     1   520 );
  1         1663  
25              
26             ######################################################################
27              
28             sub new {
29 0     0 0   my $callee = shift;
30 0   0       my $package = ref $callee || $callee;
31 0           my $self = shift;
32 0           bless $self, $package;
33 0 0         die unless $self->target_phrases;
34 0           return $self;
35             }
36              
37             sub process_text {
38 0     0 0   my ($self,$text) = @_;
39 0           $self->orig_text( $text );
40 0           $self->examine_text;
41 0           return $self->final_answer;
42             }
43              
44             sub examine_text {
45 0     0 0   my $self = shift;
46 0           my $text = $self->orig_text;
47 0 0         return if ! $text;
48            
49 0           my $start_phrase = $self->start_phrase;
50 0 0 0       if ( $start_phrase and $text =~ /$start_phrase(.*)\Z/ix ) {
51 0           $text = $1;
52             }
53 0           $text =~ s/\s+/ /gxms;
54             # treat colon ':' like a period '.'
55 0           $text =~ s/:/./g;
56            
57 0           my @sentences = split_sentences( $text );
58 0           foreach my $line ( @sentences ) {
59            
60 0 0         next if scalar grep { $line =~ /\b$_\b/i } @{$self->skip_phrases};
  0            
  0            
61 0 0         next unless grep { $line =~ /\b$_\b/i } @{$self->target_phrases};
  0            
  0            
62            
63 0           $self->target_sentence->{ $line } = 'present';
64 0           my $n_scope = negation_scope( $line );
65              
66 0 0         if ( $n_scope ) {
67 0           $self->negex_debug->{ $line } = @$n_scope[0] . ' - ' . @$n_scope[1];
68 0           my @words;
69 0           foreach ( split /\s/xms, $line ) {
70 0           s/\W//xms;
71 0           push @words, $_;
72             }
73 0           foreach my $c ( @$n_scope[0] .. @$n_scope[1] ) {
74 0           my @match = grep { $words[ $c ] =~ /$_/ixms } @{$self->target_phrases};
  0            
  0            
75            
76 0 0         if ( scalar @match ) {
77 0           $self->target_sentence->{ $line } = 'absent';
78 0           last;
79             }
80             }
81             }
82             }
83            
84 0 0         if ( scalar keys %{$self->target_sentence} ) {
  0 0          
85 0           my %final_answer;
86 0           while ( my($sentence,$answer) = each %{$self->target_sentence} ) {
  0            
87 0           $final_answer{ $answer }++;
88 0           $self->final_answer( $answer );
89             }
90 0 0         if ( scalar keys %final_answer > 1 ) {
91 0           $self->ambiguous( 1 );
92 0   0       $final_answer{ 'absent' } ||= 0;
93 0   0       $final_answer{ 'present' } ||= 0;
94              
95 0 0         if ( $final_answer{ 'absent' } > $final_answer{ 'present' } ) {
    0          
96 0           $self->final_answer( 'absent' );
97             } elsif ( $final_answer{ 'present' } > $final_answer{ 'absent' } ) {
98 0           $self->final_answer( 'present' );
99             } else {
100             # There are an equal number of absent/present findings - defaulting to present
101 0           $self->final_answer( 'present' );
102             }
103             }
104              
105 0           } elsif ( ! scalar keys %{$self->target_sentence} ) {
106 0           $self->final_answer( 'absent' );
107             }
108            
109 0 0 0       if ( grep { $text =~ /$_/i } @{$self->absolute_present_phrases} and $self->final_answer eq 'absent' ) {
  0            
  0            
110 0           $self->final_answer( 'present' );
111 0           $self->ambiguous( 2 );
112             }
113 0 0 0       if ( grep { $text =~ /$_/i } @{$self->absolute_negative_phrases} and $self->final_answer eq 'present' ) {
  0            
  0            
114 0           $self->final_answer( 'absent' );
115 0           $self->ambiguous( 3 );
116             }
117             }
118              
119              
120             sub debug {
121 0     0 0   my $self = shift;
122 0           my $out = "DxExtractor Debug:\n";
123 0           $out .= "Target Words: " . (join ', ', @{$self->target_phrases}) . "\n";
  0            
124 0           $out .= "Skip Words: " . (join ', ', @{$self->skip_phrases}) . "\n";
  0            
125 0           $out .= "Sentences:\n";
126 0           while ( my($sentence,$answer) = each %{$self->target_sentence} ) {
  0            
127 0           $out .= "$sentence\n$answer\n";
128 0           $out .= "NegEx: " . $self->negex_debug->{ $sentence } . "\n";
129             }
130 0           $out .= "Final Answer: " . $self->final_answer . "\n";;
131 0 0         $out .= "Ambiguous: " . ($self->ambiguous ? 'Yes' : 'No');
132 0           return $out;
133             }
134              
135             sub reset {
136 0     0 0   my $self = shift;
137 0           $self->orig_text( '' );
138 0           $self->target_sentence( {} );
139 0           $self->final_answer( '' );
140 0           $self->ambiguous( '' );
141             }
142              
143             1;
144              
145             =head1 NAME
146              
147             Lingua::DxExtractor - Perl extension to extract the presence or absence of a clinical condition from radiology reports.
148              
149             =head1 SYNOPSIS
150              
151             use Lingua::DxExtractor;
152              
153             $extractor = Lingua::DxExtractor->new( {
154             target_phrases => [ qw( embolus embolism emboli defect pe clot clots ) ],
155             skip_phrases => [ qw( history indication technique nondiagnostic ) ],
156             absolute_present_phrases => [ ( 'This is definitely a PE', 'absolutely positive for pe' ) ],
157             absolute_negative_phrases => [ ( 'there is no way this is a pe', 'no clots seen at all' ) ],
158             start_phrase => 'Impression:',
159             } );
160            
161             $text = <
162             Indication: To rule out pulmonary embolism. Findings: There is no evidence of vascular filling defect to the subsegmental level...
163             END
164              
165             $final_answer = $extractor->process_text( $text ); # 'absent' or 'present'
166             $is_final_answer_ambiguous = $extractor->ambiguous; # 1 or 0
167             $debug = $extractor->debug;
168              
169             $original_text = $extractor->orig_text;
170             $final_answer = $extractor->final_answer;
171             $ambiguous = $extractor->ambiguous;
172              
173             $extractor->clear; # clears orig_text, final_answer, target_sentence and ambiguous
174              
175             =head1 DESCRIPTION
176              
177             A tool to be used to look for the presence or absence of a clinical condition as reported in radiology reports. The extractor reports a 'final answer', 'absent' or 'present', as well as reports whether this answer is 'ambiguous' or not.
178              
179             The 'use case' for this is when performing a research project with a large number of records and you need to identify a subset based on a diagnostic entity, you can use this tool to reduce the number of charts that have to be manually examined. In this 'use case' I wanted to keep the sensitivity as high as possible in order to not miss real cases.
180              
181             The radiographic reports don't require textual preprocessing however clearly the selection of target_phrases and skip_phrases requires reading through reports to get a sense of what vocabulary is being used in the particular dataset that is being evaluated.
182              
183             Negated terms are identified using Lingua::NegEx which is a perl implementation of Wendy Chapman's NegEx algorithm.
184              
185             target_phrases( \@words );
186              
187             This is a list of phrases that describe the clinical entity in question. All forms of the entity in question need to explicitly stated since the package is currently not using lemmatization or stemming.
188              
189             skip_phrases( \@skip );
190              
191             This is a list of phrases that can be used to eliminate sentences in the text that might confuse the extractor. For example most radiographic reports start with a brief description of the indication for the test. This statement may state the clinical entity in question but does not mean it is present in the study (ie. Indication: to rule out pulmonary embolism).
192              
193             absolute_negative_phrases( \@absolute_negative_assertions );
194            
195             This is a list of phrases which if present in the text mean the condition is certainly not there and all ambiguity checking can be skipped.
196            
197             absolute_present_phrases( \@absolute_positive_assertions );
198              
199             This is a list of phrases which if present in the text mean the condition is certainly there and all ambiguity checking can be skipped.
200              
201             start_phrase( $start_phrase );
202              
203             A phrase if present in the text which indicates where to focus the search.
204              
205             =head2 GETTING STARTED
206              
207             Define your extraction rules:
208              
209             1. Choose your target phrases that describe the condition of interest.
210              
211             2. Choose any skip phrases that will eliminate a sentence from an analysis if the phrase exists. For example, many radiology reports may have a sentence that states the indication for the study which might state the condition you are looking for but doesn't indicate that the condition was found in the study.
212              
213             3. Choose any absolute present phrases to use which are phrases that if they are present in the text mean the condition is certainly there and all ambiguity checking can be skipped.
214              
215             4. Choose any absolute negative phrases that mean the condition is absolutely not present and all ambiguity checking can be skipped.
216              
217             5. Choose a start phrase which can be used to focus the search to only the text that occurs after this start phrase. For example, if there is an 'Impression:' section at the end of a report that summarizes findings then skip to that section to avoid possible ambiguity generated by the verbose description of findings in the full report.
218            
219             * Only the target phrases are required.
220            
221             Once defined the extractor object you created can be used to analyze target text. The analysis consists of:
222              
223             1. If there is a start phrase defined, eliminate all text for analysis prior to the start phrase.
224              
225             2. Change all colons ':' into periods '.' to treat them as sentence breaks
226              
227             3. Split text into sentences using Text::Sentence
228              
229             4. Examine each sentence for the presence of any skip phrases and if found, ignore the sentence.
230              
231             5. Examine each sentence for the presence of any target phrases and if found evaluate for negation using Lingua:Negex.
232             if no negation found, mark this sentence as 'present'
233             if the target phrase is negated then mark the sentence as 'absent'
234              
235             6. Go through all the flagged sentences and see if there is any discrepancy -- if so set the ambiguous flag. If there are more sentences that indicate absent than those that indicate present then mark the final answer as absent and vice versa. If there are an equal number of absent and present phrases mark the final answer as present.
236              
237             7. The possible values for ambiguous: 1 = there were some positive and some absent sentences; 2 = there was a match on an absolute positive phrase but the answer was going to be absent had this absolute phrase not been indicated; 3 = there was a match on an absolute negative phrase but the answer was going to be present had this absolute phrase not been indicated;
238              
239             =head2 EXPORT
240              
241             None by default.
242              
243             =head1 SEE ALSO
244              
245             This module depends on:
246              
247             Lingua::NegEx
248              
249             Text::Sentence
250              
251             Class::MakeMethods
252              
253             =head1 To Do
254              
255             Add lemmatization or stemming to target_phrases so you don't have to explicitly write out all forms of words
256              
257             =head1 AUTHOR
258              
259             Eduardp Iturrate, Eed@iturrate.comE
260              
261             =head1 COPYRIGHT AND LICENSE
262              
263             Copyright (C) 2016 by Eduardo Iturrate
264              
265             This library is free software; you can redistribute it and/or modify
266             it under the same terms as Perl itself, either Perl version 5.8.8 or,
267             at your option, any later version of Perl 5 you may have available.
268              
269             =cut