File Coverage

lib/Convert/Pheno/CSV.pm

Criterion	Covered	Total	%
statement	220	267	82.4
branch	46	66	69.7
condition	5	15	33.3
subroutine	34	38	89.4
pod	0	17	0.0
total	305	403	75.6

line	stmt	bran	cond	sub	pod	time	code
1							package Convert::Pheno::CSV;
2
3	6			6		46	use strict;
	6					14
	6					244
4	6			6		32	use warnings;
	6					17
	6					176
5	6			6		43	use autodie;
	6					19
	6					54
6	6			6		34369	use feature qw(say);
	6					16
	6					686
7	6			6		45	use File::Basename;
	6					16
	6					581
8	6			6		6630	use Text::CSV_XS qw(csv);
	6					102480
	6					500
9	6			6		3777	use Sort::Naturally qw(nsort);
	6					28887
	6					392
10	6			6		43	use List::Util qw(any);
	6					10
	6					388
11	6			6		37	use File::Spec::Functions qw(catdir);
	6					10
	6					308
12	6			6		4925	use IO::Compress::Gzip qw($GzipError);
	6					204087
	6					681
13	6			6		3100	use IO::Uncompress::Gunzip qw($GunzipError);
	6					78211
	6					630
14
15							#use Devel::Size qw(size total_size);
16	6			6		50	use Convert::Pheno;
	6					12
	6					226
17	6			6		3310	use Convert::Pheno::OMOP;
	6					20
	6					730
18	6			6		2764	use Convert::Pheno::IO;
	6					24
	6					377
19	6			6		2428	use Convert::Pheno::Schema;
	6					33
	6					374
20	6			6		62	use Convert::Pheno::Mapping;
	6					12
	6					951
21	6			6		55	use Exporter 'import';
	6					22
	6					497
22							our @EXPORT =
23							qw(read_csv read_csv_stream read_redcap_dict_and_mapping_file transpose_ohdsi_dictionary read_sqldump_stream read_sqldump sqldump2csv transpose_omop_data_structure open_filehandle load_exposures transpose_visit_occurrence);
24
25	6			6		47	use constant DEVEL_MODE => 0;
	6					12
	6					24243
26
27							#########################
28							#########################
29							# SUBROUTINES FOR CSV #
30							#########################
31							#########################
32
33							sub read_redcap_dictionary {
34
35	13			13	0	35	my $filepath = shift;
36
37							# Define split record separator from file extension
38	13					55	my ( $separator, $encoding ) = define_separator( $filepath, undef );
39
40							# We'll create an HoH using as 1D-key the 'Variable / Field Name'
41	13					53	my $key = 'Variable / Field Name';
42
43							# We'll be adding the key <_labels>. See sub add_labels
44	13					34	my $labels = 'Choices, Calculations, OR Slider Labels';
45
46							# Loading data directly from Text::CSV_XS
47							# NB1: We want HoH and sub read_csv returns AoH
48							# NB2: By default the Text::CSV module treats all fields in a CSV file as strings, regardless of their actual data type.
49							my $hoh = csv(
50							in => $filepath,
51							sep_char => $separator,
52
53							#binary => 1, # default
54							auto_diag => 1,
55							encoding => $encoding,
56							key => $key,
57	5083			5083		357053	on_in => sub { $_{_labels} = add_labels( $_{$labels} ) }
58	13					167	);
59	13					572	return $hoh;
60							}
61
62							sub add_labels {
63
64	5083			5083	0	6128	my $value = shift;
65
66							# * IMPORTANT *
67							# This sub can return undef, i.e., $_{labels} = undef
68							# That's OK as we won't perform exists $_{_label}
69							# Note that in $hoh (above) empty columns are key = ''.
70
71							# Premature return if empty ('' = 0)
72	5083	100				9213	return undef unless $value; # perlcritic Severity: 5
73
74							# We'll skip values that don't provide even number of key-values
75	2912					22754	my @tmp = map { s/^\s//; s/\s+$//; $_; }
	27053					41637
	27053					35676
	27053					36866
76							( split /\\|\|,/, $value ); # perlcritic Severity: 5
77
78							# Return undef for non-valid entries
79	2912	100				17867	return @tmp % 2 == 0 ? {@tmp} : undef;
80							}
81
82							sub read_redcap_dict_and_mapping_file {
83
84	13			13	0	46	my $arg = shift;
85
86							# Read and load REDCap CSV dictionary
87	13					71	my $data_redcap_dict = read_redcap_dictionary( $arg->{redcap_dictionary} );
88
89							# Read and load mapping file
90							my $data_mapping_file =
91	13					172	io_yaml_or_json( { filepath => $arg->{mapping_file}, mode => 'read' } );
92
93							# Validate mapping file against JSON schema
94							my $jv = Convert::Pheno::Schema->new(
95							{
96							data => $data_mapping_file,
97							debug => $arg->{self_validate_schema},
98							schema_file => $arg->{schema_file}
99							}
100	7					323	);
101	7					61	$jv->json_validate;
102
103							# Return if succesful
104	6					188	return ( $data_redcap_dict, $data_mapping_file );
105							}
106
107							sub transpose_ohdsi_dictionary {
108
109	3			3	0	9	my $data = shift;
110	3					12	my $column = 'concept_id';
111
112							# The idea is the following:
113							# $data comes as an array (from SQL/CSV)
114							#
115							# $VAR1 = [
116							# {
117							# 'concept_class_id' => '4-char billing code',
118							# 'concept_code' => 'K92.2',
119							# 'concept_id' => 35208414,
120							# 'concept_name' => 'Gastrointestinal hemorrhage, unspecified',
121							# 'domain_id' => 'Condition',
122							# 'invalid_reason' => undef,
123							# 'standard_concept' => undef,
124							# 'valid_end_date' => '2099-12-31',
125							# 'valid_start_date' => '2007-01-01',
126							# 'vocabulary_id' => 'ICD10CM'
127							# },
128							#
129							# and we convert it to hash to allow for quick searches by 'concept_id'
130							#
131							# $VAR1 = {
132							# '1107830' => {
133							# 'concept_class_id' => 'Ingredient',
134							# 'concept_code' => 28889,
135							# 'concept_id' => 1107830,
136							# 'concept_name' => 'Loratadine',
137							# 'domain_id' => 'Drug',
138							# 'invalid_reason' => undef,
139							# 'standard_concept' => 'S',
140							# 'valid_end_date' => '2099-12-31',
141							# 'valid_start_date' => '1970-01-01',
142							# 'vocabulary_id' => 'RxNorm'
143							# },
144							#
145							# NB: We store all columns yet we'll use 4:
146							# 'concept_id', 'concept_code', 'concept_name', 'vocabulary_id'
147							# Note that we're duplicating @$data with $hoh
148							#my $hoh = { map { $_->{$column} => $_ } @{$data} }; <--map is slower than for
149	3					8	my $hoh;
150	3					5	for my $item ( @{$data} ) {
	3					62
151	1332					4328	$hoh->{ $item->{$column} } = $item;
152							}
153
154							#say "transpose_ohdsi_dictionary:", to_gb( total_size($hoh) ) if DEVEL_MODE;
155	3					26	return $hoh;
156							}
157
158							sub read_sqldump_stream {
159
160	1			1	0	3	my $arg = shift;
161	1					3	my $filein = $arg->{in};
162	1					4	my $self = $arg->{self};
163	1					3	my $person = $arg->{person};
164	1					2	my $fileout = $self->{out_file};
165	1					2	my $switch = 0;
166	1					2	my @headers;
167	1					2	my $table_name = $self->{omop_tables}[0];
168	1					3	my $table_name_lc = lc($table_name);
169
170							# Open filehandles
171	1					5	my $fh_in = open_filehandle( $filein, 'r' );
172	1					4	my $fh_out = open_filehandle( $fileout, 'a' );
173
174							# Start printing the array
175							#say $fh_out "[";
176
177							# Now we we start processing line by line
178	1					3	my $count = 0;
179	1					7	while ( my $line = <$fh_in> ) {
180
181							# Only parsing $table_name_lc and discarding others
182							# Note that double quotes are optional
183							# - COPY "OMOP_cdm_eunomia".person
184							# . COPY omop_cdm_eunomia_2.person
185	386339	100				27855128	if ( $line =~ /^COPY \"?(\w+)\"?\.$table_name_lc / ) {
186	1					9	chomp $line;
187
188							# Create an array to hold the column names for this table
189	1					22	$line =~ s/[,]//g; # getting rid of (),
190	1					29	@headers = split /\s+/, $line;
191
192							# Discarding headers which are not terms/variables
193	1					18	@headers = @headers[ 2 .. $#headers - 2 ];
194
195							# Turning on the switch for later
196	1					3	$switch++;
197
198							# Jump one line
199	1					5	$line = <$fh_in>;
200
201							}
202
203							# Loading the data if $switch
204	386339	100				1052483	if ($switch) {
205
206							# get rid of \n
207	67708					145153	chomp $line;
208
209							# Order matters. We exit before loading
210	67708	100				125231	last if $line =~ /^\\\.$/;
211
212							# Solitting by tab, it's ok
213	67707					354159	my @fields = split /\t/, $line;
214
215							# Using tmp hashref to load all fields at once with slice
216	67707					86039	my $hash_slice;
217	67707					808117	@{$hash_slice}{@headers} =
218	67707					106292	map { dotify_and_coerce_number($_) } @fields;
	1421847					1980952
219
220							# Initialize $data each time
221							# Adding them as an array element (AoH)
222							die
223							"We could not find person_id:$hash_slice->{person_id}. Try increasing the #lines with --max-lines-sql\n"
224	67707	50				280799	unless exists $person->{ $hash_slice->{person_id} };
225
226							# Increase counter
227	67707					84416	$count++;
228
229							# Encode data
230	67707					116376	my $encoded_data =
231							encode_omop_stream( $table_name, $hash_slice, $person, $count,
232							$self );
233
234							# Only after encoding we are able to discard 'null'
235	67707	50				388812	say $fh_out $encoded_data if $encoded_data ne 'null';
236
237							# Print if verbose
238							say "Rows processed: $count"
239	67707	50	33			9295205	if ( $self->{verbose} && $count % 10_000 == 0 );
240							}
241							}
242	1	50				6	say "==============\nRows total: $count\n" if $self->{verbose};
243
244							#say $fh_out "]"; # not needed
245
246							# Closing filehandles
247	1					10	close $fh_in;
248	1					227	close $fh_out;
249	1					539	return 1;
250							}
251
252							sub encode_omop_stream {
253
254	67707			67707	0	131697	my ( $table_name, $hash_slice, $person, $count, $self ) = @_;
255
256							# * IMPORTANT *
257							# We only print person_id ONCE!!!
258	67707					104837	my $person_id = $hash_slice->{person_id};
259							my $data = {
260							$table_name => [$hash_slice],
261							PERSON => $count == 1
262							? $person->{$person_id}
263							: {
264	67707	100				166979	map { $_ => $person->{$person_id}{$_} }
	203118					614487
265							qw(person_id gender_concept_id birth_datetime)
266							}
267							};
268
269							# Print line by line (->canonical has some overhead but needed for t/)
270	67707					468828	return JSON::XS->new->utf8->canonical->encode(
271							Convert::Pheno::omop2bff_stream_processing( $self, $data ) );
272							}
273
274							sub read_sqldump {
275
276	3			3	0	9	my $arg = shift;
277	3					9	my $filepath = $arg->{in};
278	3					9	my $self = $arg->{self};
279
280							# Before resorting to writting this subroutine I performed an exhaustive search on CPAN:
281							# - Tested MySQL::Dump::Parser::XS but I could not make it work...
282							# - App-MysqlUtils-0.022 has a CLI utility (mysql-sql-dump-extract-tables)
283							# - Of course one can always use *nix tools (sed, grep, awk, etc) or other programming languages....
284							# Anyway, I ended up writting the parser myself...
285							# The parser is based in reading COPY paragraphs from PostgreSQL dump by using Perl's paragraph mode $/ = "";
286							# NB: Each paragraph (TABLE) is loaded into memory. Not great for large files.
287
288							# Define variables that modify what we load
289	3					8	my $max_lines_sql = $self->{max_lines_sql};
290	3					7	my @omop_tables = @{ $self->{omop_tables} };
	3					10
291
292							# Set record separator to paragraph
293	3					18	local $/ = "";
294
295							#COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, attribute_description, attribute_type_concept_id, attribute_syntax) FROM stdin;
296							# ......
297							# \.
298
299							# Start reading the SQL dump
300	3					20	my $fh = open_filehandle( $filepath, 'r' );
301
302							# We'll store the data in the hashref $data
303	3					12	my $data = {};
304
305							# Process paragraphs
306	3					12436	while ( my $paragraph = <$fh> ) {
307
308							# Discarding paragraphs not having m/^COPY/
309	618	100				1729075	next unless $paragraph =~ m/^COPY/;
310
311							# Load all lines into an array (via "\n")
312	117					831265	my @lines = split /\n/, $paragraph;
313	117	100				1326	next unless scalar @lines > 2;
314	54					145	pop @lines; # last line eq '\.'
315
316							# First line contains the headers
317							#COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, ..., attribute_syntax) FROM stdin;
318	54					1213	$lines[0] =~ s/[,]//g; # getting rid of (),
319	54					896	my @headers = split /\s+/, $lines[0];
320	54					432	my $table_name =
321							uc( ( split /\./, $headers[1] )[1] ); # ATTRIBUTE_DEFINITION
322
323							# Discarding non @$omop_tables:
324							# This step improves RAM consumption
325	54	100		283		835	next unless any { $_ eq $table_name } @omop_tables;
	283					63312
326
327							# Say if verbose
328	19	50				171	say "Processing table ... <$table_name>" if $self->{verbose};
329
330							# Discarding first line
331	19					43	shift @lines;
332
333							# Discarding headers which are not terms/variables
334	19					348	@headers = @headers[ 2 .. $#headers - 2 ];
335
336							# Initializing $data>key as empty arrayref
337	19					126	$data->{$table_name} = [];
338
339							# Ad hoc counter for dev
340	19					49	my $count = 0;
341
342							# Processing line by line
343	19					100	for my $line (@lines) {
344	12063					12343	$count++;
345
346							# Columns are separated by \t
347							# NB: 'split' and 'Text::CSV' split to strings
348							# We go with 'split'. Coercing a posteriori
349	12063					67123	my @fields = split /\t/, $line;
350
351							# Loading the fields like this:
352							#
353							# $VAR1 = {
354							# 'PERSON' => [ # NB: This is the table name
355							# {
356							# 'person_id' => 123,
357							# 'test' => 'abc'
358							# },
359							# {
360							# 'person_id' => 456,
361							# 'test' => 'def'
362							# }
363							# ]
364							# };
365
366							# Using tmp hashref to load all fields at once with slice
367	12063					13739	my $hash_slice;
368	12063					151462	@{$hash_slice}{@headers} =
369	12063					15359	map { dotify_and_coerce_number($_) } @fields;
	197109					272283
370
371							# Adding them as an array element (AoH)
372	12063					19713	push @{ $data->{$table_name} }, $hash_slice;
	12063					21168
373
374							# adhoc filter to speed-up development
375	12063	100				20435	last if $count == $max_lines_sql;
376							say "Rows processed: $count"
377	12049	50	33			36170	if ( $self->{verbose} && $count % 1_000 == 0 );
378
379							}
380
381							# Print if verbose
382	19	50				65438	say "==============\nRows total: $count\n" if $self->{verbose};
383							}
384	3					52	close $fh;
385
386							#say total_size($data) and die;
387	3					3147	return $data;
388							}
389
390							sub sqldump2csv {
391
392	0			0	0	0	my ( $data, $dir ) = @_;
393
394							# CSV sep character
395	0					0	my $sep = "\t";
396
397							# The idea is to save a CSV table for each $data->key
398	0					0	for my $table ( keys %{$data} ) {
	0					0
399
400							# File path for CSV file
401	0					0	my $filepath = catdir( $dir, "$table.csv" );
402
403							# We get header fields from row[0] and nsort them
404							# NB: The order will not be the same as that in <.sql>
405	0					0	my @headers = nsort keys %{ $data->{$table}[0] };
	0					0
406
407							# Print data as CSV
408							write_csv(
409							{
410							sep => $sep,
411							filepath => $filepath,
412							headers => \@headers,
413	0					0	data => $data->{$table}
414							}
415							);
416							}
417	0					0	return 1;
418							}
419
420							sub transpose_omop_data_structure {
421
422	2			2	0	7	my $data = shift;
423
424							# The situation is the following, $data comes in format:
425							#
426							#$VAR1 = {
427							# 'MEASUREMENT' => [
428							# {
429							# 'measurement_concept_id' => 1,
430							# 'person_id' => 666
431							# },
432							# {
433							# 'measurement_concept_id' => 2,
434							# 'person_id' => 666
435							# }
436							# ],
437							# 'PERSON' => [
438							# {
439							# 'person_id' => 666
440							# },
441							# {
442							# 'person_id' => 1
443							# }
444							# ]
445							# };
446
447							# where all 'person_id' are together inside the TABLE_NAME.
448							# But, BFF "ideally" works at the individual level so we are going to
449							# transpose the data structure to end up into something like this
450							# NB: MEASUREMENT and OBSERVATION (among others, i.e., CONDITION_OCCURRENCE, PROCEDURE_OCCURRENCE)
451							# can have multiple values for one 'person_id' so they will be loaded as arrays
452							#
453							#
454							#$VAR1 = {
455							# '1' => {
456							# 'PERSON' => {
457							# 'person_id' => 1
458							# }
459							# },
460							# '666' => {
461							# 'MEASUREMENT' => [
462							# {
463							# 'measurement_concept_id' => 1,
464							# 'person_id' => 666
465							# },
466							# {
467							# 'measurement_concept_id' => 2,
468							# 'person_id' => 666
469							# }
470							# ],
471							# 'PERSON' => {
472							# 'person_id' => 666
473							# }
474							# }
475							# };
476
477	2					5	my $omop_person_id = {};
478
479							# Only performed for $omop_main_table
480	2					5	for my $table ( @{ $omop_main_table->{$omop_version} } ) { # global
	2					15
481
482							# Loop over tables
483	30					30	for my $item ( @{ $data->{$table} } ) {
	30					128
484
485	6000	50	33			30039	if ( exists $item->{person_id} && $item->{person_id} ) {
486	6000					6755	my $person_id = $item->{person_id};
487
488							# {person_id} can have multiple rows in @omop_array_tables
489	6000	100		21000		13417	if ( any { $_ eq $table } @omop_array_tables ) {
	21000					22674
490	5000					4712	push @{ $omop_person_id->{$person_id}{$table} },
	5000					18778
491							$item; # array
492							}
493
494							# {person_id} only has one value in a given table
495							else {
496	1000					3841	$omop_person_id->{$person_id}{$table} = $item; # scalar
497							}
498							}
499							}
500							}
501
502							# To get back unused memory for later..
503	2					6	$data = undef;
504
505							# Finally we get rid of the 'person_id' key and return values as an array
506							#
507							#$VAR1 = [
508							# {
509							# 'PERSON' => {
510							# 'person_id' => 1
511							# }
512							# },
513							# ------------------------------------------------
514							# {
515							# 'MEASUREMENT' => [
516							# {
517							# 'measurement_concept_id' => 1,
518							# 'person_id' => 666
519							# },
520							# {
521							# 'measurement_concept_id' => 2,
522							# 'person_id' => 666
523							# }
524							# ],
525							# 'PERSON' => {
526							# 'person_id' => 666
527							# }
528							# }
529							# ];
530							# NB: We nsort keys to always have the same result but it's not needed
531							# v1 - Easier but duplicates data structure
532							# my $aoh = [ map { $omop_person_id->{$_} } nsort keys %{$omop_person_id} ];
533							# v2 - This version cleans memory after loading $aoh <=== Implemented
534	2					7	my $aoh;
535	2					10	for my $key ( nsort keys %{$omop_person_id} ) {
	2					566
536	1940					576827	push @{$aoh}, $omop_person_id->{$key};
	1940					3698
537	1940					2190	delete $omop_person_id->{$key};
538							}
539	2					1224	if (DEVEL_MODE) {
540
541							#say 'transpose_omop_data_structure(omop_person_id):',
542							# to_gb( total_size($omop_person_id) );
543							#say 'transpose_omop_data_structure(map):', to_gb( total_size($aoh) );
544							}
545	2					27	return $aoh;
546							}
547
548							sub transpose_visit_occurrence {
549
550	3			3	0	14	my $data = shift; # arrayref
551
552							# Going from
553							#$VAR1 = [
554							# {
555							# 'admitting_source_concept_id' => 0,
556							# 'visit_occurrence_id' => 85,
557							# ...
558							# }
559							# ];
560
561							# To
562							#$VAR1 = {
563							# '85' => {
564							# 'admitting_source_concept_id' => 0,
565							# 'visit_occurrence_id' => 85,
566							# ...
567							# }
568							# };
569							#my $hash = { map { $_->{visit_occurrence_id} => $_ } @$data }; # map is slower than for
570	3					10	my $hash;
571	3					12	for my $item (@$data) {
572							my $key = $item->{visit_occurrence_id}
573	2037					2759	; # otherwise $item->{visit_occurrence_id} goes from Int to Str in JSON and tests fail
574	2037					4238	$hash->{$key} = $item;
575							}
576	3					20	return $hash;
577							}
578
579							sub read_csv {
580
581	15			15	0	39	my $arg = shift;
582	15					49	my $filepath = $arg->{in};
583	15					38	my $sep = $arg->{sep};
584
585							# Define split record separator from file extension
586	15					68	my ( $separator, $encoding ) = define_separator( $filepath, $sep );
587
588							# Transform $filepath into an AoH
589							# Using Text::CSV_XS functional interface
590	15					147	my $aoh = csv(
591							in => $filepath,
592							sep_char => $separator,
593							headers => "auto",
594							eol => "\n",
595
596							# binary => 1, # default
597							encoding => $encoding,
598							auto_diag => 1
599							);
600
601							# $aoh = [
602							# {
603							# 'abdominal_mass' => 0,
604							# 'age_first_diagnosis' => 0,
605							# 'alcohol' => 4,
606							# }, {},,,
607							# ]
608
609							# Coercing the data before returning it
610	14					218413	for my $item (@$aoh) {
611	1035					1272	for my $key ( keys %{$item} ) {
	1035					19831
612	177462					253307	$item->{$key} = dotify_and_coerce_number( $item->{$key} );
613							}
614							}
615
616	14					184	return $aoh;
617							}
618
619							sub read_csv_stream {
620
621	0			0	0	0	my $arg = shift;
622	0					0	my $filein = $arg->{in};
623	0					0	my $self = $arg->{self};
624	0					0	my $sep = $arg->{sep};
625	0					0	my $person = $arg->{person};
626	0					0	my $fileout = $self->{out_file};
627
628							# Define split record separator
629	0					0	my ( $separator, $encoding, $table_name ) =
630							define_separator( $filein, $sep );
631	0					0	my $table_name_lc = lc($table_name);
632
633							# Using Text::CSV_XS OO interface
634	0					0	my $csv = Text::CSV_XS->new(
635							{ binary => 1, auto_diag => 1, sep_char => $separator, eol => "\n" } );
636
637							# Open filehandles
638	0					0	my $fh_in = open_filehandle( $filein, 'r' );
639	0					0	my $fh_out = open_filehandle( $fileout, 'a' );
640
641							# Get rid of \n on first line
642	0					0	chomp( my $line = <$fh_in> );
643	0					0	my @headers = split /$separator/, $line;
644
645	0					0	my $hash_slice;
646	0					0	my $count = 0;
647
648							# * IMPORTANT *
649							# On Feb-19-2023 I tested Parallel::ForkManager and:
650							# 1 - The performance was by far slower than w/o it
651							# 2 - We hot SQLite errors for concurring fh
652							# Thus, it was not implemented
653
654	0					0	while ( my $row = $csv->getline($fh_in) ) {
655
656							# Load the values a a hash slice;
657	0					0	my $hash_slice;
658	0					0	@{$hash_slice}{@headers} = map { dotify_and_coerce_number($_) } @$row;
	0					0
	0					0
659
660							# Encode data
661	0					0	my $encoded_data =
662							encode_omop_stream( $table_name, $hash_slice, $person, $count,
663							$self );
664
665							# Only after encoding we are able to discard 'null'
666	0	0				0	say $fh_out $encoded_data if $encoded_data ne 'null';
667
668							# Increment $count
669	0					0	$count++;
670							say "Rows processed: $count"
671	0	0	0			0	if ( $self->{verbose} && $count % 10_000 == 0 );
672							}
673	0	0				0	say "==============\nRows total: $count\n" if $self->{verbose};
674
675	0					0	close $fh_in;
676	0					0	close $fh_out;
677	0					0	return 1;
678							}
679
680							sub write_csv {
681
682	0			0	0	0	my $arg = shift;
683	0					0	my $sep = $arg->{sep};
684	0					0	my $aoh = $arg->{data};
685	0					0	my $filepath = $arg->{filepath};
686	0					0	my $headers = $arg->{headers};
687
688							# Using Text::CSV_XS functional interface
689							# NB: About speed:
690							# https://metacpan.org/pod/Text::CSV#csv1
691							csv(
692							in => $aoh,
693							out => $filepath,
694							sep_char => $sep,
695							eol => "\n",
696
697							#binary => 1, # default
698							encoding => 'UTF-8',
699							headers => $arg->{headers}
700	0					0	);
701	0					0	return 1;
702							}
703
704							sub open_filehandle {
705
706	7			7	0	29	my ( $filepath, $mode ) = @_;
707	7	50				76	my $handle = $mode eq 'a' ? '>>' : $mode eq 'w' ? '>' : '<';
		100
708	7					15	my $fh;
709	7	100				42	if ($filepath =~ /\.gz$/) {
710	3	100	66			20	if ($mode eq 'a' \|\| $mode eq 'w') {
711	1	50				15	$fh = IO::Compress::Gzip->new($filepath, Append => ($mode eq 'a' ? 1 : 0));
712							}
713							else {
714	2					30	$fh = IO::Uncompress::Gunzip->new($filepath, MultiStream => 1);
715							}
716	3					9516	binmode($fh, ":encoding(UTF-8)");
717							}
718							else {
719	4					39	open $fh, qq($handle:encoding(UTF-8)), $filepath;
720							}
721	7					5802	return $fh;
722							}
723
724							sub define_separator {
725
726	28			28	0	93	my ( $filepath, $sep ) = @_;
727
728							# Define split record separator from file extension
729	28					79	my @exts = map { $_, $_ . '.gz' } qw(.csv .tsv .sql .txt);
	112					359
730	28					2871	my ( $table_name, undef, $ext ) = fileparse( $filepath, @exts );
731
732							# Defining separator character
733	28	50				190	my $separator =
		50
		50
		100
		100
734							$sep
735							? $sep
736							: $ext eq '.csv' ? ';' # Note we don't use comma but semicolon
737							: $ext eq '.csv.gz' ? ';' # idem
738							: $ext eq '.tsv' ? "\t"
739							: $ext eq '.tsv.gz' ? "\t"
740							: "\t";
741
742	28	50				130	my $encoding =
743							$ext =~ m/\.gz/ ? ':gzip:encoding(utf-8)' : 'encoding(utf-8)';
744
745							# Return 3 but some get only 2
746	28					129	return ( $separator, $encoding, $table_name );
747							}
748
749							sub to_gb {
750
751	0			0	0	0	my $bytes = shift;
752
753							# base 2 => 1,073,741,824
754	0					0	my $gb = $bytes / 1_073_741_824;
755	0					0	return sprintf( '%8.4f', $gb ) . ' GB';
756							}
757
758							sub load_exposures {
759
760	3			3	0	30	my $data = read_csv( { in => shift, sep => "\t" } );
761
762							# We will only use the key 'concept_id' and discard the rest
763							#$VAR1 = {
764							# '4138352' => 1
765							# };
766	3					15	my %hash = map { $_->{concept_id} => 1 } @$data;
	639					1433
767
768							# Returning hashref
769	3					747	return \%hash;
770							}
771
772							1;