File Coverage

blib/lib/Graph/Undirected/Components/External.pm
Criterion Covered Total %
statement 277 340 81.4
branch 76 130 58.4
condition 15 39 38.4
subroutine 24 26 92.3
pod 4 12 33.3
total 396 547 72.3


line stmt bran cond sub pod time code
1             package Graph::Undirected::Components::External;
2 1     1   12004 use strict;
  1         3  
  1         44  
3 1     1   5 use warnings;
  1         2  
  1         131  
4 1     1   15 use File::Path qw(make_path remove_tree);
  1         2  
  1         110  
5 1     1   13591 use File::Temp qw(tempdir tempfile);
  1         25852  
  1         68  
6 1     1   1368 use Sort::External;
  1         6036  
  1         53  
7 1     1   4855 use Text::CSV;
  1         26301  
  1         8  
8 1     1   17697 use Log::Log4perl;
  1         112625  
  1         9  
9 1     1   66 use Graph::Undirected::Components;
  1         2  
  1         57  
10 1     1   6 use Time::HiRes;
  1         2  
  1         10  
11             #use Data::Dump qw(dump);
12              
13             BEGIN
14             {
15 1     1   154 use Exporter ();
  1         3  
  1         24  
16 1     1   6 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         2  
  1         143  
17 1     1   3 $VERSION = '0.31';
18 1         19 @ISA = qw(Exporter);
19 1         2 @EXPORT = qw();
20 1         2 @EXPORT_OK = qw();
21 1         4110 %EXPORT_TAGS = ();
22             }
23              
24             my $el = "\n";
25              
26             #01234567890123456789012345678901234567891234
27             #Computes components of an undirected graph.
28              
29             =head1 NAME
30              
31             C - Computes components of an undirected graph.
32              
33             =head1 SYNOPSIS
34              
35             use Data::Dump qw(dump);
36             use Log::Log4perl qw(:easy);
37             use Graph::Undirected::Components::External;
38             Log::Log4perl->easy_init ($WARN);
39             my $componenter = Graph::Undirected::Components::External->new(outputFile => 'vertexCompId.txt', purgeSizeBytes => 5000);
40             my $vertices = 10000;
41             for (my $i = 0; $i < $vertices; $i++)
42             {
43             $componenter->add_edge (int rand $vertices, int rand $vertices);
44             }
45             dump $componenter->finish ();
46              
47             =head1 DESCRIPTION
48              
49             C computes the components of an undirected
50             graph limited only by the amount of free disk space. All errors, warnings, and
51             informational messages are logged using L.
52              
53             =head1 CONSTRUCTOR
54              
55             =head2 C
56              
57             The method C creates an instance of C
58             with the following parameters.
59              
60             =over
61              
62             =item C
63              
64             workingDirectory => File::Temp::tempdir()
65              
66             C is an optional parameter specifying the path
67             to a directory that all temporary files are written to; the default
68             is set using L.
69              
70              
71             =item C
72              
73             purgeSizeBytes => 1000000
74              
75             C is an optional parameter specifying the aggregate byte size
76             that all the vertices added to the internal instance of
77             L must exceed before its content is purged
78             to disk. The optimal value depends on the total internal memory
79             available.
80              
81             =item C
82              
83             purgeSizeVertices => undef
84              
85             C is an optional parameter specifying the total
86             vertices added to the internal instance of
87             L that must be exceed before its content is purged
88             to disk. If C and C are both defined, then a purge
89             occurs when either threshold is exceeded.
90              
91             =item C
92              
93             retainPercentage => 0.10
94              
95             C is an optional parameter specifying the percentage of
96             the most recently used vertices to be retained in the internal instance of
97             L when it is purged. If the edges of the
98             graph are not added in a random order, caching some of the vertices can
99             speedup the computation of the components.
100              
101             =item C
102              
103             outputFile => ...
104              
105             C is the path to the file that the C<(vertex,component-id)> pairs
106             are written to separated by the C; the directory of the file should exist. An exception is
107             thrown if C is undefined or the file cannot be written to.
108              
109             =item C
110              
111             delimiter => ','
112              
113             C is the delimiter used to separate the vertices of an edge when
114             they are written to temporary files. All vertices should be encoded so that
115             they do not contain the delimiter, that is, it should be true that
116             C for all vertices.
117              
118             =back
119              
120             =cut
121              
122             sub new
123             {
124              
125             # get the object type and parameters.
126 7     7 1 889578 my ($Class, %Parameters) = @_;
127 7   33     69 my $Self = bless({}, ref($Class) || $Class);
128              
129             # set the flag when finished is called so no more edges are added.
130 7         32 $Self->{finishedCalled} = 0;
131              
132             # set the start time.
133 7         36 $Self->{startTime} = $Self->getCpuTime();
134              
135             # flag if temporary files and directories should be deleted;
136             # used for debugging.
137 7         22 $Self->{cleanup} = 1;
138              
139             # set the default delimiter for the input and output files.
140 7         14 $Self->{delimiter} = ',';
141 7 100       33 $Self->{delimiter} = $Parameters{delimiter} if exists $Parameters{delimiter};
142              
143             # set the recursion level
144 7         18 $Self->{level} = 0;
145 7 100 66     53 $Self->{level} = $Parameters{level} if (exists($Parameters{level}) && defined($Parameters{level}));
146              
147             # set the workingDirectory and create it if needed.
148 7 100       28 if (exists($Parameters{baseDirectory}))
149             {
150 6         14 $Self->{baseDirectory} = $Parameters{baseDirectory};
151             }
152             else
153             {
154 1         2 my $workingDirectory;
155 1 50 33     6 if (exists($Parameters{workingDirectory}) && defined($Parameters{workingDirectory}))
156             {
157              
158             # if the directory does not exist created it.
159 0 0       0 unless (-d $Parameters{workingDirectory})
160             {
161 0         0 make_path($Parameters{workingDirectory}, { verbose => 0, mode => 0700 });
162 0         0 $Self->{unlinkWorkingDirectory} = 1;
163             }
164              
165 0         0 $workingDirectory = $Parameters{workingDirectory};
166             }
167             else
168             {
169              
170             # none given as a parameters, so create a temporary one.
171 1 50       9 $workingDirectory = tempdir(CLEANUP => $Self->{cleanup}) unless defined $workingDirectory;
172             }
173              
174             # if the directory does not exist log an error and die.
175 1 50       874 unless (-e $workingDirectory)
176             {
177 0         0 my $logger = Log::Log4perl->get_logger();
178 0         0 $logger->logdie("error: could not create directory '$workingDirectory'.\n");
179             }
180              
181             # if workingDirectory is not a directory log an error and die.
182 1 50       23 unless (-d $workingDirectory)
183             {
184 0         0 my $logger = Log::Log4perl->get_logger();
185 0         0 $logger->logdie("error: '$workingDirectory' is not a directory.\n");
186             }
187              
188             # now create the base directory in the working directory.
189 1         5 my $baseDirectory = tempdir(DIR => $workingDirectory, CLEANUP => $Self->{cleanup});
190              
191             # if $baseDirectory is not a directory log an error and die.
192 1 50       359 unless (-d $baseDirectory)
193             {
194 0         0 my $logger = Log::Log4perl->get_logger();
195 0         0 $logger->logdie("error: $baseDirectory is not a directory.\n");
196             }
197 1         4 $Self->{baseDirectory} = $baseDirectory;
198             }
199              
200             # create the object to compute the components using internal memory.
201 7         56 $Self->{componenter} = Graph::Undirected::Components->new();
202              
203             # set the purge size of the componenter.
204 7         17 my $purgeSizeBytes = 1000000;
205 7 50 33     80 $purgeSizeBytes = int abs $Parameters{purgeSizeBytes} if exists($Parameters{purgeSizeBytes}) && defined($Parameters{purgeSizeBytes});
206 7         19 $Self->{purgeSizeBytes} = $purgeSizeBytes;
207 7 50 66     49 $Self->{purgeSizeVertices} = int abs $Parameters{purgeSizeVertices} if exists($Parameters{purgeSizeVertices}) && defined($Parameters{purgeSizeVertices});
208 7         16 $Self->{totalEdgesAddedSinceLastPurge} = 0;
209 7         13 $Self->{totalEdges} = 0;
210 7         15 $Self->{totalVertices} = 0;
211              
212             # set the percentage of vertices to retain in the componenter when purging.
213 7         12 my $retainPercentage = 0.10;
214 7 100 66     45 $retainPercentage = abs $Parameters{retainPercentage}
215             if (exists($Parameters{retainPercentage}) && defined($Parameters{retainPercentage}));
216 7 50       24 $retainPercentage = 1 if $retainPercentage > 1;
217 7         136 $Self->{retainPercentage} = $retainPercentage;
218              
219             # set the file that the "vertex,compId" pairs will be written to.
220 7 50 33     46 unless (exists($Parameters{outputFile}) && defined($Parameters{outputFile}))
221             {
222 0         0 my $logger = Log::Log4perl->get_logger();
223 0         0 $logger->logdie("error: parameter outputFile was not defined.\n");
224             }
225 7         17 $Self->{outputFile} = $Parameters{outputFile};
226              
227             # make sure we can write to the output file before devoting time to
228             # computing the connected components.
229             {
230 7         7 my $outputFileHandle;
  7         10  
231 7 50       447 unless (open($outputFileHandle, '>', $Self->{outputFile}))
232             {
233 0         0 my $logger = Log::Log4perl->get_logger();
234 0         0 $logger->logdie("error: could not open file '$Self->{outputFile}' for writing.\n");
235             }
236 7         107 close $outputFileHandle;
237             }
238 7         430 unlink $Self->{outputFile};
239              
240 7         27 return $Self;
241             }
242              
243             =head1 METHODS
244              
245             =head2 C
246              
247             The method C updates the components of the graph using the edge
248             C<(vertexA, vertexB)>.
249              
250             =over
251              
252             =item vertexA, vertexB
253              
254             The vertices of the edge C<(vertexA, vertexB)> are Perl strings. If only C
255             is defined, then the edge C<(vertexA, vertexA)> is added to the graph. The method always returns
256             undef.
257              
258             =back
259              
260             =cut
261              
262             sub add_edge
263             {
264 2012     2012 1 6035 my ($Self, @Edge) = @_;
265              
266             # if nothing to add return now.
267 2012 50       4254 return undef unless @Edge;
268              
269 2012 50       4227 if ($Self->{finishedCalled})
270             {
271 0         0 my $logger = Log::Log4perl->get_logger();
272 0         0 $logger->logdie("error: cannot add more edges after call to finish().\n");
273             }
274              
275 2012         6363 $Self->{componenter}->add_edge(@Edge);
276 2012         3177 ++$Self->{totalEdgesAddedSinceLastPurge};
277 2012         2370 ++$Self->{totalEdges};
278              
279             # if componenter is too large, purge it to disk.
280 2012 100 33     5789 if
      66        
281             (
282             ($Self->{componenter}->getSizeBytes() > $Self->{purgeSizeBytes}) ||
283             ((exists $Self->{purgeSizeVertices}) && ($Self->{componenter}->getSizeVertices() > $Self->{purgeSizeVertices}))
284             )
285             {
286             # add the root node pairs to the external component finder.
287 402         985 $Self->purge($Self->{retainPercentage});
288             }
289              
290 2012         10588 return undef;
291             }
292              
293             =head2 C
294              
295             The method C adds all the edges in a file to the graph.
296              
297             =over
298              
299             =item fileOfEdges => ...
300              
301             C specifies the path to the file containing the edges to
302             add. An exception is thrown if there are problems openning or reading the file.
303              
304             =item delimiter
305              
306             The edges are read from C using L; C
307             must be to the delimiter used to separate the vertices of an edge in the file. The default
308             is the value set with the L constructor.
309              
310             =back
311              
312             =cut
313              
314             sub add_file
315             {
316 0     0 1 0 my ($Self, %Parameters) = @_;
317              
318             # if no fileOfEdges, return now.
319 0 0 0     0 if (!exists ($Parameters{fileOfEdges}) || !defined ($Parameters{fileOfEdges}))
320             {
321 0         0 my $logger = Log::Log4perl->get_logger();
322 0         0 $logger->logdie("error: parameter 'fileOfEdges' was not defined.\n");
323             }
324 0         0 my $fileOfEdges = $Parameters{fileOfEdges};
325              
326             # set the default delimiter.
327 0         0 my $delimiter = $Self->{delimiter};
328 0 0       0 $delimiter = $Parameters{delimiter} if exists $Parameters{delimiter};
329              
330             # make sure the file exists.
331 0         0 my $fileOfEdgesHandle;
332 0 0       0 unless (open($fileOfEdgesHandle, '<:encoding(utf8)', $fileOfEdges))
333             {
334 0         0 my $logger = Log::Log4perl->get_logger();
335 0         0 $logger->logdie("error: could not open file '$fileOfEdges' for reading: $!\n");
336             }
337              
338             # create the CSV parser.
339 0         0 my $csvParser = Text::CSV->new({ binary => 1, sep_char => $delimiter });
340 0 0       0 unless ($csvParser)
341             {
342 0         0 my $logger = Log::Log4perl->get_logger();
343 0         0 $logger->logdie("error: could not open CSV parser; " . Text::CSV->error_diag() . "\n");
344             }
345              
346             # add each edge in the file to the graph.
347 0         0 while (my $edge = $csvParser->getline($fileOfEdgesHandle))
348             {
349              
350             # if no edge, skip it.
351 0 0       0 next unless defined $edge;
352              
353             # if the first column is empty, skip it.
354 0 0 0     0 next unless exists($edge->[0]) && defined($edge->[0]);
355              
356             # if the second column is emtpy, set it to the first.
357 0 0 0     0 $edge->[1] = $edge->[0] if (!exists($edge->[1]) || !defined($edge->[1]));
358              
359             # add the edge.
360 0         0 $Self->add_edge($edge->[0], $edge->[1]);
361             }
362 0         0 close $fileOfEdgesHandle;
363              
364 0         0 return undef;
365             }
366              
367             sub purge
368             {
369 408     408 0 637 my ($Self, $RetainPercentage) = @_;
370              
371             # set the default for the percentage of vertices retained in vertexCompIdSorter.
372 408 50       785 $RetainPercentage = 0 unless defined $RetainPercentage;
373              
374             # create the vertexCompIdSorter if it does not exist.
375 408 100       1042 unless (exists $Self->{vertexCompIdSorter})
376             {
377 6         53 $Self->{vertexCompIdSorter} = Sort::External->new(mem_threshold => 64 * 1024 * 1024, working_dir => $Self->{baseDirectory});
378             }
379              
380             # get the list of vertexCompIds.
381 408 100       4207 $RetainPercentage = 0 if $Self->{totalEdgesAddedSinceLastPurge} < 2;
382 408         1161 my $listOfVertexCompIds = $Self->{componenter}->get_vertexCompIdPairs($RetainPercentage);
383 408         636 my $totalVertexCompIds = scalar(@$listOfVertexCompIds);
384              
385             # add each vertexCompId to the external sorter.
386 408         944 for (my $i = 0 ; $i < $totalVertexCompIds ; $i++)
387             {
388 3281         7666 my $vertexCompIdString = $listOfVertexCompIds->[$i][0] . $Self->{delimiter} . $listOfVertexCompIds->[$i][1];
389 3281         13669 $Self->{vertexCompIdSorter}->feed($vertexCompIdString);
390             }
391 408         505 $listOfVertexCompIds = undef;
392              
393             # keep track of the number of edges added between purges.
394 408         2611 $Self->{totalEdgesAddedSinceLastPurge} = 0;
395              
396             # log the purge as an info message.
397             {
398 408         623 my $logger = Log::Log4perl->get_logger();
  408         1473  
399 408         19054 $logger->info("purged $totalVertexCompIds vertex,component-id pairs.\n");
400             }
401              
402 408         3267 return undef;
403             }
404              
405             =head2 C
406              
407             The method C completes the computation of the connected components
408             and writes the pairs C<(vertex,component-id)> to the L. For
409             each component C is the lexographical minimum of all the
410             vertices in the component.
411              
412             No edges can be added to the graph after C is called.
413              
414             =cut
415              
416             sub finish
417             {
418 7     7 1 2422 my ($Self) = @_;
419              
420             # once finish is called no more edges can be added.
421 7         21 $Self->{finishedCalled} = 1;
422              
423 7 100       25 if (exists $Self->{vertexCompIdSorter})
424             {
425             # purge the last of the internal vertexCompIds and do not retain any vertices.
426 6         19 $Self->purge(0);
427              
428             # finish sorting the vertexCompId pairs.
429 6         35 $Self->{vertexCompIdSorter}->finish();
430              
431             # get the sorter for the oldCompId-to-oldCompId file of pairs.
432 6         3471 my $oldCompIdToOldSorter = Sort::External->new(mem_threshold => 64 * 1024 * 1024, working_dir => $Self->{baseDirectory});
433              
434             # get the temporay file for the oldCompId-to-vertex file of pairs.
435 6         3486 my ($oldCompIdToVertexFileFh, $oldCompIdToVertexFile) =
436             tempfile("OV_XXXXXXXXXX", DIR => $Self->{baseDirectory}, UNLINK => $Self->{cleanup});
437 6         2627 close $oldCompIdToVertexFileFh;
438              
439             # compute the subgraph based on the oldCompId-to-oldCompId pairs and
440             # write the oldCompId-to-vertex pairs.
441 6         33 $Self->writeSubgraphInfoToFiles($oldCompIdToOldSorter, $oldCompIdToVertexFile);
442            
443             # finish the sort.
444 6         2542 $oldCompIdToOldSorter->finish();
445              
446             # get the temporay file for the oldCompId-to-newCompId file of pairs.
447 6         1814 my ($oldCompIdToNewCompIdFileFh, $oldCompIdToNewCompIdFile) =
448             tempfile("ON_XXXXXXXXXX", DIR => $Self->{baseDirectory}, UNLINK => $Self->{cleanup});
449 6         2907 close $oldCompIdToNewCompIdFileFh;
450              
451             # compute the components of the subgraph from the oldCompId-to-oldCompId pairs.
452             {
453 6         14 my $externalComponenter =
  6         103  
454             Graph::Undirected::Components::External->new(
455             baseDirectory => $Self->{baseDirectory},
456             outputFile => $oldCompIdToNewCompIdFile,
457             level => 1 + $Self->{level},
458             delimiter => $Self->{delimiter},
459             purgeSizeBytes => $Self->{purgeSizeBytes},
460             purgeSizeVertices => $Self->{purgeSizeVertices},
461             retainPercentage => $Self->{retainPercentage}
462             );
463            
464             # add the edges in sorted order.
465 6         20 my $previousEdgeStr = '';
466 6         46 while (defined (my $edgeStr = $oldCompIdToOldSorter->fetch()))
467             {
468             # skip the edge if a duplicate.
469 1978 100       14361 next if $previousEdgeStr eq $edgeStr;
470 1524         1723 $previousEdgeStr = $edgeStr;
471            
472             # split the edge.
473 1524         5562 my @edge = split (/$Self->{delimiter}/, $edgeStr);
474            
475             # add the edge to the graph.
476 1524         3381 $externalComponenter->add_edge (@edge);
477             }
478            
479             # purge the edge sorted.
480 6         123 $oldCompIdToOldSorter = undef;
481            
482             # finish computing the components of the graph.
483 6         60 my $processingStats = $externalComponenter->finish();
484 6 50       6113 $Self->{processingStats} = [] unless exists $Self->{processingStats};
485 6         17 push @{ $Self->{processingStats} }, @$processingStats;
  6         49  
486             }
487              
488             # map the components of the subgraph to the original nodes.
489 6         41 $Self->mapComponentsOfSubgraphToNodes($oldCompIdToNewCompIdFile, $oldCompIdToVertexFile, $Self->{outputFile}, $Self->{baseDirectory});
490              
491 6 50       14955 unlink $oldCompIdToNewCompIdFile if $Self->{cleanup};
492 6 50       1266 unlink $oldCompIdToVertexFile if $Self->{cleanup};
493             }
494             else
495             {
496              
497             # the edges fit in memory, so just compute the components and write the results to the file.
498 1         6 $Self->outputVertexCompId();
499             }
500              
501             # store the processing stats.
502 7 100       53 $Self->{processingStats} = [] unless exists $Self->{processingStats};
503 7         49 my $totalTime = $Self->getCpuTime($Self->{startTime});
504 7         45 push @{ $Self->{processingStats} }, { level => $Self->{level}, time => $totalTime, edges => $Self->{totalEdges} };
  7         70  
505              
506             # log the stats as an info message.
507             {
508 7         16 my $logger = Log::Log4perl->get_logger();
  7         136  
509 7         685 $logger->info("processed $Self->{totalEdges} edges in $totalTime seconds; recusion level is $Self->{level}.\n");
510             }
511              
512 7         363 return $Self->{processingStats};
513             }
514              
515             sub mapComponentsOfSubgraphToNodes
516             {
517 6     6 0 19 my ($Self, $OldCompIdToNewCompIdFile, $OldCompIdToVertexFile, $VertexToNewCompIdFile, $WorkingDirectory) = @_;
518              
519             # get the delimiter to use for the records.
520 6         16 my $delimiter = $Self->{delimiter};
521              
522             # create the sorter to merge the OldCompId-NewCompId and OldCompId-Vertex edges.
523 6         131 my $mergeSorter = Sort::External->new(mem_threshold => 64 * 1024 * 1024, working_dir => $WorkingDirectory);
524              
525             {
526              
527             # open the file $OldCompIdToNewCompIdFile to read each of the compComp edges.
528 6         5889 my $oldCompIdToNewCompIdFileHandle;
  6         13  
529 6 50       690 unless (open($oldCompIdToNewCompIdFileHandle, '<', $OldCompIdToNewCompIdFile))
530             {
531 0         0 my $logger = Log::Log4perl->get_logger();
532 0         0 $logger->logdie("could not open file '$OldCompIdToNewCompIdFile' for reading: $!\n");
533             }
534              
535             # set the delimiter for the oldCompId-NewCompId so the pairs are first when sorted.
536 6         21 my $oldCompIdToNewCompIdDelimiter = $delimiter . 'cc' . $delimiter;
537              
538             # cache the previous string to test for skipping of duplicates.
539 6         13 my $previousOldCompIdToNewCompIdString = '';
540 6         356 while (defined(my $oldCompIdToNewCompIdString = <$oldCompIdToNewCompIdFileHandle>))
541             {
542              
543             # remove the line feed from the string.
544 676         1510 chop $oldCompIdToNewCompIdString;
545              
546             # convert the string to its pair of records.
547 676         2615 my @oldCompIdToNewCompIdRecord = split(/$delimiter/, $oldCompIdToNewCompIdString);
548              
549             # make sure the strings parses into only two items.
550 676 50       1519 if (@oldCompIdToNewCompIdRecord != 2)
551             {
552 0         0 my $logger = Log::Log4perl->get_logger();
553 0         0 $logger->logdie("error: oldCompId to newCompId string record does not have two values.\n");
554             }
555              
556             # feed the oldCompId-NewCompId pairs to the sorter.
557 676 100 66     5511 $mergeSorter->feed($oldCompIdToNewCompIdRecord[0] . $oldCompIdToNewCompIdDelimiter . $oldCompIdToNewCompIdRecord[1])
558             if ( ($previousOldCompIdToNewCompIdString ne $oldCompIdToNewCompIdString)
559             && ($oldCompIdToNewCompIdRecord[0] ne $oldCompIdToNewCompIdRecord[1]));
560              
561             # store the string.
562 676         3119 $previousOldCompIdToNewCompIdString = $oldCompIdToNewCompIdString;
563             }
564 6         99 close $oldCompIdToNewCompIdFileHandle;
565             }
566              
567             {
568              
569             # open the file $OldCompIdToVertexFile to read each of the pairs.
570 6         13 my $oldCompIdToVertexFileHandle;
  6         11  
571 6 50       804 unless (open($oldCompIdToVertexFileHandle, '<', $OldCompIdToVertexFile))
572             {
573 0         0 my $logger = Log::Log4perl->get_logger();
574 0         0 $logger->logdie("could not open file '$OldCompIdToVertexFile' for reading: $!\n");
575             }
576              
577             # set the delimiter for the oldCompId-Vertex so the pairs are second when sorted.
578 6         275 my $oldCompIdToVertexDelimiter = $delimiter . 'cn' . $delimiter;
579              
580             # cache the previous string to test for skipping of duplicates.
581 6         9 my $previousOldCompIdToVertexString = '';
582 6         189 while (defined(my $oldCompIdToVertexString = <$oldCompIdToVertexFileHandle>))
583             {
584 1159         1472 chop $oldCompIdToVertexString;
585 1159         3198 my @oldCompIdToVertexRecord = split(/$delimiter/, $oldCompIdToVertexString);
586 1159 50       2427 if (@oldCompIdToVertexRecord != 2)
587             {
588 0         0 my $logger = Log::Log4perl->get_logger();
589 0         0 $logger->logdie("error: oldCompId to vertex string record does not have two values.\n");
590             }
591 1159 50       7420 $mergeSorter->feed($oldCompIdToVertexRecord[0] . $oldCompIdToVertexDelimiter . $oldCompIdToVertexRecord[1])
592             if $previousOldCompIdToVertexString ne $oldCompIdToVertexString;
593 1159         4406 $previousOldCompIdToVertexString = $oldCompIdToVertexString;
594             }
595 6         87 close $oldCompIdToVertexFileHandle;
596             }
597              
598             # sort the edges.
599 6         37 $mergeSorter->finish;
600              
601             {
602              
603             # open the file to write the Vertex to NewCompId pairs.
604 6         1445 my $vertexToNewCompIdFileHandle;
  6         10  
605 6 50       1185 unless (open($vertexToNewCompIdFileHandle, '>', $VertexToNewCompIdFile))
606             {
607 0         0 my $logger = Log::Log4perl->get_logger();
608 0         0 $logger->logdie("error: could not open file '$VertexToNewCompIdFile' for writing.\n");
609             }
610              
611             # get first record pair from the sorter.
612 6         14 my $previousOldToNewOrOldVertexString = '';
613 6         29 my $oldToNewOrOldVertexString = $mergeSorter->fetch;
614              
615             # split the string into a record.
616 6         13 my $oldToNewOrOldVertexRecord;
617             my @listOfOldToNewOrOldVertexRecords;
618 6 50       21 if (defined $oldToNewOrOldVertexString)
619             {
620 6         45 $oldToNewOrOldVertexRecord = [ split(/$delimiter/, $oldToNewOrOldVertexString) ];
621 6         20 @listOfOldToNewOrOldVertexRecords = ($oldToNewOrOldVertexRecord);
622             }
623              
624 6         35 while (defined($oldToNewOrOldVertexString = $mergeSorter->fetch))
625             {
626              
627             # convert the string to a record.
628 1823         7503 my $oldToNewOrOldVertexRecord = [ split(/$delimiter/, $oldToNewOrOldVertexString) ];
629              
630             # when the compId changes, process the records in the list.
631 1823 100       5586 if ($listOfOldToNewOrOldVertexRecords[-1]->[0] ne $oldToNewOrOldVertexRecord->[0])
632             {
633              
634             # remap the oldCompId of each vertex to the newCompId and write it to the file.
635 670         1583 $Self->mapComponentsOfSubgraphToVerticesInList(\@listOfOldToNewOrOldVertexRecords, $vertexToNewCompIdFileHandle);
636              
637             # empty the cache of records.
638 670         2106 @listOfOldToNewOrOldVertexRecords = ();
639             }
640              
641             # cache the record if unique.
642 1823 50       4406 if ($oldToNewOrOldVertexString ne $previousOldToNewOrOldVertexString)
643             {
644 1823         2146 push @listOfOldToNewOrOldVertexRecords, $oldToNewOrOldVertexRecord;
645 1823         13406 $previousOldToNewOrOldVertexString = $oldToNewOrOldVertexString;
646             }
647              
648             }
649              
650             # remap the oldCompId of each vertex to the newCompId and write it to the file.
651 6         146 $Self->mapComponentsOfSubgraphToVerticesInList(\@listOfOldToNewOrOldVertexRecords, $vertexToNewCompIdFileHandle);
652 6         125 @listOfOldToNewOrOldVertexRecords = ();
653              
654             # close the file.
655 6         1543 close $vertexToNewCompIdFileHandle;
656             }
657              
658 6         65 return undef;
659             }
660              
661             sub mapComponentsOfSubgraphToVerticesInList # (\@listOfOldToNewOrOldVertexRecords, $vertexToNewCompIdFileHandle);
662             {
663 676     676 0 1273 my ($Self, $ListOfOldToNewOrOldVertexRecords, $VertexToNewCompIdFileHandle) = @_;
664              
665             # get the string record delimiter.
666 676         1080 my $delimiter = $Self->{delimiter};
667              
668             # separate the Cc and Cn records.
669 676         730 my $totalRecords = @$ListOfOldToNewOrOldVertexRecords;
670 676         646 my $indexOfFirstCnRecord = 0;
671 676         1660 for ($indexOfFirstCnRecord = 0 ; $indexOfFirstCnRecord < @$ListOfOldToNewOrOldVertexRecords ; $indexOfFirstCnRecord++)
672             {
673 1111 100       4448 last if ($ListOfOldToNewOrOldVertexRecords->[$indexOfFirstCnRecord][1] eq 'cn');
674             }
675 676         928 my $totalCcRecords = $indexOfFirstCnRecord;
676 676         731 my $totalCnRecords = $totalRecords - $totalCcRecords;
677              
678             # if there are no oldCompIdToVertex records in the list, there is nothing to do.
679 676 100       1546 if ($totalCnRecords == 0)
680             {
681              
682             #my $logger = Log::Log4perl->get_logger();
683             #$logger->info ("info: no oldCompId to vertex records in list.\n");
684 235         425 return undef;
685             }
686              
687             # if there is more than one oldCompIdToNewCompId record in the list, log the info.
688 441 50       1168 if ($totalCcRecords > 1)
689             {
690 0         0 my $logger = Log::Log4perl->get_logger();
691 0         0 $logger->info("info: there were $totalCcRecords oldCompId-newCompId records in the list.\n");
692             }
693              
694             # if $totalCcRecords is zero, there are no oldCompIdToNewCompId mapping records,
695             # so then just add the oldCompIdToVertex records to the file.
696 441 100       670 if ($totalCcRecords == 0)
697             {
698              
699             # write each node,comp record to $VertexToNewCompIdFileHandle.
700 6         14 my $previousRecord = '';
701 6         21 for (my $i = $indexOfFirstCnRecord ; $i < $totalRecords ; $i++)
702             {
703              
704             # convert the record to a string.
705 36         70 my $recordString = $ListOfOldToNewOrOldVertexRecords->[$i][2] . $delimiter . $ListOfOldToNewOrOldVertexRecords->[$i][0];
706              
707             # skip duplicate records.
708 36 50       69 next if $previousRecord eq $recordString;
709              
710             # print the record.
711 36         212 print $VertexToNewCompIdFileHandle $recordString . $el;
712              
713             # store a copy of the record to remove duplicates.
714 36         85 $previousRecord = $recordString;
715             }
716             }
717             else
718             {
719              
720             # get the newCompId.
721 435         592 my $newCompId = $ListOfOldToNewOrOldVertexRecords->[0][2];
722 435         505 my $previousRecord = '';
723              
724 435         1126 for (my $i = $indexOfFirstCnRecord ; $i < $totalRecords ; $i++)
725             {
726              
727             # convert the record to a string.
728 1123         2673 my $recordString = $ListOfOldToNewOrOldVertexRecords->[$i][2] . $delimiter . $newCompId;
729              
730             # skip duplicate records.
731 1123 50       2222 next if $previousRecord eq $recordString;
732              
733             # print the record.
734 1123         2116 print $VertexToNewCompIdFileHandle $recordString . $el;
735              
736             # store a copy of the record to remove duplicates.
737 1123         2982 $previousRecord = $recordString;
738             }
739             }
740              
741 441         1026 return undef;
742             }
743              
744             sub writeSubgraphInfoToFiles
745             {
746 6     6 0 14 my ($Self, $OldCompIdToOldSorter, $CompIdVertexFile) = @_;
747              
748             # open the oldCompId to vertex file for writing.
749 6         9 my $oldCompIdToVertexFileHandle;
750 6 50       376 unless (open($oldCompIdToVertexFileHandle, '>', $CompIdVertexFile))
751             {
752 0         0 my $logger = Log::Log4perl->get_logger();
753 0         0 $logger->logdie("could not open file '$CompIdVertexFile' for writing.\n");
754             }
755              
756             # get the vertex component-id sorter.
757 6         18 my $vertexCompIdSorter = $Self->{vertexCompIdSorter};
758              
759             # counts the number of edges in the subgraph generated ($OldCompIdToOldCompIdFile)
760 6         16 my $totalSubgraphEdges = 0;
761              
762             # used to skip duplicated vertexCompId edges.
763 6         22 my $previousVertexCompIdString = '';
764              
765             # get the first vertexCompId as a string.
766 6         30 my $vertexCompIdString = $vertexCompIdSorter->fetch;
767 6 50       72 my $vertexCompIdPair = [ split($Self->{delimiter}, $vertexCompIdString) ] if defined $vertexCompIdString;
768 6         15 my @listOfVertexCompIdPairs = ($vertexCompIdPair);
769 6         34 while (defined($vertexCompIdString = $vertexCompIdSorter->fetch))
770             {
771              
772             # extract the vertex and component id from the string.
773 3275         10417 my $vertexCompIdPair = [ split($Self->{delimiter}, $vertexCompIdString) ];
774              
775             # when the vertex changes the pairs in @listOfVertexCompIdPairs are used
776             # to create part of the subgraph.
777 3275 100       8563 if ($listOfVertexCompIdPairs[-1]->[0] ne $vertexCompIdPair->[0])
778             {
779 1153         2662 $Self->writeSubgraphInfoToFilesFromList(\@listOfVertexCompIdPairs, $OldCompIdToOldSorter,
780             $oldCompIdToVertexFileHandle, \$totalSubgraphEdges);
781              
782             # clear the list of pairs.
783 1153         2629 @listOfVertexCompIdPairs = ();
784             }
785              
786             # only store unique pairs.
787 3275 100       11929 if ($previousVertexCompIdString ne $vertexCompIdString)
788             {
789 2145         2553 push @listOfVertexCompIdPairs, $vertexCompIdPair;
790 2145         8410 $previousVertexCompIdString = $vertexCompIdString;
791             }
792             }
793              
794             # process any remaining pairs in @listOfVertexCompIdPairs.
795 6         164 $Self->writeSubgraphInfoToFilesFromList(\@listOfVertexCompIdPairs, $OldCompIdToOldSorter,
796             $oldCompIdToVertexFileHandle, \$totalSubgraphEdges);
797 6         18 @listOfVertexCompIdPairs = ();
798              
799             # done with the sorter.
800 6         24 delete $Self->{vertexCompIdSorter};
801              
802 6         70 return undef;
803             }
804              
805             sub writeSubgraphInfoToFilesFromList
806             {
807 1159     1159 0 1793 my ($Self, $ListOfVertexCompIdPairs, $OldCompIdToOldSorter, $OldCompIdToVertexFileHandle, $TotalSubgraphEdges) = @_;
808              
809             # the first record has the minimum component id.
810 1159         1650 my $minCompId = $ListOfVertexCompIdPairs->[0]->[1];
811              
812             # compute the edges of the subgraph.
813 1159         1201 my @listOfSubgraphEdgesA;
814             my @listOfSubgraphEdgesB;
815 1159         2806 for (my $i = 1 ; $i < @$ListOfVertexCompIdPairs ; $i++)
816             {
817              
818             # get the component-id for the pair.
819 992         1395 my $compId = $ListOfVertexCompIdPairs->[$i]->[1];
820              
821             # add the new edge.
822 992         2148 push @listOfSubgraphEdgesA, join($Self->{delimiter}, $minCompId, $compId);
823 992         1431 ++$$TotalSubgraphEdges;
824              
825             # we need to add the symmetric edge also to ensure log convergence.
826 992 100       1878 if ($minCompId ne $compId)
827             {
828 986         1853 push @listOfSubgraphEdgesB, join($Self->{delimiter}, $compId, $minCompId);
829 986         2989 ++$$TotalSubgraphEdges;
830             }
831             }
832              
833             # write the edges to the file.
834 1159         1821 push @listOfSubgraphEdgesA, sort @listOfSubgraphEdgesB;
835 1159         1541 @listOfSubgraphEdgesB = ();
836 1159         5562 $OldCompIdToOldSorter->feed (@listOfSubgraphEdgesA);
837 1159         1909 @listOfSubgraphEdgesA = ();
838              
839             # write the compId,vertex to the file.
840 1159         2620 my $record = join($Self->{delimiter}, $minCompId, $ListOfVertexCompIdPairs->[0]->[0]) . $el;
841 1159         8227 print $OldCompIdToVertexFileHandle $record;
842              
843 1159         2287 return undef;
844             }
845              
846             sub outputVertexCompId
847             {
848 1     1 0 3 my ($Self, %Parameters) = @_;
849              
850             # set the default delimiter.
851 1         3 my $delimiter = $Self->{delimiter};
852 1 50       4 $delimiter = $Parameters{delimiter} if exists $Parameters{delimiter};
853              
854             # get the list of vertices and component ids.
855 1         5 my $listOfVertexCompIds = $Self->{componenter}->get_vertexCompIdPairs(0);
856              
857             # sort the list of vertices and component ids.
858 1 50       4 $listOfVertexCompIds = [ sort { ($a->[0] cmp $b->[0]) || $a->[1] cmp $b->[1] } @$listOfVertexCompIds ];
  9         20  
859              
860             # open the component file for writing.
861 1         2 my $outputFh;
862 1 50   1   11 unless (open($outputFh, '>:encoding(utf8)', $Self->{outputFile}))
  1         1  
  1         9  
  1         55  
863             {
864 0         0 my $logger = Log::Log4perl->get_logger();
865 0         0 $logger->logdie("could not open file '$Self->{outputFile}' for writing.\n");
866             }
867              
868             # write the vertex compId to the file.
869 1         72243 foreach my $vertexCompId (@$listOfVertexCompIds)
870             {
871 6         38 print $outputFh $vertexCompId->[0] . $delimiter . $vertexCompId->[1] . $el;
872             }
873              
874             # close the output file of edges.
875 1         749 close $outputFh;
876              
877 1         19 return undef;
878             }
879              
880             sub printSorter
881             {
882 0     0 0 0 my ($Self, $Sorter) = @_;
883              
884 0         0 my $previousRecord = '';
885 0         0 while (defined(my $recordString = $Sorter->fetch))
886             {
887 0 0       0 next if $previousRecord eq $recordString;
888 0         0 $previousRecord = $recordString;
889 0         0 print $recordString . $el;
890             }
891              
892 0         0 return undef;
893             }
894              
895             sub getCpuTime # ($startTime)
896             {
897 14     14 0 27 my $startTime = 0;
898 14 100       57 $startTime = $_[1] if exists $_[1];
899 14         221 return Time::HiRes::clock() - $startTime;
900             }
901              
902             sub DESTROY
903             {
904 7     7   3312 my ($Self) = @_;
905 7 100       395 return undef if $Self->{level} > 0;
906 1 50       4 return undef unless $Self->{cleanup};
907 1 50       5 return undef unless exists $Self->{baseDirectory};
908 1 50       30 return undef unless -e $Self->{baseDirectory};
909 1         525 remove_tree($Self->{baseDirectory});
910              
911 1 50       285 return undef unless $Self->{unlinkWorkingDirectory};
912 0           remove_tree($Self->{workingDirectory});
913 0           return undef;
914             }
915              
916             =head1 INSTALLATION
917              
918             Use L to install the module and all its prerequisites:
919              
920             perl -MCPAN -e shell
921             cpan[1]> install Graph::Undirected::Components
922              
923             =head1 BUGS
924              
925             Please email bugs reports or feature requests to C, or through
926             the web interface at L. The author
927             will be notified and you can be automatically notified of progress on the bug fix or feature request.
928              
929             =head1 AUTHOR
930              
931             Jeff Kubina
932              
933             =head1 COPYRIGHT
934              
935             Copyright (c) 2009 Jeff Kubina. All rights reserved.
936             This program is free software; you can redistribute
937             it and/or modify it under the same terms as Perl itself.
938              
939             The full text of the license can be found in the
940             LICENSE file included with this module.
941              
942             =head1 KEYWORDS
943              
944             connected components, network, undirected graph
945              
946             =head1 SEE ALSO
947              
948             L, L, L, L
949              
950             =begin html
951              
952             connected component,
953             graph,
954             network,
955              
956             =end html
957              
958             =cut
959              
960             1;
961              
962             # The preceding line will help the module return a true value