File Coverage

blib/lib/Microarray/CdtDataset.pm
Criterion Covered Total %
statement 4 6 66.6
branch n/a
condition n/a
subroutine 2 2 100.0
pod n/a
total 6 8 75.0


line stmt bran cond sub pod time code
1             package Microarray::CdtDataset;
2              
3             # License information (the MIT license)
4              
5             # Copyright (c) 2003 Christian Rees, Janos Demeter, John Matese, Gavin
6             # Sherlock; Stanford University
7              
8             # Permission is hereby granted, free of charge, to any person
9             # obtaining a copy of this software and associated documentation files
10             # (the "Software"), to deal in the Software without restriction,
11             # including without limitation the rights to use, copy, modify, merge,
12             # publish, distribute, sublicense, and/or sell copies of the Software,
13             # and to permit persons to whom the Software is furnished to do so,
14             # subject to the following conditions:
15              
16             # The above copyright notice and this permission notice shall be
17             # included in all copies or substantial portions of the Software.
18              
19             # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20             # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21             # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22             # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
23             # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
24             # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25             # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26             # SOFTWARE.
27              
28              
29             # Author : Gavin Sherlock (based on Christian Rees' dataset object)
30             # Date : 14th August 2002
31              
32             # Re-visited by John Matese, under auspices of the GMOD project
33             # Date : 28th May 2003
34              
35             # Description: This package implements an object that serves as an
36             # abstraction to a cdtDataset. It is different than the
37             # Microarray::DataMatrix::CdtFile abstraction, because it deals with
38             # the cdtFile in the context of gtr and/or atr files. It also
39             # provides methods by which the geneXplorer program can interact with
40             # a cdtDataset.
41             # The essential purpose of CdtDataset's initialization functions is to
42             # de-construct the .cdt file into its constituent data parts of the
43             # dataset:
44             # 1) the data matrix (.data_matrix)
45             # 2) the bioassay names or slidenames (.expt_info)
46             # 3) the annotations of the spotted features/reporters/sequences
47             # (.feature_info)
48             # 4) any additional meta information about the set (.meta)
49             # 5) additionally, it computes or creates the following:
50             # a) a binary file containing a list of feature-feature
51             # correlations (.binCor)
52             # b) a 2-color image representation of the data matrix
53             # (.data_matrix.png)
54             # c) a image representation of the expt_info file
55             # (.expt_info.png)
56             #
57             # Known Issues: There are good reasons to add additional meta data to
58             # a dataset, including possibly the organism of the set or the
59             # location of the default display configuration file to display the
60             # .feature_info. These would probably have to be called in the constructor.
61             #
62             #
63             # Note: there are many vestiges of code left over from previous
64             # developers. These could be viewed as either an emergent/embryonic
65             # API or alternatively as vestigial limbs needing to be lopped off.
66             # i.e. the need for these methods/accessors existed at some time in
67             # the past, and may indeed be resurrected in the future. Currently,
68             # they are not supported.
69             #
70             #
71             # Future Plans: Currently, only the .cdt file of a clustered dataset
72             # is utilized. In the future, the other data files detailing the
73             # clustering [gene tree(.gtr) and array tree(.atr)] should be
74             # utilized, and DatasetImageMaker should export suitable image
75             # representations for these files. Furthermore, It would be great to
76             # pull general dataset methods from this class into a future class,
77             # Microarray::Dataset. That way, you could make a MageMLDataset class
78             # as well, and still keep many of the general class attributes/methods
79             # in the same locations. Microarray:Dataset would inherit constructor
80             # methods (i.e. knowledge of the file structure) from either
81             # CdtDataset orMageMLDataset at initialization (perhaps a run-time ISA
82             # declaration within the constructor). Otherwise, I don't see a huge
83             # advantage to having these specialized (and somewhat misnamed)
84             # classes, in the sense that Dataset only need to know how to parse
85             # the initialization file while convertind a new dataset
86              
87              
88              
89 1     1   8277 use strict;
  1         2  
  1         35  
90 1     1   1741 use GD;
  0            
  0            
91             use File::Basename;
92              
93             use Microarray::Config;
94             use Microarray::DatasetImageMaker;
95             use Microarray::Utilities::Filesystem qw(DirectoryIsValid EnsureTrailingSlash);
96             use Microarray::DataMatrix::CdtFile;
97             use Microarray::DataMatrix::PclFile;
98              
99              
100             my $dbg = 0;
101              
102             my $VERSION = "0.1";
103              
104             my $PACKAGE = 'Microarray::CdtDataset';
105              
106             my $kCdtBase = $PACKAGE.'::__cdtBase';
107             my $kCdtPath = $PACKAGE.'::__cdtPath';
108              
109              
110             my $kName = $PACKAGE.'::__name'; # the full qualified name of the dataset
111             my $kFileBaseName = $PACKAGE.'::__fileBaseName'; # the stem of the file names (for dataset)
112              
113             my $kDataPath = $PACKAGE.'::__dataPath';
114             my $kImagePath = $PACKAGE.'::__imagePath';
115             my $kContrast = $PACKAGE.'::__contrast';
116             my $kShouldInitialize = $PACKAGE.'::__shouldInitialize';
117             my $kCdtFileObject = $PACKAGE.'::__cdtFileObject';
118             my $kCdtFileName = $PACKAGE.'::__cdtFileName';
119             my $kHeight = $PACKAGE.'::__height';
120             my $kWidth = $PACKAGE.'::__width';
121             my $kColorScheme = $PACKAGE.'::__colorscheme';
122             my $kConfig = $PACKAGE.'::__config';
123             my $kCorrCutoff = $PACKAGE.'::__corrCutoff';
124              
125             my $kDefaultContrast = 4;
126             my $kDefaultInitialization = 0;
127             my $kDefaultCorrCutoff = 0.5;
128             my $kDefaultColorScheme = 'yb'; # yellow/blue
129              
130             my $kMinCorrCutoff = 0.2;
131              
132             my $kImgType = Microarray::Config->ImageType;
133              
134             my @metaColumns = ($kWidth, $kHeight, $kContrast, $kColorScheme, $kCorrCutoff);
135              
136             my %kColorSchemeTranslation = Microarray::Config->ColorSchemeTranslationHash;
137              
138             my $kCdtSuffix = '.cdt';
139             my $kMetaSuffix = '.meta';
140             my $kLockSuffix = '.lock';
141             my $kFeatureSuffix = '.feature_info';
142             my $kMatrixSuffix = '.data_matrix';
143             my $kExptInfoSuffix = '.expt_info';
144             my $kPclSuffix = '.pcl';
145             my $kStdCorrSuffix = '.stdCor';
146             my $kBinCorrSuffix = '.binCor';
147              
148             my $kInfoGifSuffix = $kExptInfoSuffix.'.'.$kImgType;
149             my $kMatrixGifSuffix = $kMatrixSuffix.'.'.$kImgType;
150              
151             my @kRequiredFileSuffixes = (
152             # cdt not required, unless we start copying it there
153             # $kCdtSuffix,
154             $kMetaSuffix,
155             $kFeatureSuffix,
156             $kMatrixSuffix,
157             $kExptInfoSuffix,
158             $kBinCorrSuffix
159             # images may reside in a different directory than the data, should fix this
160             # $kInfoGifSuffix,
161             # $kMatrixGifSuffix
162             );
163              
164              
165             ####################################################################
166             sub new {
167             ####################################################################
168             # This is the constructor. There are two modes in which the
169             # constructor can be used. In one mode, it will create various files
170             # which support the dataset, using the cdt, (and hopefully in the
171             # future, gtr and atr files). In the second mode, it will assume that
172             # these files already exist and just return the constructed objevt.
173             # Thus when a dataset is first created, there will be the overhead of
174             # creating the additional files, but subsequent creation of a
175             # cdtDataset object will not have that overhead. The constructor
176             # takes the following arguments:
177             #
178             # name : The fully qualified name of the dataset (slash/delimited),
179             # which encodes the location and stem of the files,
180             # without any extensions, and with no path
181             # information. If the 'initialize' argument is set
182             # (see below), a directory tructure of the same name
183             # will also be created to contain the exported data
184             # files.
185             #
186             # datapath : This required path prefix is where any newly created data
187             # files should be placed (or read from).
188             #
189             # imagepath : An optional path prefix where any newly created image files
190             # should be placed (or read from). Will default to
191             # datapath if none is specified.
192             #
193             # contrast : If a dataset is being instantiated for the first
194             # time, then a contrast is needed for image
195             # generation. If no contrast is provided, then a
196             # default value of 4 will be used. As the data are
197             # expected to be in log base 2, this corresponds to a
198             # 16-fold change as the maximum color in any image.
199             #
200             # colorscheme : Can either be 'red/green' (the default if none is
201             # specified) or 'yellow/blue'
202             #
203             # corrcutoff : If a dataset is initiated for the first time, correlation
204             # values are generated for each feature-pair and values
205             # above the cutoff are saved in a binary .binCor file
206             #
207             # initialize : A filepath of the originating .cdt file indicate
208             # whether to initialize all the required supporting
209             # files that a cdtDataset needs. This defaults to 0
210             # (assumes that the necessary supporting files already
211             # exist. If it is a filepath, then the dataset is
212             # initialized using it
213             #
214             #
215             # Note that if you supply a contrast, you must set initialize to 1, as
216             # a contrast is useless in the absence of initialization. Both the
217             # 'dataset' and 'path' arguments are absolutely required.
218             #
219             # Usage, eg if you have a file:
220             #
221             # my $ds = Microarray::CdtDataset->new(name=>dataset/name, # name of the dataset
222             # datapath=>$dir, # prefix path where dataset files will be written
223             # contrast=>2, # image contrast
224             # initialize=>/path/to/file.cdt);
225              
226              
227             my $class = shift;
228             my $self = { };
229              
230             bless ($self, $class);
231              
232             eval {
233             $self->__init(@_);
234             };
235              
236             if ($@) {
237             die "The following error occurred: $@\n";
238             };
239              
240             return $self;
241             }
242              
243             ############################################################################
244             sub __init{
245             ############################################################################
246             # This method takes care of all of the initialization of the
247             # attributes of the cdtDataset
248              
249             my $self = shift;
250              
251             $self->__checkAndSetConstructorArguments(@_);
252              
253             if ($self->__shouldInitialize){
254              
255             $self->__initializeDataset;
256              
257             $self->__setShouldInitialize(0); # so we know it's done
258              
259             }else{
260              
261             $self->__checkRequiredFilesExist;
262              
263             # we need to load some meta information instead
264             $self->_load_meta;
265              
266             }
267              
268             # now load all the required feature and experiment info into
269             # memory
270              
271             $self->__loadExptInfo;
272             $self->__loadFeatureInfo;
273              
274             }
275              
276             ############################################################################
277             sub __checkAndSetConstructorArguments{
278             ############################################################################
279             # This private method checks that the constructor arguments pass all
280             # sanity checks, and that files that should exist do exist.
281              
282             my ($self, %args) = @_;
283              
284             $self->__checkAndSetInitializationState(%args);
285             $self->__checkAndSetConfig(%args);
286             $self->__checkAndSetDataPath(%args);
287             $self->__checkAndSetImagePath(%args);
288             $self->__checkAndSetDatasetName(%args);
289             $self->__checkAndSetContrast(%args);
290             $self->__checkAndSetColorScheme(%args);
291             $self->__checkAndSetCorrCutoff(%args);
292              
293             }
294              
295              
296             ############################################################################
297             sub __checkAndSetInitializationState{
298             ############################################################################
299             # This method checks and sets whether the object needs full
300             # initialization. There are meant to be 2 initilization requests.
301             # The first (initialization=>) would request that the dataset be
302             # created de novo from an initial file, and the second
303             # (initialization=>1) would just remake the images with a different
304             # constrast and different colors. The second initialization has not
305             # been adequately tested.
306              
307             my ($self, %args) = @_;
308            
309             if (exists($args{'initialize'})){
310              
311             # the argument must be a cdt file path or a boolean
312             if (-e $args{'initialize'}){
313              
314             if (!-r $args{'initialize'}){
315            
316             die "The cdt file for initialization, $args{'initialize'}, is not readable.";
317            
318             }elsif (!-T $args{'initialize'}){
319            
320             die "The cdt file for initialization, $args{'initialize'}, is not a text file.";
321              
322             }
323              
324             $self->__setCdtFileName($args{'initialize'});
325             $self->__setShouldInitialize(1);
326              
327             }elsif ($args{'initialize'} == 1){
328              
329             # this should signify that the caller wants to re-make the
330             # images only, based on a pre-exisiting dataset. JCM
331             # note: This probably has not been thoroughly tested! (no
332             # explicit client available (though easy to modify
333             # bin/makeMicroarrayDataset), and change of constructor
334             # API has occurred)
335              
336             $self->__setShouldInitialize(1);
337              
338             }else {
339              
340             die "The 'initialize' argument must be equal to 1 or be a valid $kCdtSuffix filepath. \nA value of '$args{'initialize'}' was supplied\n";
341             }
342              
343             }else{
344              
345             $self->__setShouldInitialize($kDefaultInitialization); # set default
346              
347             }
348             }
349              
350             ############################################################################
351             sub __checkAndSetConfig{
352             ############################################################################
353             # This private method checks and stores the config object that should
354             # be passed in
355              
356             my ($self, %args) = @_;
357              
358             if (exists($args{'config'})){
359              
360             if ($args{'config'}->isa("Microarray::Config")){
361              
362             $self->{$kConfig} = $args{'config'};
363              
364             }else{
365              
366             die "The 'config' argument you provided is not a Microarray::Config object.";
367              
368             }
369              
370             }else{
371              
372             die "A 'config' argument was not supplied to the ".ref($self)." constructor.";
373              
374             }
375              
376             }
377              
378             ############################################################################
379             sub __checkAndSetDataPath{
380             ############################################################################
381             # This private method checks that an Path is supplied, that
382             # corresponds to an existent directory, then stores it in the object.
383              
384             my ($self, %args) = @_;
385              
386             if (exists($args{'datapath'})){
387              
388             # check that it's good
389              
390             unless (&DirectoryIsValid($args{'datapath'})) {
391              
392             die "The supplied 'datapath', $args{'datapath'}, either does not exist or is not a directory.";
393              
394             }
395              
396             # fix up the Path so it has a trailing forward slash
397             $args{'datapath'} = &EnsureTrailingSlash($args{'datapath'});
398              
399             }else{
400              
401             die "A 'datapath' argument was not supplied to the ".ref($self)." constructor.";
402            
403             }
404              
405             $self->__setDataPath($args{'datapath'});
406              
407             }
408              
409              
410             ############################################################################
411             sub __checkAndSetImagePath{
412             ############################################################################
413             # This private method checks that an Path is supplied, that
414             # corresponds to an existent directory, then stores it in the object.
415              
416             my ($self, %args) = @_;
417              
418             if (exists($args{'imagepath'})){
419              
420             # check it's good
421              
422             unless (&DirectoryIsValid($args{'imagepath'})) {
423              
424             die "The supplied 'imagepath', $args{'imagepath'}, either does not exist or is not a directory.";
425              
426             }
427              
428             # fix up the Path so it has a trailing forward slash
429             $args{'imagepath'} = &EnsureTrailingSlash($args{'imagepath'});
430              
431             }else{
432              
433             # use datapath as the default (generic module)
434             $args{'imagepath'} = $self->datapath;
435              
436             }
437              
438             $self->__setImagePath($args{'imagepath'});
439              
440             }
441              
442              
443             ############################################################################
444             sub __checkAndSetDatasetName{
445             ############################################################################
446             # This method checks that a dataset was given to the constructor. In
447             # addition because CdtDataset creates and stores all its images and
448             # data in a directory hierarchy, the initially specified data and
449             # image paths are augmented with the dataset name directories (which
450             # are created upon initialization)
451              
452             my ($self, %args) = @_;
453              
454             # check we have a good name
455              
456             if (!exists($args{'name'})){
457              
458             die "The required 'name' argument was not supplied to the ".ref($self)." constructor.";
459              
460             }
461              
462             my $fullPathToData = &EnsureTrailingSlash($self->datapath.$args{'name'});
463             my $fullPathToImages = &EnsureTrailingSlash($self->imagepath.$args{'name'});
464              
465             if ($self->__shouldInitialize) { # make them if initialization was requested
466              
467             $self->__setDataPath($self->__ensureDirectoriesExist($self->datapath, $args{'name'}));
468             $self->__setImagePath($self->__ensureDirectoriesExist($self->imagepath, $args{'name'}));
469              
470              
471             }else{ # make certain the dataset is where they said it is
472              
473             &DirectoryIsValid($fullPathToData) || die "Could not validate dataset directories (datapath + dataset name) residence at $fullPathToData.\n";
474              
475             &DirectoryIsValid($fullPathToImages) || die "Could not validate dataset image directories (image path + dataset name) residence at $fullPathToImages.\n";
476              
477              
478             }
479              
480             my $datasetBase = basename($args{'name'});
481              
482             $self->__setDataPath($fullPathToData); # augmenting datapath with data set directory name
483             $self->__setImagePath($fullPathToImages); # augmenting imagepath with data set directory name
484             $self->__setDatasetName($args{'name'});
485             $self->__setFileBaseName($datasetBase);
486              
487             }
488              
489              
490              
491             ############################################################################
492             sub __checkAndSetContrast{
493             ############################################################################
494             # This method determines if the contrast is valid, and then stores the
495             # value in the object
496              
497             my ($self, %args) = @_;
498              
499             if (exists($args{'contrast'})){
500            
501             if (!exists($args{'initialize'})){
502            
503             die "A 'contrast' argument was provided to the ".ref($self)." constructor, but an initialize argument was not.";
504            
505             }elsif ($args{'contrast'} <= 0){
506            
507             die "The supplied value for the 'contrast' argument must be greater than zero. A value of $args{'contrast'} was supplied.";
508            
509             }
510            
511             $self->__setContrast($args{'contrast'});
512            
513             }else{
514              
515             $self->__setContrast($kDefaultContrast); # set the default
516              
517             }
518              
519             }
520              
521             ############################################################################
522             sub __checkAndSetColorScheme{
523             ############################################################################
524             # This method determines if the colorscheme is valid, and then stores
525             # the value in the object
526              
527             my ($self, %args) = @_;
528              
529             if (exists($args{'colorscheme'})){
530            
531             if (!exists($args{'initialize'})){
532            
533             die "A 'colorscheme' argument was provided to the ".ref($self)." constructor, but an initialize argument was not.";
534            
535             }elsif (!exists($kColorSchemeTranslation{$args{'colorscheme'}})){
536            
537             die "The supplied value for the 'colorscheme' argument must be one of \n\n".
538             join("\n", keys %kColorSchemeTranslation)."\n\nA value of $args{'colorscheme'} was supplied.";
539            
540             }
541            
542             $self->__setColorScheme(($kColorSchemeTranslation{$args{'colorscheme'}}));
543            
544             }else{
545              
546             $self->__setColorScheme($kDefaultColorScheme); # set the default
547              
548             }
549              
550             }
551              
552             ############################################################################
553             sub __checkAndSetCorrCutoff{
554             ############################################################################
555             # This method determines if the correlation cutoff value is valid, and then stores
556             # the value in the object
557              
558             my ($self, %args) = @_;
559              
560             if (exists($args{'corrcutoff'})){
561            
562             if (!exists($args{'initialize'})){
563            
564             die "A 'corrcutoff' argument was provided to the ".ref($self)." constructor, but an initialize argument was not.";
565            
566             }elsif ($args{'corrcutoff'} !~ /^[\d\.]+$/g
567             || $args{'corrcutoff'} > 1
568             || $args{'corrcutoff'} < $kMinCorrCutoff){
569            
570             die "The supplied value for the 'corrcutoff' argument must be a number \n\n".
571             "$kMinCorrCutoff =< corrcutoff =< 1"."\n\nA value of $args{'corrcutoff'} was supplied.";
572             }
573            
574             $self->__setCorrCutoff($args{'corrcutoff'});
575            
576             }else{
577              
578             $self->__setCorrCutoff($kDefaultCorrCutoff); # set the default
579              
580             }
581              
582             }
583              
584              
585             #####################################################################
586             sub __checkRequiredFilesExist{
587             #####################################################################
588             # This method checks that all the required files for the dataset exist
589             # If they do not, it will cause a fatal error
590              
591             my $self = shift;
592              
593             my $prefix = $self->datapath;
594             $prefix = &EnsureTrailingSlash($prefix);
595              
596             $prefix .= $self->fileBaseName;
597              
598             foreach my $suffix (@kRequiredFileSuffixes){
599              
600             die $prefix.$suffix." does not exist." if (!-e $prefix.$suffix);
601              
602             }
603             }
604              
605              
606             ############################################################################
607             sub __setCdtInfo {
608             ############################################################################
609             # this subroutine takes the initalize arguement and store the path and
610             # the stem of the .cdt filename
611              
612             my ($self, $filepath);
613              
614             my ($base, $inpath, $suffix) = fileparse($filepath, $kCdtSuffix);
615             $self->{$kCdtBase} = $base;
616             $self->{$kCdtPath} = $inpath;
617              
618              
619             }
620              
621              
622              
623             #####################################################################
624             sub __setFileBaseName {
625             #####################################################################
626             # This method allows the filename stem (no suffix) of the datafiles
627             # use to initialize the dataset to be set
628              
629             my ($self, $nameBase) = @_;
630             $self->{$kFileBaseName} = $nameBase;
631             }
632              
633              
634             #####################################################################
635             sub __setDataPath {
636             #####################################################################
637             # This method allows the path to where the data files for the dataset
638             # exist to be set
639              
640             my ($self, $outpath) = @_;
641             $self->{$kDataPath} = $outpath;
642             }
643              
644              
645             #####################################################################
646             sub __setImagePath {
647             #####################################################################
648             # This method allows the path to where the image files for the dataset
649             # exist to be set
650              
651             my ($self, $outpath) = @_;
652             $self->{$kImagePath} = $outpath;
653             }
654              
655              
656             #####################################################################
657             sub __setDatasetName {
658             #####################################################################
659             # This method allows the name of the dataset to be set.
660              
661             my ($self, $datasetName) = @_;
662             $dbg && print "\t dataset name $datasetName \n";
663             $self->{$kName} = $datasetName;
664             }
665              
666              
667             #####################################################################
668             sub __setCdtFileName {
669             #####################################################################
670             # This method sets the name of the cdtFile
671              
672             my ($self, $cdtFile) = @_;
673             $self->{$kCdtFileName} = $cdtFile;
674             }
675              
676              
677             #####################################################################
678             sub __setContrast {
679             #####################################################################
680             # This method allows the contrast to be set.
681              
682             my ($self, $contrast) = @_;
683             $self->{$kContrast} = $contrast;
684             }
685              
686              
687             #####################################################################
688             sub __setColorScheme {
689             #####################################################################
690             # This method allows the colorscheme to be set.
691              
692             my ($self, $colorScheme) = @_;
693             $self->{$kColorScheme} = $colorScheme;
694             }
695              
696              
697             #####################################################################
698             sub __setCorrCutoff {
699             #####################################################################
700             # This method allows the correaltion cutoff to be set.
701              
702             my ($self, $corrCutoff) = @_;
703             $self->{$kCorrCutoff} = $corrCutoff;
704             }
705              
706              
707             #####################################################################
708             sub __setShouldInitialize {
709             #####################################################################
710             # This method allows a flag to be set as to whether full
711             # initialization need to take place
712              
713             my ($self, $shouldInitialize) = @_;
714             $self->{$kShouldInitialize} = $shouldInitialize;
715             }
716              
717              
718             #####################################################################
719             sub __setHeight {
720             #####################################################################
721             # This private method allows the 'height' of the dataset to be set.
722             # This in fact corresponds to the number of rows in the cdt file.
723              
724             my ($self, $height) = @_;
725             $self->{$kHeight} = $height;
726             }
727              
728              
729             #####################################################################
730             sub __setWidth {
731             #####################################################################
732             # This private method allows the 'height' of the dataset to be set.
733             # This in fact corresponds to the number of rows in the cdt file.
734              
735             my ($self, $width) = @_;
736             $self->{$kWidth} = $width;
737             }
738              
739              
740             #####################################################################
741             sub name {
742             #####################################################################
743             # This method returns the fully qualified name of the dataset
744              
745             return $_[0]->{$kName};
746             }
747              
748              
749             #####################################################################
750             sub _cdtFileName {
751             #####################################################################
752             # This method returns the name of the cdtFile
753              
754             return $_[0]->{$kCdtFileName};
755             }
756              
757              
758             #####################################################################
759             sub _cdtBase {
760             #####################################################################
761             # This method returns the base name string of the files comprising of
762             # the dataset, sans suffices
763              
764             return $_[0]->{$kCdtBase};
765             }
766              
767              
768             #####################################################################
769             sub _cdtPath {
770             #####################################################################
771             # This method returns the path to the cdt file of thebeing converted
772             # into a dataset
773              
774             return $_[0]->{$kCdtPath};
775             }
776              
777              
778             ######################################################################
779             sub __ensureDirectoriesExist {
780             ######################################################################
781             # This subroutine checks to see that the full outpath is created if
782             # necessary, by extended a previouslt validated filepath. It is
783             # tended for use only when initializating a dataset, where the dataset
784             # directories might need to be created and appended to the data and
785             # image out paths
786              
787             my ($self, $path, $possibleExtension) = @_;
788              
789             my @dirnames = split(/\//, $possibleExtension);
790              
791             $dbg && print "Ensuring that all directories exist for the dataset initialization\n";
792             $dbg && print "\t$path + @dirnames\n";
793              
794              
795             $path = &EnsureTrailingSlash($path);
796              
797             while (my $dir = shift(@dirnames)) {
798              
799             # extend the tree if requested
800             $path .= "$dir/";
801              
802             # does it exist, or should we make it (new directory)?
803             unless (&DirectoryIsValid($path)) {
804              
805             $dbg && print "\tcreating $path directory\n";
806              
807             # if not, create it first...
808             mkdir($path, 0775) || die "Couldn't create directory: $path";
809              
810             }
811             $path = &EnsureTrailingSlash($path);
812             }
813              
814             return $path; # this should be the validated, extended path
815              
816             }
817              
818              
819             #####################################################################
820             sub datapath {
821             #####################################################################
822             # This method returns the path to which data files either written
823             # or read from
824              
825             return $_[0]->{$kDataPath};
826             }
827              
828              
829             #####################################################################
830             sub imagepath {
831             #####################################################################
832             # This method returns the path to which image files are either written
833             # or read from
834              
835             return $_[0]->{$kImagePath};
836             }
837              
838              
839             #####################################################################
840             sub contrast {
841             #####################################################################
842             # This method returns the contrast
843              
844             return $_[0]->{$kContrast};
845             }
846              
847              
848             #####################################################################
849             sub colorScheme {
850             #####################################################################
851             # This method returns the colorScheme
852              
853             return $_[0]->{$kColorScheme};
854             }
855              
856             #####################################################################
857             sub corrCutoff {
858             #####################################################################
859             # This method returns the correlation cutoff
860              
861             return $_[0]->{$kCorrCutoff};
862             }
863              
864              
865             #####################################################################
866             sub fileBaseName {
867             #####################################################################
868             # This method returns the base name string of the files comprising of
869             # the dataset, sans suffices
870              
871             return $_[0]->{$kFileBaseName};
872             }
873              
874              
875             #####################################################################
876             sub height {
877             #####################################################################
878             # This method returns the number of data rows in the cdtFile
879              
880             return $_[0]->{$kHeight};
881             }
882              
883              
884             #####################################################################
885             sub width {
886             #####################################################################
887             # This method returns the number of data columns in the cdtFile
888              
889             return $_[0]->{$kWidth};
890             }
891              
892             #####################################################################
893             sub __config{
894             #####################################################################
895             # This private method returns the config object that was used during
896             # construction
897              
898             return $_[0]->{$kConfig};
899              
900             }
901              
902             #####################################################################
903             sub __cdtFileObject {
904             #####################################################################
905             # This private method returns a cdtFile Object. If one does not exist
906             # within the object, one will be created. If one does exist, that
907             # will simply be returned. This will likely fail for sets that are
908             # already converted, because the .cdt file is not copied into the
909             # dataset location. This is a design issue that needs to be
910             # discussed, in addition to the fact that it is private method, when
911             # it seems like other software might actually *want* to retrieve the
912             # Datamatix object
913              
914             my $self = shift;
915              
916             if (!exists($self->{$kCdtFileObject})){ # we need to create one
917              
918             $self->{$kCdtFileObject} = Microarray::DataMatrix::CdtFile->new(file => $self->_cdtFileName,
919             tmpDir => $self->__config->tmpPath);
920             }
921              
922             return $self->{$kCdtFileObject};
923             }
924              
925              
926             #####################################################################
927             sub __shouldInitialize {
928             #####################################################################
929             # This private method returns whether the object needs initialization
930              
931             return $_[0]->{$kShouldInitialize};
932             }
933              
934              
935             #####################################################################
936             sub __initializeDataset {
937             #####################################################################
938             # This method creates a new dataset from a CDT (clustered data) file.
939             # The CDT file format was defined by Michael Eisen for his Windows
940             # applications TreeView and Cluster. It has certain drawbacks, for
941             # example not more then two columns per gene can be used to store
942             # additional information. This can be partly resolved by putting more
943             # data into one record field. A kludgy fix.
944              
945             my $self = shift;
946              
947             $self->__lock; # lock dataset
948              
949             # first extract salient info from cdtfile, and
950             # create the correlations files
951              
952             $self->__dissectCDT;
953             $self->__prepareCorrelations;
954             $self->__compressCorrelations;
955              
956              
957             # should have enough data dissected so far to load the experiment
958             # info, and we'll need these for the image header, so load them
959              
960             $self->__loadExptInfo;
961              
962             # now create the required images
963             my $imageMaker = Microarray::DatasetImageMaker->new();
964             $imageMaker->makeImage('dataset' => $self,
965             'type' => 'matrix');
966             $imageMaker->makeImage('dataset' => $self,
967             'type' => 'header');
968            
969             # now write out our 'meta' information,
970             # for quick access later on
971              
972             $self->__prepareMetaFile;
973              
974             $self->__unlock; # unlock dataset
975            
976             }
977              
978              
979             #####################################################################
980             sub __lock {
981             #####################################################################
982             # This method locks the dataset
983              
984             my $self = shift;
985             my $lockFile = $self->datapath.$self->fileBaseName.$kLockSuffix;
986             open (OUT, ">".$lockFile) || die "Cannot lock dataset using lockfile $lockFile : $!";
987             close (OUT);
988              
989             }
990              
991              
992             #####################################################################
993             sub __unlock {
994             #####################################################################
995             # This method unlocks the dataset
996              
997             my $self = shift;
998             my $lockFile = $self->datapath.$self->fileBaseName.$kLockSuffix;
999             unlink ($lockFile);
1000             }
1001              
1002              
1003             #####################################################################
1004             sub __dissectCDT {
1005             #####################################################################
1006             # This method determines the contents of the cdtfile, and stores some
1007             # of the cdtMeta data for quick retrieval. Note that the previous
1008             # version did its own parsing of the cdtFile. This is now delegated
1009             # to the cdtFile object.
1010              
1011             my $self = shift;
1012              
1013             my $cdtFileObject = $self->__cdtFileObject;
1014              
1015             $self->__setWidth($cdtFileObject->numColumns);
1016             $self->__setHeight($cdtFileObject->numRows);
1017              
1018             $self->__saveCdtExptNames($cdtFileObject->columnNamesArrayRef);
1019            
1020             $cdtFileObject->createIndexFile($self->datapath.$self->fileBaseName.$kFeatureSuffix);
1021             $cdtFileObject->createRawMatrixFile($self->datapath.$self->fileBaseName.$kMatrixSuffix);
1022              
1023             }
1024              
1025              
1026             ######################################################################
1027             sub __saveCdtExptNames {
1028             ######################################################################
1029             # This method (we may eliminate it later) save the names of the data
1030             # columns from the cdtFile (these are usually the experiment names) to
1031             # a file. This is later used by GeneXplorer, but also provides a
1032             # quick way of looking up the data, without having to read the cdtFile
1033             # in.
1034              
1035             my ($self, $exptNamesARef) = @_;
1036              
1037             my $file = $self->datapath.$self->fileBaseName.$kExptInfoSuffix;
1038              
1039             # write index number and name to file
1040              
1041             open (OUT, ">".$file) || die "Cannot write out experiment info to $file : $!";
1042              
1043             print OUT "ID\tNAME\n";
1044              
1045             for (my $i=0; $i < @{$exptNamesARef}; $i++){
1046            
1047             print OUT $i, "\t", $exptNamesARef->[$i], "\n";
1048            
1049             }
1050            
1051             close (OUT);
1052              
1053             }
1054              
1055              
1056             #####################################################################
1057             sub __prepareCorrelations {
1058             #####################################################################
1059             # This method prepares a correlations file
1060              
1061             my $self = shift;
1062              
1063             # first we have to create a pcl file, with an index in the first
1064             # column
1065              
1066             my $pclFileName = $self->datapath.$self->fileBaseName.$kPclSuffix;
1067              
1068             print "pcl name: $pclFileName\n";
1069              
1070             print "tmp path: ".$self->__config->tmpPath."\n";
1071              
1072             $self->__createIndexedPclFile($pclFileName);
1073              
1074             my $pcl = Microarray::DataMatrix::PclFile->new(tmpDir => $self->__config->tmpPath,
1075             file => $pclFileName);
1076              
1077             # then use the pcl file to create correlations
1078              
1079             $pcl->createCorrelationsFile(cutoff=>$self->corrCutoff);
1080              
1081             # Now we can get rid of the pcl file
1082             unlink $pclFileName || warn "Couldn't unlink $pclFileName : $!";
1083            
1084             }
1085              
1086              
1087             #####################################################################
1088             sub __createIndexedPclFile{
1089             #####################################################################
1090             # This method creates a pcl file from the cdt file that was used to
1091             # instantiate the object. This is coded here, rather than using the
1092             # cdtFile method to convert to a pcl, because the pcl file must have
1093             # an index for it's names, rather than the names themselves.
1094              
1095             my ($self, $pclFile) = @_;
1096              
1097             open (IN, $self->_cdtFileName) || die "Cannot open cdt file ".$self->_cdtFileName." : $!";
1098              
1099             print "trying to create $pclFile \n\n";
1100              
1101             open (OUT, ">".$pclFile) || die "Cannot create $pclFile : $!";
1102              
1103             my $count = 1;
1104             my $index = 0;
1105             my $hasGtr = 0;
1106             my @line;
1107             my $numColumns;
1108              
1109             while (){
1110              
1111             chomp;
1112              
1113             @line = split("\t", $_, -1);
1114              
1115             if ($count == 1){
1116              
1117             if ($line[0] eq "GID"){
1118              
1119             shift (@line);
1120             $hasGtr = 1;
1121              
1122             }
1123              
1124             $numColumns = scalar(@line);
1125              
1126             print OUT join("\t", @line), "\n";
1127             print OUT "EWEIGHT\t\t";
1128              
1129             print OUT "\t1" x (scalar(@line)-3), "\n";
1130              
1131             $count++;
1132              
1133             next;
1134              
1135             }elsif ($count == 2){
1136              
1137             $count++;
1138              
1139             next if ($line[0] eq "AID");
1140              
1141             }
1142              
1143             # if we get here, it's a data line
1144              
1145             shift (@line) if $hasGtr;
1146              
1147             next if !@line;
1148              
1149             $line[0] = $index;
1150              
1151             if (scalar(@line) != $numColumns){
1152              
1153             die "In your cdtFile, data line $index has ".scalar(@line)." columns, instead of $numColumns.\n";
1154              
1155             }
1156              
1157             print OUT join("\t", @line), "\n";
1158              
1159             $index++;
1160              
1161             }
1162              
1163             close OUT;
1164             }
1165              
1166              
1167             #####################################################################
1168             sub __compressCorrelations {
1169             #####################################################################
1170             # This method takes a correlations file as output by Gavin Sherlocks
1171             # correlations program. These represent the correlation values of a
1172             # certain gene (array element) intensity vector vs. all other vectors
1173             # in a data matrix.
1174             # The output generated is a binary representation of the list of
1175             # correlation values for each row in the data matrix (= expression
1176             # vectors).
1177             #
1178             # The file is built like this:
1179             # ############################
1180             # name content bytes
1181             # ################################################
1182             # header
1183             # ################################################
1184             # index_size length of index 2
1185             # index offset for rows index_size * 2
1186             # ################################################
1187             # body
1188             # ################################################
1189             # data 1..n correlation data 4 * look up in index
1190             # -> index correlated vector 2 \
1191             # -> corr correlation 2 / 2 words (16 int)
1192              
1193             # The correlation data is stored in lists of pairs of the most
1194             # correlated vectors index number (row in the table) and the
1195             # correlation value. The correlation value has been multiplied by
1196             # (2^16)-1 (65535) to make it an integer. To retrieve the original
1197             # value, divide the integer by (2^16)-1. No negative correlation
1198             # values are allowed.
1199              
1200             my $self = shift;
1201              
1202             my $corrFile = $self->datapath.$self->fileBaseName.$kStdCorrSuffix;
1203              
1204             my $header = "";
1205             my $numVecs = 0;
1206             my $body = "";
1207            
1208             open(IN, $corrFile) || die "cannot open correlations file $corrFile: $!\n";
1209              
1210             while (){
1211              
1212             chomp;
1213              
1214             my @values = split(/\t/, $_); # split each row, containing the index/correlation pairs
1215             my $index = shift @values; # first value in row is the index of the vector we correlate to
1216             my $numCorrVectors = scalar(@values)/2; # determine how many correlated vectors exist for this one (pairs/2)
1217            
1218             for (my $j=1; $j < @values; $j += 2) { # look at the correlations
1219              
1220             # multiply correlation values and make them into an int
1221             # between 0 and 65535
1222              
1223             $values[$j] = int($values[$j] * 65535);
1224              
1225             }
1226            
1227             # add the number of pairs to the file header
1228             $header .= pack("n", $numCorrVectors);
1229            
1230             # pack values into string of unsigned ints
1231             # and add the packed row to the file body
1232              
1233             $body .= pack("n*", @values);
1234              
1235             $numVecs++;
1236            
1237             }
1238            
1239             close IN;
1240              
1241             # store the number of vectors as first word of header
1242              
1243             $header = pack("n", $numVecs).$header;
1244            
1245             my $binFile = $self->datapath.$self->fileBaseName.$kBinCorrSuffix;
1246            
1247             open (OUT, ">$binFile") || die "cannot open binary correlations $binFile: $!\n";
1248              
1249             print OUT $header, $body;
1250              
1251             close (OUT);
1252            
1253             # now get rid of the stdCor file, as we don;t need it now
1254              
1255             unlink $corrFile || warn "Can't remove $corrFile : $!\n";
1256              
1257             }
1258              
1259              
1260             #############################################################################
1261             sub __prepareMetaFile {
1262             #############################################################################
1263             # This method writes out a file of meta information that pertain to
1264             # the dataset, in the form of name=value pair.
1265              
1266             my $self = shift;
1267              
1268             my $file = $self->datapath.$self->fileBaseName.$kMetaSuffix;
1269              
1270             open(OUT, ">".$file) || die "Cannot create meta file, $file : $!";
1271              
1272             foreach my $key (@metaColumns) {
1273              
1274             print OUT $key, "=", $self->{$key}, "\n";
1275              
1276             }
1277              
1278             close(OUT);
1279            
1280             }
1281              
1282              
1283             #####################################################################
1284             sub _load_meta {
1285             #####################################################################
1286             # This method loads in previously cached meta data
1287              
1288             my $self = shift;
1289              
1290             my $filename = $self->datapath.$self->fileBaseName.$kMetaSuffix;
1291            
1292             open(IN, $filename) || die "cannot open _meta_ $filename $!";
1293              
1294             while (){
1295              
1296             chomp;
1297            
1298             my ($key, $value) = split("=",$_);
1299              
1300             $self->{$key} = $value;
1301              
1302             }
1303              
1304             close IN;
1305              
1306             }
1307              
1308             ###
1309             #
1310             #
1311             # STUFF FROM HERE ON DOWN IS CRAP.... so says Gavin?
1312             #
1313             ###
1314              
1315             # JCM note: Tried removing it, but many of these subroutines are still
1316             # used by both this class and Explorer. My guess is that the rest are
1317             # likely used by other clients that Christian wrote
1318              
1319             ######################################################################
1320             sub __loadExptInfo {
1321             ######################################################################
1322             ## This method loads the expt_info data
1323              
1324             my $self = shift;
1325             $self->{EXPT_INFO} = $self->__load_table("expt_info");
1326             }
1327              
1328              
1329             ######################################################################
1330             sub __loadFeatureInfo {
1331             ######################################################################
1332              
1333             my $self = shift;
1334             $self->{FEATURE_INFO} = $self->__load_table("feature_info");
1335             }
1336              
1337              
1338             ######################################################################
1339             sub __load_table {
1340             ######################################################################
1341             # loads an ASCII table. It is expected that the first row contains the
1342             # column headers It is also expected that the first column contains
1343             # numeric id's starting at '0'. returns a reference to the table
1344             # structure
1345              
1346             my ($self, $tableName) = @_;
1347              
1348             my $file = $self->datapath.$self->fileBaseName.".".$tableName;
1349              
1350             my ($table, $index, $i, @record);
1351              
1352             open(IN, $file) || die "cannot open _table $file $!\n";
1353            
1354             my $firstrow = ();
1355             chomp($firstrow);
1356              
1357             my @head = split("\t", $firstrow, -1);
1358              
1359             # since the first columns header is always 'ID' we discard it
1360             shift @head;
1361              
1362             while () {
1363              
1364             chomp;
1365             @record = split("\t", $_, -1);
1366             $index = shift @record;
1367              
1368             $i = 0;
1369             my %record = map { $head[$i++] => $_ } @record;
1370             $$table[$index] = \%record;
1371            
1372             }
1373              
1374             return $table;
1375              
1376             }
1377              
1378              
1379             ######################################################################
1380             sub image {
1381             ######################################################################
1382             # Returns the data matrix as a GD::Image, drawn with 1x1 pixel per
1383             # value at the contrast last used/initialized with $ds->new()
1384             #
1385             # Usage: $ds->image();
1386              
1387             my $self = shift;
1388              
1389             my $type = shift;
1390              
1391             my $image;
1392              
1393             if ($type eq "matrix") {
1394             $image = &_load_image($self->imagepath.$self->fileBaseName.".data_matrix.$kImgType");
1395             return $image;
1396             }
1397              
1398             if ($type eq "expt_info") {
1399             $image = &_load_image($self->imagepath.$self->fileBaseName.$kInfoGifSuffix);
1400             return $image;
1401             }
1402              
1403             die "This type of image is not known\n";
1404              
1405             }
1406              
1407              
1408             ######################################################################
1409             sub _load_image {
1410             ######################################################################
1411             # this protected method just opens up the previously stored matrix
1412             # image (from dataset initialization) , created a GD::Image object
1413             # with it, and returns it. Possible bug: it relies on GD::Image
1414             # version (>1.19) to pick $kImgType, when perhaps it should rely on
1415             # the filename suffix (.gif, .png) instead. This may prevent the
1416             # portability of intact datasets from one filesystem to another, but
1417             # in the end, you're always going to be limited by the version of GD...
1418              
1419             my $filename = shift;
1420              
1421             open(IN, $filename) || die "cannot open _image $filename! $!\n";
1422              
1423             my $funcname = "newFrom".ucfirst($kImgType);
1424             my $image = GD::Image->$funcname(*IN);
1425             return $image;
1426              
1427             }
1428              
1429              
1430             ######################################################################
1431             #sub vector {
1432             ######################################################################
1433             #
1434             # my $self = shift;
1435             # my $index = @_;
1436             #
1437             # # load the data matrix only on demand
1438             # if ( !defined( $self->{MATRIX} ) ) {
1439             #
1440             # $self->_load_matrix();
1441             #
1442             # }
1443             #
1444             # return wantarray ? @{$self->{MATRIX}[$index]} : $self->{MATRIX}[$index];
1445             #
1446             #}
1447             #
1448             #
1449             ######################################################################
1450             #sub _load_matrix {
1451             ######################################################################
1452             #
1453             # my $self = shift;
1454             #
1455             # my ($matrix, $index);
1456             #
1457             # my $filename = $self->datapath.$self->fileBaseName.".data_matrix";
1458             #
1459             # $dbg && print "\tload matrix filename: $filename\n";
1460             #
1461             # $matrix = [ ];
1462             #
1463             # open(IN, $filename) || die "cannot open _matrix_ $filename $!\n";
1464             #
1465             # while () {
1466             #
1467             # chomp;
1468             # my @row = split("\t", $_, -1);
1469             # $index = shift @row;
1470             #
1471             # $$matrix[$index] = \@row;
1472             #
1473             # }
1474             #
1475             # $self->{MATRIX} = $matrix;
1476             #
1477             #}
1478             #
1479             ######################################################################
1480             #sub getMatrixValue {
1481             ######################################################################
1482             #
1483             # my $self = shift;
1484             #
1485             # my $x = shift;
1486             # my $y = shift;
1487             #
1488             # if ($x > $self->width || $y > $self->height || $x < 0 || $y < 0) {
1489             # die "index values (x = $x, y = $y) exceed range in getMatrixValue!\n";
1490             # }
1491             #
1492             # return $self->{MATRIX}[$y][$x];
1493             #
1494             #}
1495             #
1496             #
1497             ######################################################################
1498             #sub _save_table {
1499             ######################################################################
1500             #
1501             # ### just started, has to be filled in 2000-10-
1502             #
1503             # # saves an ASCII table. It is expected that the first row contains the column headers
1504             # # It is also expected that the first column contains numeric id's starting at '0'.
1505             # # returns a reference to the table structure
1506             # my $self = shift;
1507             # my $suffix = shift;
1508             #
1509             # my $file = $self->datapath;
1510             # $file .= $self->fileBaseName;
1511             #
1512             # # save the feature table
1513             # if ($suffix =~ /feature/i) {
1514             #
1515             # $file .= ".feature_info";
1516             #
1517             # my @keys = $self->getFeatureKeys();
1518             #
1519             # open( OUT, ">$file" ) || die "cannot open $file in _save_table: $!\n";
1520             #
1521             # my $header = "ID\t".join("\t",@keys);
1522             #
1523             # $header .= "\n";
1524             #
1525             # print OUT $header;
1526             #
1527             # for(my $i=0;$i<$self->height;$i++) {
1528             #
1529             # print OUT $i;
1530             # foreach (@keys) {
1531             # print OUT "\t".$self->getFeature($i,$_);
1532             # }
1533             # print OUT "\n";
1534             #
1535             # }
1536             #
1537             # close(OUT);
1538             #
1539             # }
1540             #
1541             # # save experiment table
1542             # if ($suffix =~ /expt/) {
1543             #
1544             # my @keys = $self->getExperimentKeys();
1545             #
1546             # $file .= ".expt_info";
1547             #
1548             # open(OUT, ">$file") || die "cannot open $file in _save_table: $!\n";
1549             #
1550             # for(my $i=0;$i<$self->width;$i++) {
1551             #
1552             # print OUT $i;
1553             # foreach (@keys) {
1554             # print OUT "\t".$self->getFeature($i,$_);
1555             # }
1556             # print OUT "\n";
1557             #
1558             # }
1559             #
1560             # close(OUT);
1561             #
1562             # }
1563             #
1564             #
1565             #
1566             #}
1567             #
1568             ######################################################################
1569             #sub serialize {
1570             ######################################################################
1571             # writes the dataset annotation to the repository, used in conjunction
1572             # with setFeature()
1573             #
1574             # my $self = shift;
1575             #
1576             # $self->_save_table('feature');
1577             #
1578             #}
1579             #
1580             ######################################################################
1581             #sub getExperimentKeys { # wrapper to keep interface intact
1582             ######################################################################
1583             # my $self = shift;
1584             #
1585             # my @keys = ( keys %{$self->{EXPT_INFO}[0]} );
1586             #
1587             # return wantarray ? (@keys) : \@keys;
1588             #
1589             #}
1590             #
1591             #######################################################################
1592             #sub experiment_keys {
1593             #######################################################################
1594             #
1595             # my $self = shift;
1596             # my @keys = ( keys %{$self->{EXPT_INFO}[0]} );
1597             #
1598             # return wantarray ? (@keys) : \@keys;
1599             #
1600             #}
1601             #
1602             ######################################################################
1603             sub experiment {
1604             ######################################################################
1605              
1606             my $self = shift;
1607             my( $index, $field ) = @_;
1608              
1609             if ( $index > $self->width() ) {
1610             die "error: index larger then data\n";
1611             }
1612            
1613             if ( exists( $self->{EXPT_INFO}[$index]{$field} ) ) {
1614             return $self->{EXPT_INFO}[$index]{$field};
1615             } else {
1616             return "$index $field";
1617             }
1618              
1619             }
1620              
1621             ######################################################################
1622             #sub featureAttributeExists {
1623             ######################################################################
1624             # Returns true if the attribute passed as an argument
1625             # (e.g. 'CHROMOSOME') exists
1626             #
1627             # Usage: $ds->featureAttributeExists()
1628             #
1629             #
1630             # my $self = shift;
1631             # my $attr = shift;
1632             #
1633             # return (exists($self->{FEATURE_INFO}->[0]->{$attr}));
1634             #
1635             #}
1636              
1637             #####################################################################
1638             sub getFeatureKeys {
1639             #####################################################################
1640             # returns the keys (attributes) for the features (gene expression row
1641             # vectors)
1642             #
1643             #Usage: $ds->getFeatureKeys()
1644              
1645             my $self = shift;
1646            
1647             my @keys = ( keys %{$self->{FEATURE_INFO}[0]} );
1648              
1649             return wantarray ? (@keys) : \@keys;
1650              
1651             }
1652              
1653              
1654             #####################################################################
1655             sub feature {
1656             ######################################################################
1657             # required by the search function of Explorer
1658              
1659             my $self = shift;
1660             my( $index, $field ) = @_;
1661              
1662             if ( $index > $self->height ) {
1663             return "error: index larger then data\n";
1664             }
1665            
1666             if ( exists( $self->{FEATURE_INFO}->[$index]->{$field} ) ) {
1667             return $self->{FEATURE_INFO}->[$index]->{$field}
1668             } else {
1669             return "no field by this name: $field.";
1670             }
1671             }
1672              
1673              
1674             #####################################################################
1675             sub getFeature {
1676             #####################################################################
1677              
1678             my $self = shift;
1679             my( $index, $field ) = @_;
1680              
1681             if ( $index > $self->height ) {
1682             return "error: index larger then data\n";
1683             }
1684            
1685             if ( exists( $self->{FEATURE_INFO}->[$index]->{$field} ) ) {
1686             return $self->{FEATURE_INFO}->[$index]->{$field}
1687             } else {
1688             return "no field by this name: $field.";
1689             }
1690             }
1691              
1692              
1693             ######################################################################
1694             #sub setFeature {
1695             ######################################################################
1696             # Sets the column in row to [also see
1697             # serialize() ] JCM: This is probably for updating the dataset
1698             #
1699             #
1700             # my $self = shift;
1701             # my( $index, $field, $newval ) = @_;
1702             #
1703             # if ( $index > $self->height ) {
1704             #
1705             # return "error: index larger then data\n";
1706             #
1707             # }
1708             #
1709             # if ( exists( $self->{FEATURE_INFO}->[$index]->{$field} ) ) {
1710             #
1711             # $self->{FEATURE_INFO}->[$index]->{$field} = $newval;
1712             #
1713             # } else {
1714             #
1715             # # this adds a column to the table. We iterate over all
1716             # # rows and add the new row/record field
1717             # for (my $i=0;$i<$self->height;$i++) {
1718             # $self->{FEATURE_INFO}->[$i]->{$field} = '';
1719             # }
1720             #
1721             # # after we added the column, we assign the passed value
1722             # $self->{FEATURE_INFO}->[$index]->{$field} = $newval;
1723             #
1724             # }
1725             #
1726             #}
1727             #
1728              
1729              
1730             #####################################################################
1731             sub search {
1732             #####################################################################
1733             # Returns an array of data matrix row numbers where matched in
1734             # column . When using 'ALL' as , all
1735             # columns will be searched
1736              
1737             my $self = shift;
1738              
1739             my $query = shift;
1740             my $field = shift;
1741              
1742             my( @keys, @hits );
1743              
1744             if ($dbg) { print "Now searching $query in $field...\n", "
" };
1745              
1746             # search a specific field...
1747             if ( $field eq "ALL" ) {
1748             @keys = $self->getFeatureKeys();
1749             # ... or all fields
1750             } else {
1751             push @keys, $field;
1752             }
1753            
1754             for(my $i=0;$i<$self->height;$i++) {
1755              
1756             if ( $self->_search_feature( $i, $query, \@keys ) ) {
1757             push @hits, $i;
1758             }
1759              
1760             }
1761              
1762             return (@hits);
1763              
1764             }
1765              
1766              
1767             #####################################################################
1768             sub _search_feature {
1769             #####################################################################
1770             # usage: $hit = $self->_search_feature( 100, "kinase", ['ACC','NAME','SYMBOL'])
1771             ############################################################################-
1772             # this function returns true, if the feature queried contains the passed
1773             # string values(s). The parameters to this function are:
1774             # - required: the index number of the feature
1775             # - required: a search term
1776             # - optional: an array reference, containing the names of fields to search,
1777             # if not passed, all fields will be searched.
1778              
1779             my $self = shift;
1780              
1781             my $index = shift;
1782             my $query = shift;
1783              
1784             my $field_aref = shift;
1785              
1786             my $field;
1787              
1788             # search field(s) for the query
1789             foreach $field ( @{$field_aref} ) {
1790              
1791             if ( $self->feature( $index, $field ) =~ /$query/i ) {
1792              
1793             if ($dbg) { print "found $query in field $field in feature $index", "
" }
1794            
1795             return 1;
1796             }
1797             }
1798            
1799             # if we got here, nothing was found, return 'FALSE'
1800             return 0;
1801              
1802             }
1803              
1804              
1805             #####################################################################
1806             sub correlations {
1807             #####################################################################
1808             # Returns the precalculated correlation values for row . Up to
1809             # 50 correlations values > 0.5 are stored. As an example client
1810             # usage, see Explorer's/gx retrieval of those profiles correlated to
1811             # the query (user-clicked profile within zoom view).
1812              
1813              
1814              
1815              
1816             my $self = shift;
1817              
1818             my( $seed, $neighbours, $corrs ) = @_;
1819              
1820             my $corr_file = $self->datapath.$self->fileBaseName.$kBinCorrSuffix;
1821              
1822             &_get_correlations(
1823             $corr_file,
1824             $seed,
1825             $neighbours,
1826             $corrs
1827             );
1828            
1829            
1830             }
1831              
1832              
1833             #####################################################################
1834             sub _get_correlations {
1835             #####################################################################
1836             # required for Explorer to retrieve those profiles highly correlated
1837             # to the query (user-clicked profile within zoom view)
1838              
1839              
1840             my (
1841             $file,
1842             $startVector,
1843             $cVec_aref,
1844             $cCor_aref
1845             ) = @_;
1846              
1847             my ($tmp, $numRows, $indexSize, $index, $vecOffset, $vector);
1848             my (@indices, @vecOffsets, @vecSizes);
1849              
1850             open (IN, $file) || die "cannot open _correlations_ $file: $!\n";
1851              
1852             # read the first word, indicating the number of rows
1853             # contained in the body
1854             read(IN, $tmp, 2);
1855              
1856             $numRows = unpack("n", $tmp);
1857              
1858             # if the function is only being called with the filename, return the number of correlations
1859             if ($startVector eq "") {
1860             return $numRows;
1861             }
1862              
1863             if ( ($startVector < 0) || ($startVector > $numRows-1) ) {
1864             die "Error: loadCompressedCorrelations: start vector ($startVector) wrong size\n";
1865             }
1866              
1867             $indexSize = $numRows * 2; # the index that follows in the head is numRows words (= 2 bytes)
1868              
1869             # after this value, an index containing the same number of words,
1870             # each indicating the number of correlation values stored per row, follows
1871             # reading from the current filepointer is equivalent to setting it with seek(IN, 2, 0);
1872             read(IN, $index, $indexSize );
1873              
1874             @indices = unpack("n*", $index);
1875              
1876             # initialize the first offset outside loop
1877             $vecOffsets[0] = 0;
1878             $vecSizes[0] = $indices[0] * 4;
1879             $vecOffset = $vecSizes[0];
1880              
1881             # build the offset size for all entries in body, since the index only contains the number
1882             # of correlations for the individual gene. We need cumulative offset values
1883             for (my $j=1;$j<=$#indices;$j++) {
1884              
1885             my $vecSize = $indices[$j] * 4;
1886             $vecSizes[$j] = $vecSize;
1887             $vecOffset += $vecSize; # number of reference/correlation pairs, each pair is two words ( = 4 bytes total)
1888             $vecOffsets[$j+1] = $vecOffset;
1889              
1890             }
1891              
1892             seek(IN, $vecOffsets[$startVector], 1); # go to position for this vectors correlation data in file
1893             read(IN, $vector, $vecSizes[$startVector]);
1894              
1895             my @values = unpack("n*", $vector);
1896              
1897             my $iter = 0;
1898             for (my $k=0; $k<=$#values; $k++) {
1899             if (!($k % 2)) {
1900             $cVec_aref->[$iter] = $values[$k];
1901             } else {
1902             $cCor_aref->[$iter] = sprintf("%.4f", $values[$k] / 65535);
1903             $iter++;
1904             }
1905             }
1906              
1907             close(IN);
1908              
1909             }
1910              
1911              
1912             ######################################################################
1913             #sub size {
1914             ######################################################################
1915             ## This method returns the size of the dataset. It is based on the
1916             ## assumption (which is currently wrong) that all file in the dataset
1917             ## start with the dataset name.
1918             #
1919             # my $self = shift;
1920             #
1921             # if ( exists($self->{'SIZE'}) ) {
1922             #
1923             # return $self->{'SIZE'};
1924             #
1925             # } else {
1926             #
1927             # my $stem = $self->datapath().$self->fileBaseName.".*";
1928             #
1929             # my @files = glob($stem);
1930             #
1931             # my $size;
1932             #
1933             # for (@files) {
1934             # $size += (-s $_);
1935             # }
1936             #
1937             # $self->{'SIZE'} = $size;
1938             #
1939             # return $self->{'SIZE'};
1940             #
1941             # }
1942             #
1943             #}
1944              
1945             1; # to make perl happy
1946              
1947              
1948             __END__