diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm index cbc67d808..3d41b5a4b 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm @@ -48,6 +48,13 @@ sub default_table_name { } +sub overflow_limit { + return { + 'key_signature' => 255, + }; +} + + sub fetch_structures_for_job_ids { my ($self, $job_ids_csv, $id_scale, $id_offset) = @_; $id_scale ||= 1; @@ -63,6 +70,9 @@ sub fetch_structures_for_job_ids { ROW: while(my ($receiving_job_id, $struct_name, $key_signature, $stringified_value) = $sth->fetchrow_array() ) { + ($key_signature, $struct_name) = map {$self->check_and_dereference_analysis_data($_)} + ($key_signature, $struct_name); + my $value = destringify($stringified_value); my $sptr = \$structures{$receiving_job_id * $id_scale + $id_offset}{$struct_name}; diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm index 0e1505c97..65bcb637c 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm @@ -53,6 +53,11 @@ sub default_insertion_method { return 'INSERT_IGNORE'; } +sub overflow_limit { + return { + 'condition_analysis_url' => 255, + }; +} sub object_class { return 'Bio::EnsEMBL::Hive::AnalysisCtrlRule'; diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm index 18b07a5f2..3b9368f0d 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm @@ -700,9 +700,7 @@ sub fetch_input_ids_for_job_ids { $sth->execute(); while(my ($job_id, $input_id) = $sth->fetchrow_array() ) { - if($input_id =~ /^_ext(?:\w+)_data_id (\d+)$/) { - $input_id = $self->db->get_AnalysisDataAdaptor->fetch_by_analysis_data_id_TO_data($1); - } + $input_id = $self->check_and_dereference_analysis_data($input_id); $input_ids{$job_id * $id_scale + $id_offset} = $input_id; } } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm index 78eebb10d..fc853a1a4 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm @@ -40,6 +40,7 @@ use strict; use warnings; no strict 'refs'; # needed to allow AUTOLOAD create new methods use DBI 1.6; # the 1.6 functionality is important for detecting autoincrement fields and other magic. +use Scalar::Util ('blessed'); use Bio::EnsEMBL::Hive::Utils ('stringify', 'throw'); @@ -138,6 +139,26 @@ sub overflow_limit { return $self->{_overflow_limit} || $self->default_overflow_limit(); } +=head2 size_limit + +Arg[1] : (optional) Hashref with column names as keys and column size as values +Description : Getter/setter for column size limits for the implementing adaptor. + If no size limit is set, this method will call _table_info_loader() to + get sizes from the database schema. +Returntype : Hashref + +=cut + +sub size_limit { + my $self = shift @_; + + if(@_) { # setter + $self->{_size_limit} = shift @_; + } elsif ( !defined( $self->{_size_limit} ) ) { + $self->_table_info_loader(); + } + return $self->{_size_limit}; +} sub input_column_mapping { my $self = shift @_; @@ -229,18 +250,27 @@ sub _table_info_loader { my $table_name = $self->table_name(); my %column_set = (); + my %size_limit = (); my $autoinc_id = ''; my @primary_key = $dbh->primary_key(undef, undef, $table_name); my $sth = $dbh->column_info(undef, undef, $table_name, '%'); $sth->execute(); while (my $row = $sth->fetchrow_hashref()) { - my ( $column_name, $column_type ) = @$row{'COLUMN_NAME', 'TYPE_NAME'}; + my ( $column_name, $column_type, $size_limit ) = @$row{'COLUMN_NAME', 'TYPE_NAME', 'COLUMN_SIZE'}; # warn "ColumnInfo [$table_name/$column_name] = $column_type\n"; $column_set{$column_name} = $column_type; + # PostgreSQL reports a COLUMN_SIZE of 4 for enums, which is not compatible with + # the way slicer does column size checking. Likewise, PostgreSQL reports + # a user-defined TYPE_NAME for enums, rather than 'enum'. Therefore, if + # the DB is PostgreSQL, only set size_limit for varchars + unless (($driver eq 'pgsql') && !($column_type eq 'character varying')) { + $size_limit{$column_name} = $size_limit; + } + if( ($column_name eq $table_name.'_id') or ($table_name eq 'analysis_base' and $column_name eq 'analysis_id') ) { # a special case (historical) $autoinc_id = $column_name; @@ -249,6 +279,7 @@ sub _table_info_loader { $sth->finish; $self->column_set( \%column_set ); + $self->size_limit( \%size_limit ); $self->primary_key( \@primary_key ); $self->autoinc_id( $autoinc_id ); } @@ -303,9 +334,7 @@ sub fetch_all { while(my $hashref = $sth->fetchrow_hashref) { foreach my $overflow_key (@overflow_columns) { - if($hashref->{$overflow_key} =~ /^_ext(?:\w+)_data_id (\d+)$/) { - $hashref->{$overflow_key} = $overflow_adaptor->fetch_by_analysis_data_id_TO_data($1); - } + $hashref->{$overflow_key} = $self->check_and_dereference_analysis_data($hashref->{$overflow_key}); } my $pptr = \$result_struct; @@ -530,6 +559,79 @@ sub store { } +=head2 check_and_dereference_analysis_data + + Arg [1] : string data that may reference an analysis_data_id + Usage : my $value = $self->check_and_dereference_analysis_data($$fetched_row[0]) + Description : Checks to see if the passed value matches the regular expression + : /^_ext(?:\w+)_data_id (\d+)$/ (e.g. "external_data_id 3", pointer to an analysis_data_id). + : If so, it returns the entry from the "data" column in analysis_data from the + : row containing the given analysis_data_id. + : If not, it returns the passed parameter unchanged. + Returntype : string + +=cut + +sub check_and_dereference_analysis_data { + my $self = shift @_; + my $possible_analysis_data_id_ref = shift @_; + + if ($possible_analysis_data_id_ref =~ /^_ext(?:\w+)_data_id (\d+)$/) { + return $self->db->get_AnalysisDataAdaptor->fetch_by_analysis_data_id_TO_data($1); + } else { + return $possible_analysis_data_id_ref; + } +} + +sub slicer { + my ($self, $sliceable, $fields) = @_; + + my $is_object = blessed($sliceable) ? 1 : 0; + + my $autoinc_id; + if ($is_object) { + $autoinc_id = $self->autoinc_id(); + } + my $overflow_limit = $self->overflow_limit(); + my $size_limit = $self->size_limit(); + + my @slice; + # keep track of any values needing overflow, so that overflow can + # be deferred until after checking all fields for size limit violations + my %needs_overflow; + + for (my $i = 0; $i <= $#{$fields}; $i++) { + my $field = $fields->[$i]; + my $value = $is_object ? $sliceable->$field() : $sliceable->{$field}; + my $ol = $overflow_limit->{$field}; + my $sl = $size_limit->{$field}; + + if ($is_object && $field eq $autoinc_id) { + $slice[$i] = $sliceable->dbID(); + } elsif (defined($ol) and defined($value) and (length($value) > $ol)) { + # if overflow limit exists for this field, we can ignore + # any size limit since an excessively large field will be + # handled through overflow + $needs_overflow{$i} = $value; + } elsif (defined($sl) and defined($value) and (length($value) > $sl)) { + # if no overflow limit, then check size and generate a meaningful error + # as some RDBMS implementations fail silently or misleadingly when trying + # to store an oversize value + throw("length of value for column \"" . $field . + "\" exceeds the maximum size, which is $sl"); + } else { + $slice[$i] = $value; + } + } + + foreach my $fields_index (keys(%needs_overflow)) { + $slice[$fields_index] = + $self->db->get_AnalysisDataAdaptor()->store_if_needed($needs_overflow{$fields_index}); + } + + return \@slice; +} + sub DESTROY { } # to simplify AUTOLOAD sub AUTOLOAD { diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm index 28b77077e..9d9526441 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm @@ -55,6 +55,13 @@ sub default_insertion_method { } +sub overflow_limit { + return { + 'to_analysis_url' => 255, + }; +} + + sub object_class { return 'Bio::EnsEMBL::Hive::DataflowRule'; } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm index 34fe209ba..307512803 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm @@ -44,13 +44,6 @@ use Bio::EnsEMBL::Hive::NakedTable; use base ('Bio::EnsEMBL::Hive::DBSQL::BaseAdaptor'); -sub slicer { # take a slice of the hashref (if only we could inline in Perl!) - my ($self, $hashref, $fields) = @_; - - return [ @$hashref{@$fields} ]; -} - - sub objectify { # pretend the hashref becomes an object (if only we could inline in Perl!) return pop @_; } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm index cabdfd7f4..f833f47bf 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm @@ -47,25 +47,6 @@ sub object_class { die "Please define object_class() in your specific adaptor class to return the class name of your intended object"; } - -sub slicer { # take a slice of the object (if only we could inline in Perl!) - my ($self, $object, $fields) = @_; - - my $autoinc_id = $self->autoinc_id(); - my $overflow_limit = $self->overflow_limit(); - - return [ map { ($_ eq $autoinc_id) - ? $object->dbID() - : eval { my $value = $object->$_(); - my $ol = $overflow_limit->{$_}; - (defined($ol) and length($value)>$ol) - ? $self->db->get_AnalysisDataAdaptor()->store_if_needed( $value ) - : $value - } - } @$fields ]; -} - - sub objectify { # turn the hashref into an object (if only we could inline in Perl!) my ($self, $hashref) = @_; diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm index f0bc1fa96..57418a09d 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm @@ -55,6 +55,14 @@ sub default_insertion_method { } +sub overflow_limit { + return { + 'submission_cmd_args' => 255, + 'worker_cmd_args' => 255, + }; +} + + sub object_class { return 'Bio::EnsEMBL::Hive::ResourceDescription'; } diff --git a/t/02.api/fetch_and_count_by_multiple_columns.t b/t/02.api/fetch_and_count_by_multiple_columns.t index 327dbdb3a..12ebceb42 100755 --- a/t/02.api/fetch_and_count_by_multiple_columns.t +++ b/t/02.api/fetch_and_count_by_multiple_columns.t @@ -21,7 +21,7 @@ use warnings; use Data::Dumper; use File::Temp qw{tempdir}; -use Test::More tests => 16; +use Test::More tests => 11; use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor; @@ -40,9 +40,7 @@ my $pipeline_url = 'sqlite:///ehive_test_pipeline_db'; my $hive_dba = init_pipeline('Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf', $pipeline_url, [-hive_force_init => 1]); my $ana_a = $hive_dba->get_AnalysisAdaptor; -my $job_a = $hive_dba->get_AnalysisJobAdaptor; my $dfr_a = $hive_dba->get_DataflowRuleAdaptor; -my $ada_a = $hive_dba->get_AnalysisDataAdaptor; is($ana_a->count_all(), 3, 'There are 3 analyses in the pipeline'); is($ana_a->count_all_by_logic_name('take_b_apart'), 1, 'But only 1 "take_b_apart"'); @@ -62,30 +60,6 @@ is($matching_analyses->[0]->to_analysis_url, 'part_multiply', 'Correct target lo is($dfr_a->count_all_by_branch_code(1), 3, 'There are 2 #1 branches in the pipeline'); -my $long_input_id = sprintf('{ "long_param" => "%s" }', 'tmp' x 1000); -my $new_job = Bio::EnsEMBL::Hive::AnalysisJob->new( - 'input_id' => $long_input_id, - 'analysis_id' => 1, -); - -# Test the overflow to the analysis_data table -is($ada_a->count_all(), 0, "Nothing in the analysis_data table (yet)"); - -$job_a->store($new_job); -is($ada_a->count_all(), 1, "1 entry in the analysis_data table"); - -is($ada_a->fetch_by_data_TO_analysis_data_id('unmatched input_id'), undef, 'fetch_by_data_to_analysis_data_id() returns undef when it cannot find the input_id'); -my $ext_data_id = $ada_a->fetch_by_data_TO_analysis_data_id($long_input_id); -is($ext_data_id, 1, 'analysis_data_id starts at 1'); - -my $another_job = Bio::EnsEMBL::Hive::AnalysisJob->new( - 'input_id' => $long_input_id, - 'analysis_id' => 2, -); - -$job_a->store($another_job); -is($ada_a->count_all(), 1, "still 1 entry in the analysis_data table"); - done_testing(); chdir $original; diff --git a/t/02.api/overflow.t b/t/02.api/overflow.t new file mode 100755 index 000000000..259e96541 --- /dev/null +++ b/t/02.api/overflow.t @@ -0,0 +1,204 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2019] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +use strict; +use warnings; + +use Data::Dumper; +use File::Temp qw{tempdir}; + +use Test::More tests => 20; +use Test::Exception; + +use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Hive::ResourceClass; + +use Bio::EnsEMBL::Hive::Utils::Test qw(init_pipeline); + +# eHive needs this to initialize the pipeline (and run db_cmd.pl) +use Cwd (); +use File::Basename (); +$ENV{'EHIVE_ROOT_DIR'} ||= File::Basename::dirname( File::Basename::dirname( File::Basename::dirname( Cwd::realpath($0) ) ) ); + +my $dir = tempdir CLEANUP => 1; +chdir $dir; + +my $pipeline_url = 'sqlite:///ehive_test_pipeline_db'; + +my $hive_dba = init_pipeline('Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf', $pipeline_url, [-hive_force_init => 1]); + +my $job_a = $hive_dba->get_AnalysisJobAdaptor; +my $rcl_a = $hive_dba->get_ResourceClassAdaptor; +my $rde_a = $hive_dba->get_ResourceDescriptionAdaptor; +my $dfr_a = $hive_dba->get_DataflowRuleAdaptor; +my $ada_a = $hive_dba->get_AnalysisDataAdaptor; +my $acu_a = $hive_dba->get_AccumulatorAdaptor; +my $acr_a = $hive_dba->get_AnalysisCtrlRuleAdaptor; +my $ana_a = $hive_dba->get_AnalysisAdaptor; + +my $long_input_id = sprintf('{ "long_param" => "%s" }', 'tmp' x 1000); +my $new_job = Bio::EnsEMBL::Hive::AnalysisJob->new( + 'input_id' => $long_input_id, + 'analysis_id' => 1, +); + +# Test the overflow into the analysis_data table +# Test overflow for input_id +is($ada_a->count_all(), 0, "Nothing in the analysis_data table (yet)"); + +$job_a->store($new_job); +is($ada_a->count_all(), 1, "1 entry in the analysis_data table"); + +is($ada_a->fetch_by_data_TO_analysis_data_id('unmatched input_id'), undef, 'fetch_by_data_to_analysis_data_id() returns undef when it cannot find the input_id'); +my $ext_data_id = $ada_a->fetch_by_data_TO_analysis_data_id($long_input_id); +is($ext_data_id, 1, 'analysis_data_id starts at 1'); + +my $fan_job = Bio::EnsEMBL::Hive::AnalysisJob->new( + 'input_id' => $long_input_id, + 'analysis_id' => 2, +); + +$job_a->store($fan_job); +is($ada_a->count_all(), 1, "still 1 entry in the analysis_data table"); + +# Test overflow for resource description args + +Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->init_collections(); +my $new_rc = Bio::EnsEMBL::Hive::ResourceClass->add_new_or_update( + 'name' => 'testresourceclass', +); + +my $long_sca = 'sc' x 129; +my $long_wca = 'wc' x 129; +my $new_rd = Bio::EnsEMBL::Hive::ResourceDescription->add_new_or_update( + 'resource_class' => $new_rc, + 'meadow_type' => 'test_meadow', + 'submission_cmd_args' => $long_sca, + 'worker_cmd_args' => $long_wca, +); + +$rcl_a->store($new_rc); +$rde_a->store($new_rd); +is($ada_a->count_all(), 3, "New resource description overflowed two entries to analysis_data, total 3"); + +# Test overflow for to_analysis_urls + +my $long_struct_name = 'ta' x 129; +my $long_to_analysis_url = ':////accu?' . $long_struct_name; +my $new_dfr = Bio::EnsEMBL::Hive::DataflowRule->add_new_or_update( + 'from_analysis' => $ana_a->fetch_by_dbID(1), + 'to_analysis_url' => $long_to_analysis_url, + 'branch_code' => 3, +); + +$dfr_a->store($new_dfr); +is($ada_a->count_all(), 4, "New to_analysis_url overflowed an entry to analysis_data, total 4"); + +# Test overflow for condition analysis urls + +my $long_cau = 'cau' x 86; +my $ctrled_analysis_id = 1; +my $new_acr = Bio::EnsEMBL::Hive::AnalysisCtrlRule->add_new_or_update( + 'condition_analysis_url' => $long_cau, + 'ctrled_analysis' => $ana_a->fetch_by_dbID($ctrled_analysis_id), +); + +$acr_a->store($new_acr); +is($ada_a->count_all(), 5, "New condition_analysis_url overflowed an entry to analysis_data, total 5"); + +# Test overflow for accu key_signatures +# Note: AccumulatorAdaptor will complain if storing an accu without a proper fan job +# and semaphored funnel job + +my $accu_funnel_job = Bio::EnsEMBL::Hive::AnalysisJob->new( + 'input_id' => {}, + 'analysis_id' => 3, +); +$job_a->store($accu_funnel_job); + +my $accu_fan_job = Bio::EnsEMBL::Hive::AnalysisJob->new( + 'input_id' => {}, + 'analysis_id' => 2, + 'semaphored_job_id' => $accu_funnel_job->dbID, +); +$job_a->store($accu_fan_job); + +my $new_accu = Bio::EnsEMBL::Hive::Accumulator->new( + adaptor => $acu_a, + struct_name => $long_struct_name, + signature_template => '{key}', +); + +my $long_key_signature = 'ks' x 129; +my $long_output_id = [ { 'key' => $long_key_signature, + $long_struct_name => 1, } ]; + +throws_ok { $new_accu->dataflow( + $long_output_id, + $accu_fan_job,) + } qr/length of value/, + "exception to insert of oversize struct name"; + +is($ada_a->count_all(), 5, "Oversize struct name not inserted"); + +my $short_struct_name = 'ssn'; + +my $sslk_accu = Bio::EnsEMBL::Hive::Accumulator->new( + adaptor => $acu_a, + struct_name => $short_struct_name, + signature_template => '{key}', +); + +my $sslk_output_id = [ { 'key' => $long_key_signature, + 'ssn' => 1, } ]; + +$sslk_accu->dataflow( + $sslk_output_id, + $accu_fan_job,); + +is($ada_a->count_all(), 6, "Oversize key signature overflow"); + + +# Test retrieval of overflow data + +my $fetched_rds = $rde_a->fetch_all(); +my $rd_with_long_args; +foreach my $fetched_rd (@$fetched_rds) { + if ($fetched_rd->resource_class_id() == $new_rc->dbID) { + $rd_with_long_args = $fetched_rd; + } +} + +is($rd_with_long_args->submission_cmd_args, $long_sca, "Retrieved long submission_cmd_args"); +is($rd_with_long_args->worker_cmd_args, $long_wca, "Retrieved long worker_cmd_args"); + +my $fetched_dfr = $dfr_a->fetch_by_dbID($new_dfr->dbID); +is ($fetched_dfr->to_analysis_url, $long_to_analysis_url, "Retrieved long to_analysis_url"); + +my $fetched_acr = $acr_a->fetch_by_ctrled_analysis_id($ctrled_analysis_id); +is ($fetched_acr->condition_analysis_url, $long_cau, "Retrieved long condition_analysis_url"); + +# $fetched_accu_structures->{$receiving_job_id}->{$struct_name}->{$key_signature} = value +my $fetched_accu_structures = $acu_a->fetch_structures_for_job_ids($accu_funnel_job->dbID); +my $fetched_accu_hash = $fetched_accu_structures->{$accu_funnel_job->dbID}; +my $fetched_struct_name = (keys(%$fetched_accu_hash))[0]; +my $fetched_key_signature = (keys(%{$fetched_accu_hash->{$fetched_struct_name}}))[0]; + +is ($fetched_struct_name, $short_struct_name, "fetched short struct_name from accu"); +is ($fetched_key_signature, $long_key_signature, "fetched long key_signature from accu"); + +done_testing();