From 1e85f0bcfa55403e4ef31a016e200e2aeed990b4 Mon Sep 17 00:00:00 2001 From: ens-bwalts Date: Tue, 24 Jul 2018 17:30:29 +0100 Subject: [PATCH 1/5] enable overflow for accu keys, to_analysis_urls, and submission_cmd_args --- .../EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm | 29 +++++++++++++++++++ modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm | 25 ++++++++++++++++ .../EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm | 7 +++++ .../Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm | 2 +- .../Hive/DBSQL/ResourceDescriptionAdaptor.pm | 7 +++++ 5 files changed, 69 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm index cbc67d808..461bb6b9e 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm @@ -48,6 +48,32 @@ sub default_table_name { } +sub slicer { # take a slice of the hashref (if only we could inline in Perl!) + my ($self, $hashref, $fields) = @_; + + my $overflow_limit = $self->overflow_limit(); + + return [ map { eval { my $value = $hashref->{$_}; + my $ol = $overflow_limit->{$_}; + if (defined($ol) and defined($value) and (length($value) > $ol)) { + $self->db->get_AnalysisDataAdaptor()->store_if_needed($value); + } else { + $value; + } + } + + + } @$fields ]; +} + +sub overflow_limit { + return { + 'key_signature' => 255, + 'struct_name' => 255, + }; +} + + sub fetch_structures_for_job_ids { my ($self, $job_ids_csv, $id_scale, $id_offset) = @_; $id_scale ||= 1; @@ -63,6 +89,9 @@ sub fetch_structures_for_job_ids { ROW: while(my ($receiving_job_id, $struct_name, $key_signature, $stringified_value) = $sth->fetchrow_array() ) { + ($key_signature, $struct_name) = map {$self->check_and_dereference_analysis_data($_)} + ($key_signature, $struct_name); + my $value = destringify($stringified_value); my $sptr = \$structures{$receiving_job_id * $id_scale + $id_offset}{$struct_name}; diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm index 78eebb10d..81191cd5b 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm @@ -530,6 +530,31 @@ sub store { } +=head2 check_and_dereference_analysis_data + + Arg [1] : string data that may reference an analysis_data_id + Usage : my $value = $self->check_and_dereference_analysis_data($$fetched_row[0]) + Description : Checks to see if the passed value matches the regular expression + : /^_ext(?:\w+)_data_id (\d+)$/ (e.g. "external_data_id 3", pointer to an analysis_data_id). + : If so, it returns the entry from the "data" column in analysis_data from the + : row containing the given analysis_data_id. + : If not, it returns the passed parameter unchanged. + Returntype : string + +=cut + +sub check_and_dereference_analysis_data { + my $self = shift @_; + my $possible_analysis_data_id_ref = shift @_; + + if ($possible_analysis_data_id_ref =~ /^_ext(?:\w+)_data_id (\d+)$/) { + return $self->db->get_AnalysisDataAdaptor->fetch_by_analysis_data_id_TO_data($1); + } else { + return $possible_analysis_data_id_ref; + } +} + + sub DESTROY { } # to simplify AUTOLOAD sub AUTOLOAD { diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm index 28b77077e..9d9526441 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm @@ -55,6 +55,13 @@ sub default_insertion_method { } +sub overflow_limit { + return { + 'to_analysis_url' => 255, + }; +} + + sub object_class { return 'Bio::EnsEMBL::Hive::DataflowRule'; } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm index cabdfd7f4..b3126e86f 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm @@ -58,7 +58,7 @@ sub slicer { # take a slice of the object (if only we could inline in Perl!) ? $object->dbID() : eval { my $value = $object->$_(); my $ol = $overflow_limit->{$_}; - (defined($ol) and length($value)>$ol) + (defined($ol) and defined($value) and length($value)>$ol) ? $self->db->get_AnalysisDataAdaptor()->store_if_needed( $value ) : $value } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm index f0bc1fa96..888ad1853 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm @@ -55,6 +55,13 @@ sub default_insertion_method { } +sub overflow_limit { + return { + 'submission_cmd_args' => 255, + }; +} + + sub object_class { return 'Bio::EnsEMBL::Hive::ResourceDescription'; } From 48b3e5836502c0f407c7eb99d3f3048878b16229 Mon Sep 17 00:00:00 2001 From: ens-bwalts Date: Fri, 24 Aug 2018 17:54:04 +0100 Subject: [PATCH 2/5] improvements to overflow handling --- .../EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm | 18 -- .../Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm | 5 + .../EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm | 4 +- modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm | 4 +- .../EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm | 13 +- .../Hive/DBSQL/ResourceDescriptionAdaptor.pm | 1 + .../fetch_and_count_by_multiple_columns.t | 28 +-- t/02.api/overflow.t | 183 ++++++++++++++++++ 8 files changed, 204 insertions(+), 52 deletions(-) create mode 100755 t/02.api/overflow.t diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm index 461bb6b9e..c342d6658 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm @@ -48,24 +48,6 @@ sub default_table_name { } -sub slicer { # take a slice of the hashref (if only we could inline in Perl!) - my ($self, $hashref, $fields) = @_; - - my $overflow_limit = $self->overflow_limit(); - - return [ map { eval { my $value = $hashref->{$_}; - my $ol = $overflow_limit->{$_}; - if (defined($ol) and defined($value) and (length($value) > $ol)) { - $self->db->get_AnalysisDataAdaptor()->store_if_needed($value); - } else { - $value; - } - } - - - } @$fields ]; -} - sub overflow_limit { return { 'key_signature' => 255, diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm index 0e1505c97..65bcb637c 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm @@ -53,6 +53,11 @@ sub default_insertion_method { return 'INSERT_IGNORE'; } +sub overflow_limit { + return { + 'condition_analysis_url' => 255, + }; +} sub object_class { return 'Bio::EnsEMBL::Hive::AnalysisCtrlRule'; diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm index 18b07a5f2..3b9368f0d 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm @@ -700,9 +700,7 @@ sub fetch_input_ids_for_job_ids { $sth->execute(); while(my ($job_id, $input_id) = $sth->fetchrow_array() ) { - if($input_id =~ /^_ext(?:\w+)_data_id (\d+)$/) { - $input_id = $self->db->get_AnalysisDataAdaptor->fetch_by_analysis_data_id_TO_data($1); - } + $input_id = $self->check_and_dereference_analysis_data($input_id); $input_ids{$job_id * $id_scale + $id_offset} = $input_id; } } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm index 81191cd5b..c991e79fb 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm @@ -303,9 +303,7 @@ sub fetch_all { while(my $hashref = $sth->fetchrow_hashref) { foreach my $overflow_key (@overflow_columns) { - if($hashref->{$overflow_key} =~ /^_ext(?:\w+)_data_id (\d+)$/) { - $hashref->{$overflow_key} = $overflow_adaptor->fetch_by_analysis_data_id_TO_data($1); - } + $hashref->{$overflow_key} = $self->check_and_dereference_analysis_data($hashref->{$overflow_key}); } my $pptr = \$result_struct; diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm index 34fe209ba..48e602d74 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm @@ -47,7 +47,18 @@ use base ('Bio::EnsEMBL::Hive::DBSQL::BaseAdaptor'); sub slicer { # take a slice of the hashref (if only we could inline in Perl!) my ($self, $hashref, $fields) = @_; - return [ @$hashref{@$fields} ]; + my $overflow_limit = $self->overflow_limit(); + + return [ map { eval { my $value = $hashref->{$_}; + my $ol = $overflow_limit->{$_}; + if (defined($ol) and defined($value) and (length($value) > $ol)) { + $self->db->get_AnalysisDataAdaptor()->store_if_needed($value); + } else { + $value; + } + } + + } @$fields ]; } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm index 888ad1853..57418a09d 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm @@ -58,6 +58,7 @@ sub default_insertion_method { sub overflow_limit { return { 'submission_cmd_args' => 255, + 'worker_cmd_args' => 255, }; } diff --git a/t/02.api/fetch_and_count_by_multiple_columns.t b/t/02.api/fetch_and_count_by_multiple_columns.t index 327dbdb3a..12ebceb42 100755 --- a/t/02.api/fetch_and_count_by_multiple_columns.t +++ b/t/02.api/fetch_and_count_by_multiple_columns.t @@ -21,7 +21,7 @@ use warnings; use Data::Dumper; use File::Temp qw{tempdir}; -use Test::More tests => 16; +use Test::More tests => 11; use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor; @@ -40,9 +40,7 @@ my $pipeline_url = 'sqlite:///ehive_test_pipeline_db'; my $hive_dba = init_pipeline('Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf', $pipeline_url, [-hive_force_init => 1]); my $ana_a = $hive_dba->get_AnalysisAdaptor; -my $job_a = $hive_dba->get_AnalysisJobAdaptor; my $dfr_a = $hive_dba->get_DataflowRuleAdaptor; -my $ada_a = $hive_dba->get_AnalysisDataAdaptor; is($ana_a->count_all(), 3, 'There are 3 analyses in the pipeline'); is($ana_a->count_all_by_logic_name('take_b_apart'), 1, 'But only 1 "take_b_apart"'); @@ -62,30 +60,6 @@ is($matching_analyses->[0]->to_analysis_url, 'part_multiply', 'Correct target lo is($dfr_a->count_all_by_branch_code(1), 3, 'There are 2 #1 branches in the pipeline'); -my $long_input_id = sprintf('{ "long_param" => "%s" }', 'tmp' x 1000); -my $new_job = Bio::EnsEMBL::Hive::AnalysisJob->new( - 'input_id' => $long_input_id, - 'analysis_id' => 1, -); - -# Test the overflow to the analysis_data table -is($ada_a->count_all(), 0, "Nothing in the analysis_data table (yet)"); - -$job_a->store($new_job); -is($ada_a->count_all(), 1, "1 entry in the analysis_data table"); - -is($ada_a->fetch_by_data_TO_analysis_data_id('unmatched input_id'), undef, 'fetch_by_data_to_analysis_data_id() returns undef when it cannot find the input_id'); -my $ext_data_id = $ada_a->fetch_by_data_TO_analysis_data_id($long_input_id); -is($ext_data_id, 1, 'analysis_data_id starts at 1'); - -my $another_job = Bio::EnsEMBL::Hive::AnalysisJob->new( - 'input_id' => $long_input_id, - 'analysis_id' => 2, -); - -$job_a->store($another_job); -is($ada_a->count_all(), 1, "still 1 entry in the analysis_data table"); - done_testing(); chdir $original; diff --git a/t/02.api/overflow.t b/t/02.api/overflow.t new file mode 100755 index 000000000..7382abd09 --- /dev/null +++ b/t/02.api/overflow.t @@ -0,0 +1,183 @@ +#!/usr/bin/env perl +# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +# Copyright [2016-2018] EMBL-European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +use strict; +use warnings; + +use Data::Dumper; +use File::Temp qw{tempdir}; + +use Test::More tests => 18; + +use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Hive::ResourceClass; + +use Bio::EnsEMBL::Hive::Utils::Test qw(init_pipeline); + +# eHive needs this to initialize the pipeline (and run db_cmd.pl) +use Cwd (); +use File::Basename (); +$ENV{'EHIVE_ROOT_DIR'} ||= File::Basename::dirname( File::Basename::dirname( File::Basename::dirname( Cwd::realpath($0) ) ) ); + +my $dir = tempdir CLEANUP => 1; +chdir $dir; + +my $pipeline_url = 'sqlite:///ehive_test_pipeline_db'; + +my $hive_dba = init_pipeline('Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf', $pipeline_url, [-hive_force_init => 1]); + +my $job_a = $hive_dba->get_AnalysisJobAdaptor; +my $rcl_a = $hive_dba->get_ResourceClassAdaptor; +my $rde_a = $hive_dba->get_ResourceDescriptionAdaptor; +my $dfr_a = $hive_dba->get_DataflowRuleAdaptor; +my $ada_a = $hive_dba->get_AnalysisDataAdaptor; +my $acu_a = $hive_dba->get_AccumulatorAdaptor; +my $acr_a = $hive_dba->get_AnalysisCtrlRuleAdaptor; +my $ana_a = $hive_dba->get_AnalysisAdaptor; + +my $long_input_id = sprintf('{ "long_param" => "%s" }', 'tmp' x 1000); +my $new_job = Bio::EnsEMBL::Hive::AnalysisJob->new( + 'input_id' => $long_input_id, + 'analysis_id' => 1, +); + +# Test the overflow into the analysis_data table +# Test overflow for input_id +is($ada_a->count_all(), 0, "Nothing in the analysis_data table (yet)"); + +$job_a->store($new_job); +is($ada_a->count_all(), 1, "1 entry in the analysis_data table"); + +is($ada_a->fetch_by_data_TO_analysis_data_id('unmatched input_id'), undef, 'fetch_by_data_to_analysis_data_id() returns undef when it cannot find the input_id'); +my $ext_data_id = $ada_a->fetch_by_data_TO_analysis_data_id($long_input_id); +is($ext_data_id, 1, 'analysis_data_id starts at 1'); + +my $fan_job = Bio::EnsEMBL::Hive::AnalysisJob->new( + 'input_id' => $long_input_id, + 'analysis_id' => 2, +); + +$job_a->store($fan_job); +is($ada_a->count_all(), 1, "still 1 entry in the analysis_data table"); + +# Test overflow for resource description args + +Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->init_collections(); +my $new_rc = Bio::EnsEMBL::Hive::ResourceClass->add_new_or_update( + 'name' => 'testresourceclass', +); + +my $long_sca = 'sc' x 129; +my $long_wca = 'wc' x 129; +my $new_rd = Bio::EnsEMBL::Hive::ResourceDescription->add_new_or_update( + 'resource_class' => $new_rc, + 'meadow_type' => 'test_meadow', + 'submission_cmd_args' => $long_sca, + 'worker_cmd_args' => $long_wca, +); + +$rcl_a->store($new_rc); +$rde_a->store($new_rd); +is($ada_a->count_all(), 3, "New resource description overflowed two entries to analysis_data, total 3"); + +# Test overflow for to_analysis_urls + +my $long_struct_name = 'ta' x 129; +my $long_to_analysis_url = ':////accu?' . $long_struct_name; +my $new_dfr = Bio::EnsEMBL::Hive::DataflowRule->add_new_or_update( + 'from_analysis' => $ana_a->fetch_by_dbID(1), + 'to_analysis_url' => $long_to_analysis_url, + 'branch_code' => 3, +); + +$dfr_a->store($new_dfr); +is($ada_a->count_all(), 4, "New to_analysis_url overflowed an entry to analysis_data, total 4"); + +# Test overflow for condition analysis urls + +my $long_cau = 'cau' x 86; +my $ctrled_analysis_id = 1; +my $new_acr = Bio::EnsEMBL::Hive::AnalysisCtrlRule->add_new_or_update( + 'condition_analysis_url' => $long_cau, + 'ctrled_analysis' => $ana_a->fetch_by_dbID($ctrled_analysis_id), +); + +$acr_a->store($new_acr); +is($ada_a->count_all(), 5, "New condition_analysis_url overflowed an entry to analysis_data, total 5"); + +# Test overflow for accu key_signatures +# Note: AccumulatorAdaptor will complain if storing an accu without a proper fan job +# and semaphored funnel job + +my $accu_funnel_job = Bio::EnsEMBL::Hive::AnalysisJob->new( + 'input_id' => {}, + 'analysis_id' => 3, +); +$job_a->store($accu_funnel_job); + +my $accu_fan_job = Bio::EnsEMBL::Hive::AnalysisJob->new( + 'input_id' => {}, + 'analysis_id' => 2, + 'semaphored_job_id' => $accu_funnel_job->dbID, +); +$job_a->store($accu_fan_job); + +my $new_accu = Bio::EnsEMBL::Hive::Accumulator->new( + adaptor => $acu_a, + struct_name => $long_struct_name, + signature_template => '{key}', +); + +my $long_key_signature = 'ks' x 129; +my $long_output_id = [ { 'key' => $long_key_signature, + $long_struct_name => 1, } ]; +$new_accu->dataflow( + $long_output_id, + $accu_fan_job, +); + +is($ada_a->count_all(), 7, "Overflow for long struct_name and key_signature in accu"); + +# Test retrieval of overflow data + +my $fetched_rds = $rde_a->fetch_all(); +my $rd_with_long_args; +foreach my $fetched_rd (@$fetched_rds) { + if ($fetched_rd->resource_class_id() == $new_rc->dbID) { + $rd_with_long_args = $fetched_rd; + } +} + +is($rd_with_long_args->submission_cmd_args, $long_sca, "Retrieved long submission_cmd_args"); +is($rd_with_long_args->worker_cmd_args, $long_wca, "Retrieved long worker_cmd_args"); + +my $fetched_dfr = $dfr_a->fetch_by_dbID($new_dfr->dbID); +is ($fetched_dfr->to_analysis_url, $long_to_analysis_url, "Retrieved long to_analysis_url"); + +my $fetched_acr = $acr_a->fetch_by_ctrled_analysis_id($ctrled_analysis_id); +is ($fetched_acr->condition_analysis_url, $long_cau, "Retrieved long condition_analysis_url"); + +# $fetched_accu_structures->{$receiving_job_id}->{$struct_name}->{$key_signature} = value +my $fetched_accu_structures = $acu_a->fetch_structures_for_job_ids($accu_funnel_job->dbID); +my $fetched_accu_hash = $fetched_accu_structures->{$accu_funnel_job->dbID}; +my $fetched_struct_name = (keys(%$fetched_accu_hash))[0]; +my $fetched_key_signature = (keys(%$fetched_accu_hash->{$fetched_struct_name}))[0]; + +is ($fetched_struct_name, $long_struct_name, "fetched long struct_name from accu"); +is ($fetched_key_signature, $long_key_signature, "fetched long key_signature from accu"); + +done_testing(); From 1beb0a53bad9ca46dc4640da5c7f86f66de3e1fa Mon Sep 17 00:00:00 2001 From: ens-bwalts Date: Tue, 28 Aug 2018 13:34:25 +0100 Subject: [PATCH 3/5] syntax for hashref element access changes between Perl versions --- t/02.api/overflow.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/02.api/overflow.t b/t/02.api/overflow.t index 7382abd09..6b041b238 100755 --- a/t/02.api/overflow.t +++ b/t/02.api/overflow.t @@ -175,7 +175,7 @@ is ($fetched_acr->condition_analysis_url, $long_cau, "Retrieved long condition_a my $fetched_accu_structures = $acu_a->fetch_structures_for_job_ids($accu_funnel_job->dbID); my $fetched_accu_hash = $fetched_accu_structures->{$accu_funnel_job->dbID}; my $fetched_struct_name = (keys(%$fetched_accu_hash))[0]; -my $fetched_key_signature = (keys(%$fetched_accu_hash->{$fetched_struct_name}))[0]; +my $fetched_key_signature = (keys(%{$fetched_accu_hash->{$fetched_struct_name}}))[0]; is ($fetched_struct_name, $long_struct_name, "fetched long struct_name from accu"); is ($fetched_key_signature, $long_key_signature, "fetched long key_signature from accu"); From d657f9dd70abc915e797f205493cc488420796be Mon Sep 17 00:00:00 2001 From: ens-bwalts Date: Wed, 3 Apr 2019 09:38:41 +0100 Subject: [PATCH 4/5] Suggested improvements to overflow code, including moving slicer to BaseAdaptor --- .../EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm | 1 - modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm | 74 ++++++++++++++++++- .../EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm | 18 ----- .../Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm | 19 ----- t/02.api/overflow.t | 35 +++++++-- 5 files changed, 101 insertions(+), 46 deletions(-) diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm index c342d6658..3d41b5a4b 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm @@ -51,7 +51,6 @@ sub default_table_name { sub overflow_limit { return { 'key_signature' => 255, - 'struct_name' => 255, }; } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm index c991e79fb..ff59e5bb9 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm @@ -40,6 +40,7 @@ use strict; use warnings; no strict 'refs'; # needed to allow AUTOLOAD create new methods use DBI 1.6; # the 1.6 functionality is important for detecting autoincrement fields and other magic. +use Scalar::Util ('blessed'); use Bio::EnsEMBL::Hive::Utils ('stringify', 'throw'); @@ -138,6 +139,26 @@ sub overflow_limit { return $self->{_overflow_limit} || $self->default_overflow_limit(); } +=head2 size_limit + +Arg[1] : (optional) Hashref with column names as keys and column size as values +Description : Getter/setter for column size limits for the implementing adaptor. + If no size limit is set, this method will call _table_info_loader() to + get sizes from the database schema. +Returntype : Hashref + +=cut + +sub size_limit { + my $self = shift @_; + + if(@_) { # setter + $self->{_size_limit} = shift @_; + } elsif ( !defined( $self->{_size_limit} ) ) { + $self->_table_info_loader(); + } + return $self->{_size_limit}; +} sub input_column_mapping { my $self = shift @_; @@ -229,17 +250,19 @@ sub _table_info_loader { my $table_name = $self->table_name(); my %column_set = (); + my %size_limit = (); my $autoinc_id = ''; my @primary_key = $dbh->primary_key(undef, undef, $table_name); my $sth = $dbh->column_info(undef, undef, $table_name, '%'); $sth->execute(); while (my $row = $sth->fetchrow_hashref()) { - my ( $column_name, $column_type ) = @$row{'COLUMN_NAME', 'TYPE_NAME'}; + my ( $column_name, $column_type, $size_limit ) = @$row{'COLUMN_NAME', 'TYPE_NAME', 'COLUMN_SIZE'}; # warn "ColumnInfo [$table_name/$column_name] = $column_type\n"; $column_set{$column_name} = $column_type; + $size_limit{$column_name} = $size_limit; if( ($column_name eq $table_name.'_id') or ($table_name eq 'analysis_base' and $column_name eq 'analysis_id') ) { # a special case (historical) @@ -249,6 +272,7 @@ sub _table_info_loader { $sth->finish; $self->column_set( \%column_set ); + $self->size_limit( \%size_limit ); $self->primary_key( \@primary_key ); $self->autoinc_id( $autoinc_id ); } @@ -552,6 +576,54 @@ sub check_and_dereference_analysis_data { } } +sub slicer { + my ($self, $sliceable, $fields) = @_; + + my $is_object = blessed($sliceable) ? 1 : 0; + + my $autoinc_id; + if ($is_object) { + $autoinc_id = $self->autoinc_id(); + } + my $overflow_limit = $self->overflow_limit(); + my $size_limit = $self->size_limit(); + + my @slice; + # keep track of any values needing overflow, so that overflow can + # be deferred until after checking all fields for size limit violations + my %needs_overflow; + + for (my $i = 0; $i <= $#{$fields}; $i++) { + my $field = $fields->[$i]; + my $value = $is_object ? $sliceable->$field() : $sliceable->{$field}; + my $ol = $overflow_limit->{$field}; + my $sl = $size_limit->{$field}; + + if ($is_object && $field eq $autoinc_id) { + $slice[$i] = $sliceable->dbID(); + } elsif (defined($ol) and defined($value) and (length($value) > $ol)) { + # if overflow limit exists for this field, we can ignore + # any size limit since an excessively large field will be + # handled through overflow + $needs_overflow{$i} = $value; + } elsif (defined($sl) and defined($value) and (length($value) > $sl)) { + # if no overflow limit, then check size and generate a meaningful error + # as some RDBMS implementations fail silently or misleadingly when trying + # to store an oversize value + throw("length of value for column \"" . $field . + "\" exceeds the maximum size, which is $sl"); + } else { + $slice[$i] = $value; + } + } + + foreach my $fields_index (keys(%needs_overflow)) { + $slice[$fields_index] = + $self->db->get_AnalysisDataAdaptor()->store_if_needed($needs_overflow{$fields_index}); + } + + return \@slice; +} sub DESTROY { } # to simplify AUTOLOAD diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm index 48e602d74..307512803 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm @@ -44,24 +44,6 @@ use Bio::EnsEMBL::Hive::NakedTable; use base ('Bio::EnsEMBL::Hive::DBSQL::BaseAdaptor'); -sub slicer { # take a slice of the hashref (if only we could inline in Perl!) - my ($self, $hashref, $fields) = @_; - - my $overflow_limit = $self->overflow_limit(); - - return [ map { eval { my $value = $hashref->{$_}; - my $ol = $overflow_limit->{$_}; - if (defined($ol) and defined($value) and (length($value) > $ol)) { - $self->db->get_AnalysisDataAdaptor()->store_if_needed($value); - } else { - $value; - } - } - - } @$fields ]; -} - - sub objectify { # pretend the hashref becomes an object (if only we could inline in Perl!) return pop @_; } diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm index b3126e86f..f833f47bf 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm @@ -47,25 +47,6 @@ sub object_class { die "Please define object_class() in your specific adaptor class to return the class name of your intended object"; } - -sub slicer { # take a slice of the object (if only we could inline in Perl!) - my ($self, $object, $fields) = @_; - - my $autoinc_id = $self->autoinc_id(); - my $overflow_limit = $self->overflow_limit(); - - return [ map { ($_ eq $autoinc_id) - ? $object->dbID() - : eval { my $value = $object->$_(); - my $ol = $overflow_limit->{$_}; - (defined($ol) and defined($value) and length($value)>$ol) - ? $self->db->get_AnalysisDataAdaptor()->store_if_needed( $value ) - : $value - } - } @$fields ]; -} - - sub objectify { # turn the hashref into an object (if only we could inline in Perl!) my ($self, $hashref) = @_; diff --git a/t/02.api/overflow.t b/t/02.api/overflow.t index 6b041b238..259e96541 100755 --- a/t/02.api/overflow.t +++ b/t/02.api/overflow.t @@ -1,6 +1,6 @@ #!/usr/bin/env perl # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute -# Copyright [2016-2018] EMBL-European Bioinformatics Institute +# Copyright [2016-2019] EMBL-European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,7 +21,8 @@ use warnings; use Data::Dumper; use File::Temp qw{tempdir}; -use Test::More tests => 18; +use Test::More tests => 20; +use Test::Exception; use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor; use Bio::EnsEMBL::Hive::ResourceClass; @@ -145,12 +146,32 @@ my $new_accu = Bio::EnsEMBL::Hive::Accumulator->new( my $long_key_signature = 'ks' x 129; my $long_output_id = [ { 'key' => $long_key_signature, $long_struct_name => 1, } ]; -$new_accu->dataflow( - $long_output_id, - $accu_fan_job, + +throws_ok { $new_accu->dataflow( + $long_output_id, + $accu_fan_job,) + } qr/length of value/, + "exception to insert of oversize struct name"; + +is($ada_a->count_all(), 5, "Oversize struct name not inserted"); + +my $short_struct_name = 'ssn'; + +my $sslk_accu = Bio::EnsEMBL::Hive::Accumulator->new( + adaptor => $acu_a, + struct_name => $short_struct_name, + signature_template => '{key}', ); -is($ada_a->count_all(), 7, "Overflow for long struct_name and key_signature in accu"); +my $sslk_output_id = [ { 'key' => $long_key_signature, + 'ssn' => 1, } ]; + +$sslk_accu->dataflow( + $sslk_output_id, + $accu_fan_job,); + +is($ada_a->count_all(), 6, "Oversize key signature overflow"); + # Test retrieval of overflow data @@ -177,7 +198,7 @@ my $fetched_accu_hash = $fetched_accu_structures->{$accu_funnel_job->dbID}; my $fetched_struct_name = (keys(%$fetched_accu_hash))[0]; my $fetched_key_signature = (keys(%{$fetched_accu_hash->{$fetched_struct_name}}))[0]; -is ($fetched_struct_name, $long_struct_name, "fetched long struct_name from accu"); +is ($fetched_struct_name, $short_struct_name, "fetched short struct_name from accu"); is ($fetched_key_signature, $long_key_signature, "fetched long key_signature from accu"); done_testing(); From 8beee26bde585fe008dce36140b44b32c6dd20e6 Mon Sep 17 00:00:00 2001 From: ens-bwalts Date: Wed, 3 Apr 2019 13:44:20 +0100 Subject: [PATCH 5/5] When using PostgreSQL, only set a size_limit for VARCHAR columns --- modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm index ff59e5bb9..fc853a1a4 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm @@ -262,7 +262,14 @@ sub _table_info_loader { # warn "ColumnInfo [$table_name/$column_name] = $column_type\n"; $column_set{$column_name} = $column_type; - $size_limit{$column_name} = $size_limit; + + # PostgreSQL reports a COLUMN_SIZE of 4 for enums, which is not compatible with + # the way slicer does column size checking. Likewise, PostgreSQL reports + # a user-defined TYPE_NAME for enums, rather than 'enum'. Therefore, if + # the DB is PostgreSQL, only set size_limit for varchars + unless (($driver eq 'pgsql') && !($column_type eq 'character varying')) { + $size_limit{$column_name} = $size_limit; + } if( ($column_name eq $table_name.'_id') or ($table_name eq 'analysis_base' and $column_name eq 'analysis_id') ) { # a special case (historical)