enable overflow for accu keys, to_analysis_urls, and submission_cmd_args #40

Open · wants to merge 5 commits into base: version/2.2
10 changes: 10 additions & 0 deletions modules/Bio/EnsEMBL/Hive/DBSQL/AccumulatorAdaptor.pm
@@ -48,6 +48,13 @@ sub default_table_name {
}


Reviewer comment: It's odd to me how many times you override the overflow_limit sub.

sub overflow_limit {
return {
'key_signature' => 255,
};
}
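Given the default_overflow_limit() fallback visible in BaseAdaptor's overflow_limit getter further down, the paradigm-conforming override the reviewer seems to be asking for would presumably be a getter-only default_ method — a sketch, not part of this PR:

    sub default_overflow_limit {
        return {
            'key_signature' => 255,   # overflow values longer than 255 characters
        };
    }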


sub fetch_structures_for_job_ids {
my ($self, $job_ids_csv, $id_scale, $id_offset) = @_;
$id_scale ||= 1;
@@ -63,6 +70,9 @@ sub fetch_structures_for_job_ids {

ROW: while(my ($receiving_job_id, $struct_name, $key_signature, $stringified_value) = $sth->fetchrow_array() ) {

($key_signature, $struct_name) = map {$self->check_and_dereference_analysis_data($_)}
($key_signature, $struct_name);

muffato marked this conversation as resolved.
my $value = destringify($stringified_value);

my $sptr = \$structures{$receiving_job_id * $id_scale + $id_offset}{$struct_name};
5 changes: 5 additions & 0 deletions modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisCtrlRuleAdaptor.pm
@@ -53,6 +53,11 @@ sub default_insertion_method {
return 'INSERT_IGNORE';
}

sub overflow_limit {
return {
'condition_analysis_url' => 255,
};
}

sub object_class {
return 'Bio::EnsEMBL::Hive::AnalysisCtrlRule';
4 changes: 1 addition & 3 deletions modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
@@ -700,9 +700,7 @@ sub fetch_input_ids_for_job_ids {
$sth->execute();

while(my ($job_id, $input_id) = $sth->fetchrow_array() ) {
if($input_id =~ /^_ext(?:\w+)_data_id (\d+)$/) {
$input_id = $self->db->get_AnalysisDataAdaptor->fetch_by_analysis_data_id_TO_data($1);
}
$input_id = $self->check_and_dereference_analysis_data($input_id);
$input_ids{$job_id * $id_scale + $id_offset} = $input_id;
}
}
110 changes: 106 additions & 4 deletions modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm
@@ -40,6 +40,7 @@ use strict;
use warnings;
no strict 'refs'; # needed to allow AUTOLOAD create new methods
use DBI 1.6; # the 1.6 functionality is important for detecting autoincrement fields and other magic.
use Scalar::Util ('blessed');

use Bio::EnsEMBL::Hive::Utils ('stringify', 'throw');

@@ -138,6 +139,26 @@ sub overflow_limit {
return $self->{_overflow_limit} || $self->default_overflow_limit();
}

Reviewer comment (Contributor): The paradigm of BaseAdaptor is to have a default_ version of all those methods, which is only a getter, and then the normal version, which is a getter/setter and falls back to the default_ version to set the initial value.

=head2 size_limit


Arg[1] : (optional) Hashref with column names as keys and column sizes as values
Description : Getter/setter for column size limits for the implementing adaptor.
If no size limit is set, this method will call _table_info_loader() to
get sizes from the database schema.
Returntype : Hashref

=cut

sub size_limit {
my $self = shift @_;

if(@_) { # setter
$self->{_size_limit} = shift @_;
} elsif ( !defined( $self->{_size_limit} ) ) {
$self->_table_info_loader();
}
return $self->{_size_limit};
}
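A minimal sketch of the paradigm that comment describes, modeled on the overflow_limit getter/setter above. default_size_limit is hypothetical here, and the PR's schema fallback via _table_info_loader() would still be needed when no default is defined:

    sub default_size_limit {
        return {};                          # getter only; subclasses override this
    }

    sub size_limit {
        my $self = shift @_;

        if(@_) {                            # setter
            $self->{_size_limit} = shift @_;
        }
        # getter, falling back to the default_ version for the initial value
        return $self->{_size_limit} || $self->default_size_limit();
    }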

sub input_column_mapping {
my $self = shift @_;
@@ -229,18 +250,27 @@ sub _table_info_loader {
my $table_name = $self->table_name();

my %column_set = ();
my %size_limit = ();
my $autoinc_id = '';
my @primary_key = $dbh->primary_key(undef, undef, $table_name);

my $sth = $dbh->column_info(undef, undef, $table_name, '%');
$sth->execute();
while (my $row = $sth->fetchrow_hashref()) {
my ( $column_name, $column_type ) = @$row{'COLUMN_NAME', 'TYPE_NAME'};
my ( $column_name, $column_type, $size_limit ) = @$row{'COLUMN_NAME', 'TYPE_NAME', 'COLUMN_SIZE'};

# warn "ColumnInfo [$table_name/$column_name] = $column_type\n";

$column_set{$column_name} = $column_type;

# PostgreSQL reports a COLUMN_SIZE of 4 for enums, which is not compatible with
# the way slicer does column size checking. Likewise, PostgreSQL reports
# a user-defined TYPE_NAME for enums, rather than 'enum'. Therefore, if
# the DB is PostgreSQL, only set size_limit for varchars
unless (($driver eq 'pgsql') && !($column_type eq 'character varying')) {
Reviewer comment (Contributor): Maybe a matter of taste, but I find an if would be way clearer:

    if (($driver ne 'pgsql') || ($column_type eq 'character varying')) {

$size_limit{$column_name} = $size_limit;
Reviewer comment (Contributor): This will set a size limit for all column types, incl. integers, enums and timestamps, but I'm not sure these are correctly enforceable:

  • For integers, we get the length of the longest possible string that fits in. For instance, the standard INT (32 bits) ranges from -2^31 to 2^31-1, so COLUMN_SIZE is 11 characters, but in fact not every 11-character number fits there.
  • ENUMs are given the size of their longest element.
  • TIMESTAMPs are given 14 characters, which I find scary: "2019-04-05 15:47:14" has 19 characters, so the API would start throwing when storing timestamps? Although these columns are most often set to CURRENT_TIMESTAMP in a way that bypasses slicer, I fear a corner case whereby eHive stores a previously-fetched object that already has a timestamp and the adaptor would now fail.

I would suggest to only populate size_limit for text fields.

}

if( ($column_name eq $table_name.'_id')
or ($table_name eq 'analysis_base' and $column_name eq 'analysis_id') ) { # a special case (historical)
$autoinc_id = $column_name;
@@ -249,6 +279,7 @@
$sth->finish;

$self->column_set( \%column_set );
$self->size_limit( \%size_limit );
$self->primary_key( \@primary_key );
$self->autoinc_id( $autoinc_id );
}
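A sketch of that last suggestion — populating size_limit only for text-like columns. The set of TYPE_NAME strings is an assumption about what the DBI drivers report (e.g. 'varchar'/'text' variants from MySQL and SQLite, 'character varying'/'text' from PostgreSQL):

    sub _is_text_like_column {
        my ($column_type) = @_;

        # assumed driver-reported type names for variable-length text columns
        my %text_like = map { $_ => 1 }
            ('char', 'varchar', 'character varying',
             'text', 'tinytext', 'mediumtext', 'longtext');
        return $text_like{ lc $column_type };
    }

    # inside the column_info loop above, the assignment would then become:
    #   $size_limit{$column_name} = $size_limit if _is_text_like_column($column_type);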
@@ -303,9 +334,7 @@ sub fetch_all {
while(my $hashref = $sth->fetchrow_hashref) {

foreach my $overflow_key (@overflow_columns) {
if($hashref->{$overflow_key} =~ /^_ext(?:\w+)_data_id (\d+)$/) {
$hashref->{$overflow_key} = $overflow_adaptor->fetch_by_analysis_data_id_TO_data($1);
}
$hashref->{$overflow_key} = $self->check_and_dereference_analysis_data($hashref->{$overflow_key});
}

my $pptr = \$result_struct;
@@ -530,6 +559,79 @@ sub store {
}


=head2 check_and_dereference_analysis_data

Arg [1] : string data that may reference an analysis_data_id
Usage : my $value = $self->check_and_dereference_analysis_data($$fetched_row[0])
Description : Checks to see if the passed value matches the regular expression
: /^_ext(?:\w+)_data_id (\d+)$/ (e.g. "_extended_data_id 3", a pointer to an analysis_data_id).
: If so, it returns the entry from the "data" column in analysis_data from the
: row containing the given analysis_data_id.
: If not, it returns the passed parameter unchanged.
Returntype : string

=cut

sub check_and_dereference_analysis_data {
my $self = shift @_;
my $possible_analysis_data_id_ref = shift @_;

if ($possible_analysis_data_id_ref =~ /^_ext(?:\w+)_data_id (\d+)$/) {
return $self->db->get_AnalysisDataAdaptor->fetch_by_analysis_data_id_TO_data($1);
} else {
return $possible_analysis_data_id_ref;
}
}
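For illustration, a sketch of the overflow round trip this method reverses; the exact "_extended_data_id N" token spelling is an assumption inferred from the regex above:

    # storing a value longer than the overflow limit returns a reference token
    my $token = $self->db->get_AnalysisDataAdaptor->store_if_needed('x' x 1000);
    # $token would look like "_extended_data_id 7" (assumed format)

    # on the way back out, the token is swapped for the original data ...
    my $restored = $self->check_and_dereference_analysis_data($token);

    # ... while values that never overflowed pass through unchanged
    my $short = $self->check_and_dereference_analysis_data('short');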

Reviewer comment (Contributor): I like this slicer. It's got all the functionality we want, and it avoids duplicating code and logic. In principle, though, this method in BaseAdaptor should not know about sub-classes (NakedTable vs Object) and how to treat them differently ($is_object). A purist would ask to introduce a method $self->get_field_value($sliceable, $field) that would be implemented as a hash lookup in NakedTableAdaptor and as a method call in ObjectAdaptor. I don't know yet if I want this or not.

sub slicer {
my ($self, $sliceable, $fields) = @_;

my $is_object = blessed($sliceable) ? 1 : 0;

my $autoinc_id;
if ($is_object) {
$autoinc_id = $self->autoinc_id();
}
my $overflow_limit = $self->overflow_limit();
my $size_limit = $self->size_limit();

my @slice;
# keep track of any values needing overflow, so that overflow can
# be deferred until after checking all fields for size limit violations
my %needs_overflow;

for (my $i = 0; $i <= $#{$fields}; $i++) {
my $field = $fields->[$i];
my $value = $is_object ? $sliceable->$field() : $sliceable->{$field};
my $ol = $overflow_limit->{$field};
my $sl = $size_limit->{$field};

if ($is_object && $field eq $autoinc_id) {
$slice[$i] = $sliceable->dbID();
} elsif (defined($ol) and defined($value) and (length($value) > $ol)) {
# if overflow limit exists for this field, we can ignore
# any size limit since an excessively large field will be
# handled through overflow
$needs_overflow{$i} = $value;
} elsif (defined($sl) and defined($value) and (length($value) > $sl)) {
# if no overflow limit, then check size and generate a meaningful error
# as some RDBMS implementations fail silently or misleadingly when trying
# to store an oversize value
throw("length of value for column \"" . $field .
"\" exceeds the maximum size, which is $sl");
} else {
$slice[$i] = $value;
}
}

foreach my $fields_index (keys(%needs_overflow)) {
$slice[$fields_index] =
$self->db->get_AnalysisDataAdaptor()->store_if_needed($needs_overflow{$fields_index});
}

return \@slice;
}
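A minimal sketch of the get_field_value() refactor the reviewer floats above — hypothetical, not part of this PR:

    # in Bio::EnsEMBL::Hive::DBSQL::NakedTableAdaptor, where rows are plain hashrefs:
    sub get_field_value {
        my ($self, $sliceable, $field) = @_;
        return $sliceable->{$field};        # hash lookup
    }

    # in Bio::EnsEMBL::Hive::DBSQL::ObjectAdaptor, where rows are blessed objects:
    sub get_field_value {
        my ($self, $sliceable, $field) = @_;
        return $sliceable->$field();        # accessor method call
    }

slicer() could then drop its blessed() check and call $self->get_field_value($sliceable, $field) for every field, leaving only the dbID()/autoinc special case to ObjectAdaptor.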

sub DESTROY { } # to simplify AUTOLOAD

sub AUTOLOAD {
7 changes: 7 additions & 0 deletions modules/Bio/EnsEMBL/Hive/DBSQL/DataflowRuleAdaptor.pm
@@ -55,6 +55,13 @@ sub default_insertion_method {
}


sub overflow_limit {
return {
'to_analysis_url' => 255,
};
}


sub object_class {
return 'Bio::EnsEMBL::Hive::DataflowRule';
}
7 changes: 0 additions & 7 deletions modules/Bio/EnsEMBL/Hive/DBSQL/NakedTableAdaptor.pm
@@ -44,13 +44,6 @@ use Bio::EnsEMBL::Hive::NakedTable;
use base ('Bio::EnsEMBL::Hive::DBSQL::BaseAdaptor');


sub slicer { # take a slice of the hashref (if only we could inline in Perl!)
my ($self, $hashref, $fields) = @_;

return [ @$hashref{@$fields} ];
}


sub objectify { # pretend the hashref becomes an object (if only we could inline in Perl!)
return pop @_;
}
19 changes: 0 additions & 19 deletions modules/Bio/EnsEMBL/Hive/DBSQL/ObjectAdaptor.pm
@@ -47,25 +47,6 @@ sub object_class {
die "Please define object_class() in your specific adaptor class to return the class name of your intended object";
}


sub slicer { # take a slice of the object (if only we could inline in Perl!)
my ($self, $object, $fields) = @_;

my $autoinc_id = $self->autoinc_id();
my $overflow_limit = $self->overflow_limit();

return [ map { ($_ eq $autoinc_id)
? $object->dbID()
: eval { my $value = $object->$_();
my $ol = $overflow_limit->{$_};
(defined($ol) and length($value)>$ol)
? $self->db->get_AnalysisDataAdaptor()->store_if_needed( $value )
: $value
}
} @$fields ];
}


sub objectify { # turn the hashref into an object (if only we could inline in Perl!)
my ($self, $hashref) = @_;

8 changes: 8 additions & 0 deletions modules/Bio/EnsEMBL/Hive/DBSQL/ResourceDescriptionAdaptor.pm
@@ -55,6 +55,14 @@ sub default_insertion_method {
}


sub overflow_limit {
return {
'submission_cmd_args' => 255,
muffato marked this conversation as resolved.
'worker_cmd_args' => 255,
};
}


sub object_class {
return 'Bio::EnsEMBL::Hive::ResourceDescription';
}
28 changes: 1 addition & 27 deletions t/02.api/fetch_and_count_by_multiple_columns.t
@@ -21,7 +21,7 @@ use warnings;
use Data::Dumper;
use File::Temp qw{tempdir};

use Test::More tests => 16;
use Test::More tests => 11;

use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;

@@ -40,9 +40,7 @@ my $pipeline_url = 'sqlite:///ehive_test_pipeline_db';

my $hive_dba = init_pipeline('Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf', $pipeline_url, [-hive_force_init => 1]);
my $ana_a = $hive_dba->get_AnalysisAdaptor;
my $job_a = $hive_dba->get_AnalysisJobAdaptor;
my $dfr_a = $hive_dba->get_DataflowRuleAdaptor;
my $ada_a = $hive_dba->get_AnalysisDataAdaptor;

is($ana_a->count_all(), 3, 'There are 3 analyses in the pipeline');
is($ana_a->count_all_by_logic_name('take_b_apart'), 1, 'But only 1 "take_b_apart"');
@@ -62,30 +60,6 @@ is($matching_analyses->[0]->to_analysis_url, 'part_multiply', 'Correct target lo

is($dfr_a->count_all_by_branch_code(1), 3, 'There are 3 #1 branches in the pipeline');

my $long_input_id = sprintf('{ "long_param" => "%s" }', 'tmp' x 1000);
my $new_job = Bio::EnsEMBL::Hive::AnalysisJob->new(
'input_id' => $long_input_id,
'analysis_id' => 1,
);

# Test the overflow to the analysis_data table
is($ada_a->count_all(), 0, "Nothing in the analysis_data table (yet)");

$job_a->store($new_job);
is($ada_a->count_all(), 1, "1 entry in the analysis_data table");

is($ada_a->fetch_by_data_TO_analysis_data_id('unmatched input_id'), undef, 'fetch_by_data_to_analysis_data_id() returns undef when it cannot find the input_id');
my $ext_data_id = $ada_a->fetch_by_data_TO_analysis_data_id($long_input_id);
is($ext_data_id, 1, 'analysis_data_id starts at 1');

my $another_job = Bio::EnsEMBL::Hive::AnalysisJob->new(
'input_id' => $long_input_id,
'analysis_id' => 2,
);

$job_a->store($another_job);
is($ada_a->count_all(), 1, "still 1 entry in the analysis_data table");

done_testing();

chdir $original;