-
Notifications
You must be signed in to change notification settings - Fork 28
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
enable overflow for accu keys, to_analysis_urls, and submission_cmd_args #40
Changes from 3 commits
1e85f0b
48b3e58
1beb0a5
d657f9d
8beee26
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,6 +48,14 @@ sub default_table_name { | |
} | ||
|
||
|
||
sub overflow_limit { | ||
return { | ||
'key_signature' => 255, | ||
'struct_name' => 255, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure about overflowing There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. struct_name needs protection because it's linked to to_analysis_url, which is itself protected. As for param_name, logic_name, and others that get set at init time - I'm not sure it's worth introducing a size limit which would throw an error, since the MySQL error that gets thrown is actually informative in this case. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It actually depends on how the MySQL server has been configured. mysql-ens-compara-prod-1 complains:
whereas mysql-ens-reg-prod-4 silently truncates the logic name to 256 characters There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have been experimenting (not pushed yet) with getting sizes in _table_info_loader(). Straightforward, but for many types COLUMN_SIZE returns null. These are types such as INT or TEXT, where either the size is very large, or which we are less concerned about. The bottom line is that the check against size_limit is not comprehensive without a lot of work (and somehow defining it for every column elsewhere in the code), but it does cover the cases that the ticket and this PR care about |
||
}; | ||
} | ||
|
||
|
||
sub fetch_structures_for_job_ids { | ||
my ($self, $job_ids_csv, $id_scale, $id_offset) = @_; | ||
$id_scale ||= 1; | ||
|
@@ -63,6 +71,9 @@ sub fetch_structures_for_job_ids { | |
|
||
ROW: while(my ($receiving_job_id, $struct_name, $key_signature, $stringified_value) = $sth->fetchrow_array() ) { | ||
|
||
($key_signature, $struct_name) = map {$self->check_and_dereference_analysis_data($_)} | ||
($key_signature, $struct_name); | ||
|
||
muffato marked this conversation as resolved.
Show resolved
Hide resolved
|
||
my $value = destringify($stringified_value); | ||
|
||
my $sptr = \$structures{$receiving_job_id * $id_scale + $id_offset}{$struct_name}; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,7 +47,18 @@ use base ('Bio::EnsEMBL::Hive::DBSQL::BaseAdaptor'); | |
sub slicer { # take a slice of the hashref (if only we could inline in Perl!) | ||
my ($self, $hashref, $fields) = @_; | ||
|
||
return [ @$hashref{@$fields} ]; | ||
my $overflow_limit = $self->overflow_limit(); | ||
|
||
return [ map { eval { my $value = $hashref->{$_}; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know there is an |
||
my $ol = $overflow_limit->{$_}; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is almost exactly the same as in ObjectAdaptor. I would prefer this to be in BaseAdaptor but I'm not sure how to do it cleanly, clearly and efficiently. |
||
if (defined($ol) and defined($value) and (length($value) > $ol)) { | ||
$self->db->get_AnalysisDataAdaptor()->store_if_needed($value); | ||
} else { | ||
$value; | ||
} | ||
} | ||
|
||
} @$fields ]; | ||
} | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
#!/usr/bin/env perl | ||
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | ||
# Copyright [2016-2018] EMBL-European Bioinformatics Institute | ||
muffato marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
use strict; | ||
use warnings; | ||
|
||
use Data::Dumper; | ||
use File::Temp qw{tempdir}; | ||
|
||
use Test::More tests => 18; | ||
|
||
use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor; | ||
use Bio::EnsEMBL::Hive::ResourceClass; | ||
|
||
use Bio::EnsEMBL::Hive::Utils::Test qw(init_pipeline); | ||
|
||
# eHive needs this to initialize the pipeline (and run db_cmd.pl) | ||
use Cwd (); | ||
use File::Basename (); | ||
$ENV{'EHIVE_ROOT_DIR'} ||= File::Basename::dirname( File::Basename::dirname( File::Basename::dirname( Cwd::realpath($0) ) ) ); | ||
|
||
# Work inside a throw-away directory (removed at exit). The pipeline URL below
# carries no directory component, so presumably the SQLite file is created in
# the current directory -- hence the chdir must succeed.
my $dir = tempdir CLEANUP => 1;
chdir $dir or die "Cannot chdir to temporary directory '$dir': $!";

my $pipeline_url = 'sqlite:///ehive_test_pipeline_db';

my $hive_dba = init_pipeline('Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf', $pipeline_url, [-hive_force_init => 1]);

# Adaptors used throughout the test
my $job_a = $hive_dba->get_AnalysisJobAdaptor;
my $rcl_a = $hive_dba->get_ResourceClassAdaptor;
my $rde_a = $hive_dba->get_ResourceDescriptionAdaptor;
my $dfr_a = $hive_dba->get_DataflowRuleAdaptor;
my $ada_a = $hive_dba->get_AnalysisDataAdaptor;
my $acu_a = $hive_dba->get_AccumulatorAdaptor;
my $acr_a = $hive_dba->get_AnalysisCtrlRuleAdaptor;
my $ana_a = $hive_dba->get_AnalysisAdaptor;

# An input_id of a bit over 3000 characters
my $long_input_id = sprintf('{ "long_param" => "%s" }', 'tmp' x 1000);
my $new_job = Bio::EnsEMBL::Hive::AnalysisJob->new(
    'input_id'    => $long_input_id,
    'analysis_id' => 1,
);

# Test the overflow into the analysis_data table
# Test overflow for input_id
is($ada_a->count_all(), 0, "Nothing in the analysis_data table (yet)");

$job_a->store($new_job);
is($ada_a->count_all(), 1, "1 entry in the analysis_data table");

is($ada_a->fetch_by_data_TO_analysis_data_id('unmatched input_id'), undef, 'fetch_by_data_TO_analysis_data_id() returns undef when it cannot find the input_id');
my $ext_data_id = $ada_a->fetch_by_data_TO_analysis_data_id($long_input_id);
is($ext_data_id, 1, 'analysis_data_id starts at 1');

# A second job with the same long input_id must not add another analysis_data row
my $fan_job = Bio::EnsEMBL::Hive::AnalysisJob->new(
    'input_id'    => $long_input_id,
    'analysis_id' => 2,
);

$job_a->store($fan_job);
is($ada_a->count_all(), 1, "still 1 entry in the analysis_data table");

# Test overflow for resource description args

Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->init_collections();
my $new_rc = Bio::EnsEMBL::Hive::ResourceClass->add_new_or_update(
    'name' => 'testresourceclass',
);

# 258-character strings
my $long_sca = 'sc' x 129;
my $long_wca = 'wc' x 129;
my $new_rd = Bio::EnsEMBL::Hive::ResourceDescription->add_new_or_update(
    'resource_class'      => $new_rc,
    'meadow_type'         => 'test_meadow',
    'submission_cmd_args' => $long_sca,
    'worker_cmd_args'     => $long_wca,
);

$rcl_a->store($new_rc);
$rde_a->store($new_rd);
is($ada_a->count_all(), 3, "New resource description overflowed two entries to analysis_data, total 3");

# Test overflow for to_analysis_urls

my $long_struct_name     = 'ta' x 129;
my $long_to_analysis_url = ':////accu?' . $long_struct_name;
my $new_dfr = Bio::EnsEMBL::Hive::DataflowRule->add_new_or_update(
    'from_analysis'   => $ana_a->fetch_by_dbID(1),
    'to_analysis_url' => $long_to_analysis_url,
    'branch_code'     => 3,
);

$dfr_a->store($new_dfr);
is($ada_a->count_all(), 4, "New to_analysis_url overflowed an entry to analysis_data, total 4");

# Test overflow for condition analysis urls

my $long_cau           = 'cau' x 86;    # 258 characters
my $ctrled_analysis_id = 1;
my $new_acr = Bio::EnsEMBL::Hive::AnalysisCtrlRule->add_new_or_update(
    'condition_analysis_url' => $long_cau,
    'ctrled_analysis'        => $ana_a->fetch_by_dbID($ctrled_analysis_id),
);

$acr_a->store($new_acr);
is($ada_a->count_all(), 5, "New condition_analysis_url overflowed an entry to analysis_data, total 5");

# Test overflow for accu key_signatures
# Note: AccumulatorAdaptor will complain if storing an accu without a proper fan job
#       and semaphored funnel job

my $accu_funnel_job = Bio::EnsEMBL::Hive::AnalysisJob->new(
    'input_id'    => {},
    'analysis_id' => 3,
);
$job_a->store($accu_funnel_job);

my $accu_fan_job = Bio::EnsEMBL::Hive::AnalysisJob->new(
    'input_id'          => {},
    'analysis_id'       => 2,
    'semaphored_job_id' => $accu_funnel_job->dbID,
);
$job_a->store($accu_fan_job);

my $new_accu = Bio::EnsEMBL::Hive::Accumulator->new(
    adaptor            => $acu_a,
    struct_name        => $long_struct_name,
    signature_template => '{key}',
);

my $long_key_signature = 'ks' x 129;
my $long_output_id     = [ { 'key'             => $long_key_signature,
                             $long_struct_name => 1, } ];
$new_accu->dataflow(
    $long_output_id,
    $accu_fan_job,
);

is($ada_a->count_all(), 7, "Overflow for long struct_name and key_signature in accu");

# Test retrieval of overflow data

my $fetched_rds = $rde_a->fetch_all();
my $rd_with_long_args;
foreach my $fetched_rd (@$fetched_rds) {
    if ($fetched_rd->resource_class_id() == $new_rc->dbID) {
        $rd_with_long_args = $fetched_rd;
    }
}

# Fail with a meaningful message rather than "Can't call method ... on an undefined value"
die "Could not fetch back the resource description stored for resource class 'testresourceclass'"
    unless defined $rd_with_long_args;

is($rd_with_long_args->submission_cmd_args, $long_sca, "Retrieved long submission_cmd_args");
is($rd_with_long_args->worker_cmd_args,     $long_wca, "Retrieved long worker_cmd_args");

my $fetched_dfr = $dfr_a->fetch_by_dbID($new_dfr->dbID);
is($fetched_dfr->to_analysis_url, $long_to_analysis_url, "Retrieved long to_analysis_url");

my $fetched_acr = $acr_a->fetch_by_ctrled_analysis_id($ctrled_analysis_id);
is($fetched_acr->condition_analysis_url, $long_cau, "Retrieved long condition_analysis_url");

# $fetched_accu_structures->{$receiving_job_id}->{$struct_name}->{$key_signature} = value
my $fetched_accu_structures = $acu_a->fetch_structures_for_job_ids($accu_funnel_job->dbID);
my $fetched_accu_hash       = $fetched_accu_structures->{$accu_funnel_job->dbID};
my $fetched_struct_name     = (keys(%$fetched_accu_hash))[0];
my $fetched_key_signature   = (keys(%{$fetched_accu_hash->{$fetched_struct_name}}))[0];

is($fetched_struct_name,   $long_struct_name,   "fetched long struct_name from accu");
is($fetched_key_signature, $long_key_signature, "fetched long key_signature from accu");

done_testing();
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's odd to me how many times you overload the overflow_limit sub.