Merge pull request #1805 from nakib103/install_https
Allow https download and GITHUB_TOKEN in VEP Install
olaaustine authored Jan 28, 2025
2 parents 70fa548 + 9765531 commit 5a816ab
Showing 1 changed file with 206 additions and 24 deletions.
230 changes: 206 additions & 24 deletions INSTALL.pl
@@ -45,6 +45,7 @@ =head1 NAME
use Net::FTP;
use Cwd;
use Scalar::Util qw(looks_like_number);
use List::Util qw(first);
use Bio::EnsEMBL::VEP::Utils qw(get_version_data get_version_string);

our (
@@ -79,6 +80,8 @@ =head1 NAME
$REALPATH_DEST_DIR,
$NO_TEST,
$NO_BIOPERL,
$USE_HTTPS_PROTO,
$GITHUB_TOKEN,
$ua,

$CAN_USE_CURL,
@@ -90,6 +93,7 @@ =head1 NAME
$CAN_USE_TAR,
$CAN_USE_DBI,
$CAN_USE_DBD_MYSQL,
$CAN_USE_HTTML_EXTRACT
);


@@ -120,6 +124,10 @@ BEGIN
$CAN_USE_DBD_MYSQL = 1;
}

if(eval q{ use HTML::TableExtract; 1 }) {
$CAN_USE_HTTML_EXTRACT = 1;
}

$CAN_USE_CURL = 1 if `which curl` =~ /\/curl/;
$CAN_USE_HTTP_TINY = 1 if eval q{ use HTTP::Tiny; 1 };
$CAN_USE_ARCHIVE = 1 if eval q{ use Archive::Extract; 1 };
@@ -146,7 +154,7 @@ BEGIN
my $git_api_root = 'https://api.github.com/repos/Ensembl/';
my $VEP_MODULE_NAME = 'ensembl-vep';

our (@store_species, @indexes, @files, $ftp, $dirname);
our (@store_species, @indexes, @files, %file_sizes, $ftp, $dirname);

my $config = {};
GetOptions(
@@ -172,7 +180,9 @@ BEGIN
'TEST',
'NO_HTSLIB|l',
'NO_TEST',
'NO_BIOPERL'
'NO_BIOPERL',
'USE_HTTPS_PROTO',
'GITHUBTOKEN|GITHUB_TOKEN=s',
) or die("ERROR: Failed to parse arguments");
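
For context, a hedged standalone sketch (not part of the diff) of how the new 'GITHUBTOKEN|GITHUB_TOKEN=s' and 'USE_HTTPS_PROTO' specifications behave with Getopt::Long: either spelling of the token option populates the same string-valued key, and the protocol switch is a plain boolean flag. The %opt hash and the token value here are illustrative only; the real script stores options in its own config.

use strict;
use warnings;
use Getopt::Long;

# Standalone illustration with a fake @ARGV.
my %opt;
@ARGV = ('--GITHUB_TOKEN', 'ghp_example_token', '--USE_HTTPS_PROTO');
GetOptions(\%opt, 'GITHUBTOKEN|GITHUB_TOKEN=s', 'USE_HTTPS_PROTO')
  or die "ERROR: Failed to parse arguments";

print "token: $opt{GITHUBTOKEN}\n";                      # token: ghp_example_token
print "https: ", $opt{USE_HTTPS_PROTO} ? 1 : 0, "\n";    # https: 1
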

# Read configuration from environment variables starting with VEP_
@@ -181,8 +191,8 @@ sub read_config_from_environment {
my $config = shift;

for my $key (keys %ENV) {
# Look for environment variables that start with VEP_
next unless $key =~ "^VEP_";
# Look for environment variables that start with VEP_, or the special case GITHUB_TOKEN
next unless ($key =~ "^VEP_" or $key eq "GITHUB_TOKEN");

# Avoid setting empty strings
my $value = $ENV{$key};
@@ -219,6 +229,8 @@ sub read_config_from_environment {
$NO_HTSLIB ||= $config->{NO_HTSLIB};
$NO_TEST ||= $config->{NO_TEST};
$NO_BIOPERL ||= $config->{NO_BIOPERL};
$USE_HTTPS_PROTO ||= $config->{USE_HTTPS_PROTO};
$GITHUB_TOKEN ||= $config->{GITHUBTOKEN};

# load version data
our $CURRENT_VERSION_DATA = get_version_data($RealBin.'/.version');
@@ -1076,6 +1088,22 @@ ()
# CACHE FILES
#############

sub convert_file_size {
# convert a human-readable file size string (e.g. "26G") to bytes

my $size = shift;
my @units = ( 'K', 'M', 'G', 'T' );

$size =~ m/\s?(\d+\.?\d+)([TGMK]?)/;

# in bytes already
return $1 unless defined $2;

# convert to bytes
my $scale = first {$units[$_] eq $2} 0..$#units;
return defined $scale ? $1 * (1024**($scale + 1)) : 0;
}
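
For reference, a minimal hedged sketch (not part of the diff) of the suffix-to-bytes lookup that convert_file_size() performs with List::Util's first; the helper name unit_multiplier is hypothetical and only mirrors the multiplier lookup, not the full parsing.

use strict;
use warnings;
use List::Util qw(first);

# Hypothetical helper: 'K' => 1024, 'M' => 1024**2, 'G' => 1024**3, 'T' => 1024**4;
# an empty or unknown suffix leaves the value unscaled here.
sub unit_multiplier {
    my $unit  = shift // '';
    my @units = ('K', 'M', 'G', 'T');
    my $scale = first { $units[$_] eq $unit } 0 .. $#units;
    return defined $scale ? 1024 ** ($scale + 1) : 1;
}

printf "%d\n", 1.5 * unit_multiplier('M');   # 1572864
printf "%d\n", 12  * unit_multiplier('K');   # 12288
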

sub format_file_size {
# Format $size (in bytes) based on most adequate unit, e.g.:
# 0 => 0 bytes
@@ -1099,6 +1127,11 @@ sub format_file_size {

sub cache() {

if ($USE_HTTPS_PROTO and !$CAN_USE_HTTML_EXTRACT) {
print "Cannot use HTTPS protocol for downloading cache without HTML::TableExtract Perl library installed\nSkipping cache installation\n" unless $QUIET;
return;
}

my $ok;

if($AUTO) {
@@ -1150,16 +1183,38 @@ ()
my $URL_TO_USE = (-e $tabix) ? $CACHE_URL_INDEXED : die "ERROR: Could not find tabix. It is recommended to use indexed cache files";

if(is_url($URL_TO_USE)) {
$URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();
if($USE_HTTPS_PROTO) {
# get cache file list in HTML format
my $ftp_html = get_html($URL_TO_USE);
unless (defined $ftp_html) {
print "curl failed to retrieve file list from FTP - $URL_TO_USE\n" unless $QUIET;
return;
}

# parse HTML cache file list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
foreach my $rows ($te->first_table_found->rows){
my ($name, $size) = ($rows->[1], $rows->[3]);

next unless $name =~ /tar.gz/;
push @files, $name;
$file_sizes{$name} = convert_file_size($size);
}
}
else {
$URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
}

push @files, grep {$_ =~ /tar.gz/} $ftp->ls;
push @files, grep {$_ =~ /tar.gz/} $ftp->ls;
}
}
else {
opendir DIR, $URL_TO_USE;
@@ -1191,7 +1246,16 @@ ()
my $size;
my $total_size = 0;
foreach my $file(@files) {
$size = defined $ftp ? $ftp->size($file) : 0;
if (defined $ftp) {
$size = $ftp->size($file);
}
elsif (%file_sizes) {
$size = $file_sizes{$file};
}
else {
$size = 0;
}

$total_size += $size;
$size = $size ? " (" . format_file_size($size) . ")" : "";
$species_list .= ++$num . " : $file$size\n";
@@ -1311,11 +1375,21 @@ ()
if(is_url($URL_TO_USE)) {
print " - downloading $URL_TO_USE/$file_path\n" unless $QUIET;
if(!$TEST) {
$ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file);
if ($USE_HTTPS_PROTO) {
download_to_file("$URL_TO_USE/$file_path", $target_file);
}
else {
$ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file);
}

my $checksums = "CHECKSUMS";
my $checksums_target_file = "$CACHE_DIR/tmp/$checksums";
$ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
if ($USE_HTTPS_PROTO) {
download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
}
else {
$ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
}
if (-e $checksums_target_file) {
my $sum_download = `sum $target_file`;
$sum_download =~ m/([0-9]+)(\s+)([0-9]+)/;
@@ -1366,6 +1440,11 @@ ()
#############
sub fasta() {

if ($USE_HTTPS_PROTO and !$CAN_USE_HTTML_EXTRACT) {
print "Cannot use HTTPS protocol for downloading FASTA without HTML::TableExtract Perl library installed\nSkipping FASTA installation\n" unless $QUIET;
return;
}

### SPECIAL CASE GRCh37
if((grep {$files[$_ - 1] =~ /GRCh37/} @indexes) || (defined($ASSEMBLY) && $ASSEMBLY eq 'GRCh37')) {

@@ -1400,17 +1479,39 @@ ()
}

my @dirs = ();

if(is_url($FASTA_URL)) {
$FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();
if($USE_HTTPS_PROTO) {
# get species list in HTML format
my $ftp_html = get_html($FASTA_URL);
unless (defined $ftp_html) {
print "curl failed to retrieve species list from FTP - $FASTA_URL\n" unless $QUIET;
return;
}

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
# parse HTML species list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $rows ($te->first_table_found->rows){
my $name = $rows->[1];

next unless $name =~ /\/$/;
next if $name =~ /ancestral_alleles/;

push @dirs, substr($name, 0, -1);
}
}
else {
$FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
}
push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls;
}
push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls;
}
else {
opendir DIR, $FASTA_URL;
@@ -1459,6 +1560,25 @@ ()
@files = $ftp->ls;
$dna_path = $ftp->pwd =~ /dna_index/ ? 'dna_index' : 'dna';
}
elsif($USE_HTTPS_PROTO) {
# get FASTA file list in HTML format
my $ftp_html = get_html("$FASTA_URL/$species/dna");
unless (defined $ftp_html) {
print "curl failed to retrieve file list from FTP - $FASTA_URL/$species/dna\n" unless $QUIET;
return;
}

# parse HTML FASTA file list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $rows ($te->first_table_found->rows){
my $name = $rows->[1];
next unless $name =~ /fa.gz/;

push @files, $name;
}
}
else {
if(!opendir DIR, "$FASTA_URL/$species/dna") {
warn "WARNING: Could not read from directory $FASTA_URL/$species/dna\n$@\n";
@@ -1517,6 +1637,10 @@ ()
$ftp->get($file, $ex) or download_to_file("$FASTA_URL/$species/$dna_path/$file", $ex);
}
}
elsif($USE_HTTPS_PROTO) {
print " - downloading $file\n" unless $QUIET;
download_to_file("$FASTA_URL/$species/dna/$file", $ex);
}
else {
print " - copying $file\n" unless $QUIET;
copy("$FASTA_URL/$species/dna/$file", $ex) unless $TEST;
@@ -1550,7 +1674,12 @@ ()
if(!$TEST) {
$index_file =~ /$file(\..+)/;
print " - downloading $index_file\n" unless $QUIET;
$ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
if($USE_HTTPS_PROTO) {
download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
}
else {
$ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
}
$got_indexes++;
}
}
@@ -1802,7 +1931,8 @@ sub download_to_file {
my ($url, $file) = @_;
if($CAN_USE_CURL) {
my $response = `curl -s -o $file -w '%{http_code}' --location "$url" `;
my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : "";
my $response = `curl -s -o $file -w '%{http_code}' $header --location "$url" `;
if ( $response != 200 && $response != 226) {
print "curl failed ($response), trying to fetch using LWP::Simple\n" unless $QUIET;
$CAN_USE_CURL = 0;
@@ -1856,6 +1986,52 @@ sub download_to_file {
}
}
sub get_html {
my $url = shift;
if($CAN_USE_CURL) {
my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : "";
my $response = `curl -s -w '%{http_code}' $header --location "$url" `;
my @lines = split(/\n/, $response);
my $status_code = pop @lines;
if ( $status_code != 200 && $status_code != 226) {
print "curl failed ($response), trying to fetch using LWP::Simple\n" unless $QUIET;
$CAN_USE_CURL = 0;
get_html($url);
}
return join("\n", @lines);
}
elsif($CAN_USE_LWP) {
my $req = HTTP::Request->new(GET => $url);
my $response = $ua->request($req);
unless($response->is_success) {
print "LWP::Simple failed ($response), trying to fetch using HTTP::Tiny\n" unless $QUIET;
$CAN_USE_LWP = 0;
get_html($url);
}
return $response->content;
}
elsif($CAN_USE_HTTP_TINY) {
my $response = HTTP::Tiny->new()->get($url);
unless($response->{success}) {
print("Failed last resort of using HTTP::Tiny to download $url\n");
return undef;
}
return $response->{content};
}
else {
print("Failed to get HTML content without curl, LWP or HTTP::Tiny installed\n");
return undef;
}
}
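
To make the HTTPS listing flow concrete, here is a hedged standalone sketch (not part of the diff) of how cache() and fasta() consume the HTML that get_html() returns: an Apache-style directory listing table is assumed, with the entry name in the second cell and the size in the fourth cell of each row, matching the $rows->[1] / $rows->[3] indexing used earlier. The listing content below is purely illustrative.

use strict;
use warnings;
use HTML::TableExtract;

# Illustrative directory-listing HTML; real listings come from get_html($URL_TO_USE).
my $html = <<'HTML';
<table>
  <tr><th>Icon</th><th>Name</th><th>Last modified</th><th>Size</th></tr>
  <tr><td></td><td>homo_sapiens_vep_113_GRCh38.tar.gz</td><td>2025-01-10</td><td>26G</td></tr>
  <tr><td></td><td>CHECKSUMS</td><td>2025-01-10</td><td>4.0K</td></tr>
</table>
HTML

my $te = HTML::TableExtract->new();
$te->parse($html);

foreach my $row ($te->first_table_found->rows) {
    my ($name, $size) = ($row->[1], $row->[3]);
    next unless defined $name && $name =~ /tar\.gz/;
    print "$name => $size\n";   # homo_sapiens_vep_113_GRCh38.tar.gz => 26G
}
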
# unpack a tarball
sub unpack_arch {
my ($arch_file, $dir) = @_;
@@ -1936,6 +2112,12 @@ sub usage {
a remote (e.g. FTP) address. The FASTA URL/directory must have
gzipped FASTA files under the following structure:
[species]/[dna]/
--USE_HTTPS_PROTO Download cache and FASTA files using the HTTPS protocol instead of FTP.
Useful for networks where the FTP port is blocked by a firewall.
--GITHUBTOKEN Set the token to use for authentication when querying the GitHub API. Authenticated
users have an increased rate limit. NOTE: use a token with read-only access.
END
print $usage;
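
As a usage note, a hedged sketch (not part of the diff) of how the token documented above reaches curl: whether it arrives via --GITHUBTOKEN or via a GITHUB_TOKEN environment variable picked up by read_config_from_environment(), download_to_file() and get_html() forward it as a bearer Authorization header. The URL below is illustrative only, and the command is printed rather than run.

use strict;
use warnings;

my $GITHUB_TOKEN = $ENV{GITHUB_TOKEN};   # the --GITHUBTOKEN option is the CLI alternative

# Mirror of the header construction in download_to_file()/get_html():
# only add the Authorization header when a token is actually set.
my $header = defined $GITHUB_TOKEN
  ? "--header \"Authorization: Bearer $GITHUB_TOKEN\""
  : "";

my $url = "https://api.github.com/repos/Ensembl/ensembl-vep/tags";   # illustrative
print "curl -s $header --location \"$url\"\n";
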
