Merge pull request #1805 from nakib103/install_https
Allow https download and GITHUB_TOKEN in VEP Install
olaaustine authored Jan 28, 2025
2 parents 70fa548 + 9765531 commit 5a816ab
Showing 1 changed file with 206 additions and 24 deletions.
230 changes: 206 additions & 24 deletions INSTALL.pl
@@ -45,6 +45,7 @@ =head1 NAME
use Net::FTP;
use Cwd;
use Scalar::Util qw(looks_like_number);
use List::Util qw(first);
use Bio::EnsEMBL::VEP::Utils qw(get_version_data get_version_string);

our (
@@ -79,6 +80,8 @@ =head1 NAME
$REALPATH_DEST_DIR,
$NO_TEST,
$NO_BIOPERL,
$USE_HTTPS_PROTO,
$GITHUB_TOKEN,
$ua,

$CAN_USE_CURL,
@@ -90,6 +93,7 @@ =head1 NAME
$CAN_USE_TAR,
$CAN_USE_DBI,
$CAN_USE_DBD_MYSQL,
$CAN_USE_HTTML_EXTRACT
);


@@ -120,6 +124,10 @@ BEGIN
$CAN_USE_DBD_MYSQL = 1;
}

if(eval q{ use HTML::TableExtract; 1 }) {
$CAN_USE_HTTML_EXTRACT = 1;
}

$CAN_USE_CURL = 1 if `which curl` =~ /\/curl/;
$CAN_USE_HTTP_TINY = 1 if eval q{ use HTTP::Tiny; 1 };
$CAN_USE_ARCHIVE = 1 if eval q{ use Archive::Extract; 1 };
@@ -146,7 +154,7 @@ BEGIN
my $git_api_root = 'https://api.github.com/repos/Ensembl/';
my $VEP_MODULE_NAME = 'ensembl-vep';

our (@store_species, @indexes, @files, $ftp, $dirname);
our (@store_species, @indexes, @files, %file_sizes, $ftp, $dirname);

my $config = {};
GetOptions(
@@ -172,7 +180,9 @@ BEGIN
'TEST',
'NO_HTSLIB|l',
'NO_TEST',
'NO_BIOPERL'
'NO_BIOPERL',
'USE_HTTPS_PROTO',
'GITHUBTOKEN|GITHUB_TOKEN=s',
) or die("ERROR: Failed to parse arguments");
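
For context, a hedged standalone sketch (not part of the diff) of how the new 'GITHUBTOKEN|GITHUB_TOKEN=s' and 'USE_HTTPS_PROTO' specifications behave with Getopt::Long: either spelling of the token option populates the same string-valued key, and the protocol switch is a plain boolean flag. The %opt hash and the token value here are illustrative only; the real script stores options in its own config.

use strict;
use warnings;
use Getopt::Long;

# Standalone illustration with a fake @ARGV.
my %opt;
@ARGV = ('--GITHUB_TOKEN', 'ghp_example_token', '--USE_HTTPS_PROTO');
GetOptions(\%opt, 'GITHUBTOKEN|GITHUB_TOKEN=s', 'USE_HTTPS_PROTO')
  or die "ERROR: Failed to parse arguments";

print "token: $opt{GITHUBTOKEN}\n";                      # token: ghp_example_token
print "https: ", $opt{USE_HTTPS_PROTO} ? 1 : 0, "\n";    # https: 1
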

# Read configuration from environment variables starting with VEP_
@@ -181,8 +191,8 @@ sub read_config_from_environment {
my $config = shift;

for my $key (keys %ENV) {
# Look for environment variables that start with VEP_
next unless $key =~ "^VEP_";
# Look for environment variables that start with VEP_, or the special case GITHUB_TOKEN
next unless ($key =~ "^VEP_" or $key eq "GITHUB_TOKEN");

# Avoid setting empty strings
my $value = $ENV{$key};
@@ -219,6 +229,8 @@ sub read_config_from_environment {
$NO_HTSLIB ||= $config->{NO_HTSLIB};
$NO_TEST ||= $config->{NO_TEST};
$NO_BIOPERL ||= $config->{NO_BIOPERL};
$USE_HTTPS_PROTO ||= $config->{USE_HTTPS_PROTO};
$GITHUB_TOKEN ||= $config->{GITHUBTOKEN};

# load version data
our $CURRENT_VERSION_DATA = get_version_data($RealBin.'/.version');
@@ -1076,6 +1088,22 @@ ()
# CACHE FILES
#############

sub convert_file_size {
# convert a human-readable file size string (e.g. "26G") to bytes

my $size = shift;
my @units = ( 'K', 'M', 'G', 'T' );

$size =~ m/\s?(\d+\.?\d+)([TGMK]?)/;

# in bytes already
return $1 unless defined $2;

# convert to bytes
my $scale = first {$units[$_] eq $2} 0..$#units;
return defined $scale ? $1 * (1024**($scale + 1)) : 0;
}
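
For reference, a minimal hedged sketch (not part of the diff) of the suffix-to-bytes lookup that convert_file_size() performs with List::Util's first; the helper name unit_multiplier is hypothetical and only mirrors the multiplier lookup, not the full parsing.

use strict;
use warnings;
use List::Util qw(first);

# Hypothetical helper: 'K' => 1024, 'M' => 1024**2, 'G' => 1024**3, 'T' => 1024**4;
# an empty or unknown suffix leaves the value unscaled here.
sub unit_multiplier {
    my $unit  = shift // '';
    my @units = ('K', 'M', 'G', 'T');
    my $scale = first { $units[$_] eq $unit } 0 .. $#units;
    return defined $scale ? 1024 ** ($scale + 1) : 1;
}

printf "%d\n", 1.5 * unit_multiplier('M');   # 1572864
printf "%d\n", 12  * unit_multiplier('K');   # 12288
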

sub format_file_size {
# Format $size (in bytes) based on most adequate unit, e.g.:
# 0 => 0 bytes
@@ -1099,6 +1127,11 @@ sub format_file_size {

sub cache() {

if ($USE_HTTPS_PROTO and !$CAN_USE_HTTML_EXTRACT) {
print "Cannot use HTTPS protocol for downloading cache without HTML::TableExtract Perl library installed\nSkipping cache installation\n" unless $QUIET;
return;
}

my $ok;

if($AUTO) {
@@ -1150,16 +1183,38 @@ ()
my $URL_TO_USE = (-e $tabix) ? $CACHE_URL_INDEXED : die "ERROR: Could not find tabix. It is recommended to use indexed cache files";

if(is_url($URL_TO_USE)) {
$URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();
if($USE_HTTPS_PROTO) {
# get cache file list in HTML format
my $ftp_html = get_html($URL_TO_USE);
unless (defined $ftp_html) {
print "curl failed to retrieve file list from FTP - $URL_TO_USE\n" unless $QUIET;
return;
}

# parse HTML cache file list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
foreach my $rows ($te->first_table_found->rows){
my ($name, $size) = ($rows->[1], $rows->[3]);

next unless $name =~ /tar.gz/;
push @files, $name;
$file_sizes{$name} = convert_file_size($size);
}
}
else {
$URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
}

push @files, grep {$_ =~ /tar.gz/} $ftp->ls;
push @files, grep {$_ =~ /tar.gz/} $ftp->ls;
}
}
else {
opendir DIR, $URL_TO_USE;
@@ -1191,7 +1246,16 @@ ()
my $size;
my $total_size = 0;
foreach my $file(@files) {
$size = defined $ftp ? $ftp->size($file) : 0;
if (defined $ftp) {
$size = $ftp->size($file);
}
elsif (%file_sizes) {
$size = $file_sizes{$file};
}
else {
$size = 0;
}

$total_size += $size;
$size = $size ? " (" . format_file_size($size) . ")" : "";
$species_list .= ++$num . " : $file$size\n";
@@ -1311,11 +1375,21 @@ ()
if(is_url($URL_TO_USE)) {
print " - downloading $URL_TO_USE/$file_path\n" unless $QUIET;
if(!$TEST) {
$ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file);
if ($USE_HTTPS_PROTO) {
download_to_file("$URL_TO_USE/$file_path", $target_file);
}
else {
$ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file);
}

my $checksums = "CHECKSUMS";
my $checksums_target_file = "$CACHE_DIR/tmp/$checksums";
$ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
if ($USE_HTTPS_PROTO) {
download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
}
else {
$ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
}
if (-e $checksums_target_file) {
my $sum_download = `sum $target_file`;
$sum_download =~ m/([0-9]+)(\s+)([0-9]+)/;
@@ -1366,6 +1440,11 @@ ()
#############
sub fasta() {

if ($USE_HTTPS_PROTO and !$CAN_USE_HTTML_EXTRACT) {
print "Cannot use HTTPS protocol for downloading FASTA without HTML::TableExtract Perl library installed\nSkipping FASTA installation\n" unless $QUIET;
return;
}

### SPECIAL CASE GRCh37
if((grep {$files[$_ - 1] =~ /GRCh37/} @indexes) || (defined($ASSEMBLY) && $ASSEMBLY eq 'GRCh37')) {

@@ -1400,17 +1479,39 @@ ()
}

my @dirs = ();

if(is_url($FASTA_URL)) {
$FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();
if($USE_HTTPS_PROTO) {
# get species list in HTML format
my $ftp_html = get_html($FASTA_URL);
unless (defined $ftp_html) {
print "curl failed to retrieve species list from FTP - $FASTA_URL\n" unless $QUIET;
return;
}

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
# parse HTML species list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $rows ($te->first_table_found->rows){
my $name = $rows->[1];

next unless $name =~ /\/$/;
next if $name =~ /ancestral_alleles/;

push @dirs, substr($name, 0, -1);
}
}
else {
$FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
}
push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls;
}
push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls;
}
else {
opendir DIR, $FASTA_URL;
@@ -1459,6 +1560,25 @@ ()
@files = $ftp->ls;
$dna_path = $ftp->pwd =~ /dna_index/ ? 'dna_index' : 'dna';
}
elsif($USE_HTTPS_PROTO) {
# get FASTA file list in HTML format
my $ftp_html = get_html("$FASTA_URL/$species/dna");
unless (defined $ftp_html) {
print "curl failed to retrieve file list from FTP - $FASTA_URL/$species/dna\n" unless $QUIET;
return;
}

# parse HTML FASTA file list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $rows ($te->first_table_found->rows){
my $name = $rows->[1];
next unless $name =~ /fa.gz/;

push @files, $name;
}
}
else {
if(!opendir DIR, "$FASTA_URL/$species/dna") {
warn "WARNING: Could not read from directory $FASTA_URL/$species/dna\n$@\n";
@@ -1517,6 +1637,10 @@ ()
$ftp->get($file, $ex) or download_to_file("$FASTA_URL/$species/$dna_path/$file", $ex);
}
}
elsif($USE_HTTPS_PROTO) {
print " - downloading $file\n" unless $QUIET;
download_to_file("$FASTA_URL/$species/dna/$file", $ex);
}
else {
print " - copying $file\n" unless $QUIET;
copy("$FASTA_URL/$species/dna/$file", $ex) unless $TEST;
@@ -1550,7 +1674,12 @@ ()
if(!$TEST) {
$index_file =~ /$file(\..+)/;
print " - downloading $index_file\n" unless $QUIET;
$ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
if($USE_HTTPS_PROTO) {
download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
}
else {
$ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
}
$got_indexes++;
}
}
@@ -1802,7 +1931,8 @@ sub download_to_file {
my ($url, $file) = @_;
if($CAN_USE_CURL) {
my $response = `curl -s -o $file -w '%{http_code}' --location "$url" `;
my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : "";
my $response = `curl -s -o $file -w '%{http_code}' $header --location "$url" `;
if ( $response != 200 && $response != 226) {
print "curl failed ($response), trying to fetch using LWP::Simple\n" unless $QUIET;
$CAN_USE_CURL = 0;
@@ -1856,6 +1986,52 @@ sub download_to_file {
}
}
sub get_html {
my $url = shift;
if($CAN_USE_CURL) {
my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : "";
my $response = `curl -s -w '%{http_code}' $header --location "$url" `;
my @lines = split(/\n/, $response);
my $status_code = pop @lines;
if ( $status_code != 200 && $status_code != 226) {
print "curl failed ($response), trying to fetch using LWP::Simple\n" unless $QUIET;
$CAN_USE_CURL = 0;
get_html($url);
}
return join("\n", @lines);
}
elsif($CAN_USE_LWP) {
my $req = HTTP::Request->new(GET => $url);
my $response = $ua->request($req);
unless($response->is_success) {
print "LWP::Simple failed ($response), trying to fetch using HTTP::Tiny\n" unless $QUIET;
$CAN_USE_LWP = 0;
get_html($url);
}
return $response->content;
}
elsif($CAN_USE_HTTP_TINY) {
my $response = HTTP::Tiny->new()->get($url);
unless($response->{success}) {
print("Failed last resort of using HTTP::Tiny to download $url\n");
return undef;
}
return $response->{content};
}
else {
print("Failed to get HTML content without curl, LWP or HTTP::Tiny installed\n");
return undef;
}
}
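
To make the HTTPS listing flow concrete, here is a hedged standalone sketch (not part of the diff) of how cache() and fasta() consume the HTML that get_html() returns: an Apache-style directory listing table is assumed, with the entry name in the second cell and the size in the fourth cell of each row, matching the $rows->[1] / $rows->[3] indexing used earlier. The listing content below is purely illustrative.

use strict;
use warnings;
use HTML::TableExtract;

# Illustrative directory-listing HTML; real listings come from get_html($URL_TO_USE).
my $html = <<'HTML';
<table>
  <tr><th>Icon</th><th>Name</th><th>Last modified</th><th>Size</th></tr>
  <tr><td></td><td>homo_sapiens_vep_113_GRCh38.tar.gz</td><td>2025-01-10</td><td>26G</td></tr>
  <tr><td></td><td>CHECKSUMS</td><td>2025-01-10</td><td>4.0K</td></tr>
</table>
HTML

my $te = HTML::TableExtract->new();
$te->parse($html);

foreach my $row ($te->first_table_found->rows) {
    my ($name, $size) = ($row->[1], $row->[3]);
    next unless defined $name && $name =~ /tar\.gz/;
    print "$name => $size\n";   # homo_sapiens_vep_113_GRCh38.tar.gz => 26G
}
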
# unpack a tarball
sub unpack_arch {
my ($arch_file, $dir) = @_;
@@ -1936,6 +2112,12 @@ sub usage {
a remote (e.g. FTP) address. The FASTA URL/directory must have
gzipped FASTA files under the following structure:
[species]/[dna]/
--USE_HTTPS_PROTO Download cache and FASTA files using the HTTPS protocol instead of FTP.
Useful for networks where the FTP port is blocked by a firewall.
--GITHUBTOKEN Set the token to use for authentication when querying the GitHub API. Authenticated
users have an increased rate limit. NOTE: use a token with read-only access.
END
print $usage;
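
As a usage note, a hedged sketch (not part of the diff) of how the token documented above reaches curl: whether it arrives via --GITHUBTOKEN or via a GITHUB_TOKEN environment variable picked up by read_config_from_environment(), download_to_file() and get_html() forward it as a bearer Authorization header. The URL below is illustrative only, and the command is printed rather than run.

use strict;
use warnings;

my $GITHUB_TOKEN = $ENV{GITHUB_TOKEN};   # the --GITHUBTOKEN option is the CLI alternative

# Mirror of the header construction in download_to_file()/get_html():
# only add the Authorization header when a token is actually set.
my $header = defined $GITHUB_TOKEN
  ? "--header \"Authorization: Bearer $GITHUB_TOKEN\""
  : "";

my $url = "https://api.github.com/repos/Ensembl/ensembl-vep/tags";   # illustrative
print "curl -s $header --location \"$url\"\n";
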
