3131
3232
3333class FileIO :
34- def __init__ (self , warehouse : str , catalog_options : dict ):
34+ def __init__ (self , path : str , catalog_options : dict ):
3535 self .properties = catalog_options
3636 self .logger = logging .getLogger (__name__ )
37- scheme , netloc , path = self .parse_location (warehouse )
37+ scheme , netloc , _ = self .parse_location (path )
3838 if scheme in {"oss" }:
39- self .filesystem = self ._initialize_oss_fs ()
39+ self .filesystem = self ._initialize_oss_fs (path )
4040 elif scheme in {"s3" , "s3a" , "s3n" }:
4141 self .filesystem = self ._initialize_s3_fs ()
4242 elif scheme in {"hdfs" , "viewfs" }:
@@ -56,7 +56,29 @@ def parse_location(location: str):
5656 else :
5757 return uri .scheme , uri .netloc , f"{ uri .netloc } { uri .path } "
5858
59- def _initialize_oss_fs (self ) -> FileSystem :
59+ def _extract_oss_bucket (self , location ) -> str :
60+ uri = urlparse (location )
61+ if uri .scheme and uri .scheme != "oss" :
62+ raise ValueError ("Not an OSS URI: {}" .format (location ))
63+
64+ netloc = uri .netloc or ""
65+ # parse oss://access_id:secret_key@Endpoint/bucket/path/to/object
66+ if (getattr (uri , "username" , None ) or getattr (uri , "password" , None )) or ("@" in netloc ):
67+ first_segment = uri .path .lstrip ("/" ).split ("/" , 1 )[0 ]
68+ if not first_segment :
69+ raise ValueError ("Invalid OSS URI without bucket: {}" .format (location ))
70+ return first_segment
71+
72+ # parse oss://bucket/... or oss://bucket.endpoint/...
73+ host = getattr (uri , "hostname" , None ) or netloc
74+ if not host :
75+ raise ValueError ("Invalid OSS URI without host: {}" .format (location ))
76+ bucket = host .split ("." , 1 )[0 ]
77+ if not bucket :
78+ raise ValueError ("Invalid OSS URI without bucket: {}" .format (location ))
79+ return bucket
80+
81+ def _initialize_oss_fs (self , path ) -> FileSystem :
6082 from pyarrow .fs import S3FileSystem
6183
6284 client_kwargs = {
@@ -71,7 +93,8 @@ def _initialize_oss_fs(self) -> FileSystem:
7193 client_kwargs ['force_virtual_addressing' ] = True
7294 client_kwargs ['endpoint_override' ] = self .properties .get (OssOptions .OSS_ENDPOINT )
7395 else :
74- client_kwargs ['endpoint_override' ] = (self .properties .get (OssOptions .OSS_BUCKET ) + "." +
96+ oss_bucket = self ._extract_oss_bucket (path )
97+ client_kwargs ['endpoint_override' ] = (oss_bucket + "." +
7598 self .properties .get (OssOptions .OSS_ENDPOINT ))
7699
77100 return S3FileSystem (** client_kwargs )
0 commit comments