From b3f658e90e1fdcfdbc86f01f52b89d1bbc46e890 Mon Sep 17 00:00:00 2001
From: Khuwnchai J
Date: Mon, 13 Jun 2022 00:19:58 +0700
Subject: [PATCH 1/3] Add support for Spark 3

---
 aws-glue-datacatalog-client-common/pom.xml    |   5 -
 .../metastore/AWSCatalogMetastoreClient.java  |  55 +--
 aws-glue-datacatalog-spark-client/pom.xml     |   4 +-
 .../metastore/AWSCatalogMetastoreClient.java  | 316 ++++++++++++++----
 .../AWSGlueDataCatalogHiveClientFactory.java  |   9 +-
 .../AWSCatalogMetastoreClientTest.java        |   3 +-
 pom.xml                                       |   6 +-
 shims/spark-hive-shims/pom.xml                |   4 +-
 .../glue/shims/AwsGlueSparkHiveShims.java     |  80 ++++-
 9 files changed, 351 insertions(+), 131 deletions(-)

diff --git a/aws-glue-datacatalog-client-common/pom.xml b/aws-glue-datacatalog-client-common/pom.xml
index 06fcc6e0..763a6e91 100644
--- a/aws-glue-datacatalog-client-common/pom.xml
+++ b/aws-glue-datacatalog-client-common/pom.xml
@@ -69,11 +69,6 @@
             <artifactId>shims-loader</artifactId>
             <version>${project.version}</version>
         </dependency>
-        <dependency>
-            <groupId>com.google.guava</groupId>
-            <artifactId>guava</artifactId>
-            <version>${guava.version}</version>
-        </dependency>
     </dependencies>

diff --git a/aws-glue-datacatalog-hive2-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java b/aws-glue-datacatalog-hive2-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java
index a1871c3c..4a5849c7 100644
--- a/aws-glue-datacatalog-hive2-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java
+++ b/aws-glue-datacatalog-hive2-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java
@@ -40,54 +40,7 @@
 import org.apache.hadoop.hive.metastore.PartitionDropOptions;
 import org.apache.hadoop.hive.metastore.TableType;
 import org.apache.hadoop.hive.metastore.Warehouse;
-import org.apache.hadoop.hive.metastore.api.AggrStats;
-import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
-import org.apache.hadoop.hive.metastore.api.CompactionType;
-import org.apache.hadoop.hive.metastore.api.ConfigValSecurityException;
-import org.apache.hadoop.hive.metastore.api.CurrentNotificationEventId;
-import org.apache.hadoop.hive.metastore.api.DataOperationType;
-import org.apache.hadoop.hive.metastore.api.Database;
-import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
-import org.apache.hadoop.hive.metastore.api.FireEventRequest;
-import org.apache.hadoop.hive.metastore.api.FireEventResponse;
-import org.apache.hadoop.hive.metastore.api.ForeignKeysRequest;
-import org.apache.hadoop.hive.metastore.api.GetAllFunctionsResponse;
-import org.apache.hadoop.hive.metastore.api.GetOpenTxnsInfoResponse;
-import org.apache.hadoop.hive.metastore.api.GetRoleGrantsForPrincipalRequest;
-import org.apache.hadoop.hive.metastore.api.GetRoleGrantsForPrincipalResponse;
-import org.apache.hadoop.hive.metastore.api.HeartbeatTxnRangeResponse;
-import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege;
-import org.apache.hadoop.hive.metastore.api.HiveObjectRef;
-import org.apache.hadoop.hive.metastore.api.HiveObjectType;
-import org.apache.hadoop.hive.metastore.api.Index;
-import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
-import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
-import org.apache.hadoop.hive.metastore.api.InvalidPartitionException;
-import org.apache.hadoop.hive.metastore.api.LockRequest;
-import org.apache.hadoop.hive.metastore.api.LockResponse;
-import org.apache.hadoop.hive.metastore.api.MetaException;
-import org.apache.hadoop.hive.metastore.api.MetadataPpdResult;
-import org.apache.hadoop.hive.metastore.api.NoSuchLockException;
-import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
-import org.apache.hadoop.hive.metastore.api.NoSuchTxnException;
-import org.apache.hadoop.hive.metastore.api.NotificationEventResponse;
-import org.apache.hadoop.hive.metastore.api.OpenTxnsResponse;
-import org.apache.hadoop.hive.metastore.api.PartitionEventType;
-import org.apache.hadoop.hive.metastore.api.PrimaryKeysRequest;
-import org.apache.hadoop.hive.metastore.api.SQLForeignKey;
-import org.apache.hadoop.hive.metastore.api.SQLPrimaryKey;
-import org.apache.hadoop.hive.metastore.api.ShowCompactResponse;
-import org.apache.hadoop.hive.metastore.api.ShowLocksRequest;
-import org.apache.hadoop.hive.metastore.api.ShowLocksResponse;
-import org.apache.hadoop.hive.metastore.api.TableMeta;
-import org.apache.hadoop.hive.metastore.api.TxnAbortedException;
-import org.apache.hadoop.hive.metastore.api.TxnOpenException;
-import org.apache.hadoop.hive.metastore.api.UnknownDBException;
-import org.apache.hadoop.hive.metastore.api.UnknownPartitionException;
-import org.apache.hadoop.hive.metastore.api.UnknownTableException;
-import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
-import org.apache.hadoop.hive.metastore.api.CompactionResponse;
+import org.apache.hadoop.hive.metastore.api.*;
 import org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy;
 import org.apache.log4j.Logger;
 import org.apache.thrift.TException;
@@ -1250,6 +1203,12 @@ public List<String> listPartitionNames(String databaseName, String tableName,
     return glueMetastoreClientDelegate.listPartitionNames(databaseName, tableName, values, max);
   }
 
+  @Override
+  public PartitionValuesResponse listPartitionValues(PartitionValuesRequest partitionValuesRequest)
+      throws MetaException, TException, NoSuchObjectException {
+    throw new UnsupportedOperationException("listPartitionValues is not supported");
+  }
+
   @Override
   public int getNumPartitionsByFilter(String dbName, String tableName, String filter)
       throws MetaException, NoSuchObjectException, TException {
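Note: both the Hive 2 and Spark clients stub out `listPartitionValues`. A minimal fallback sketch for callers, assuming only the stock Hive 2.3 `IMetaStoreClient` surface (the class and method names here are illustrative, not part of the patch):

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.thrift.TException;

// Derives partition values from listPartitionNames, which this client does
// implement. Values inside partition names are path-escaped, so a production
// version should unescape them (e.g. with Hive's FileUtils helpers).
public final class PartitionValuesFallback {

  public static List<List<String>> partitionValues(IMetaStoreClient client, String db, String table)
      throws TException {
    List<List<String>> result = new ArrayList<>();
    for (String name : client.listPartitionNames(db, table, (short) -1)) {
      // A name like "ds=2022-06-13/hr=00" carries one value per '/' segment.
      List<String> values = new ArrayList<>();
      for (String kv : name.split("/")) {
        values.add(kv.substring(kv.indexOf('=') + 1));
      }
      result.add(values);
    }
    return result;
  }
}
```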
diff --git a/aws-glue-datacatalog-spark-client/pom.xml b/aws-glue-datacatalog-spark-client/pom.xml
index ed8c8103..043c009e 100644
--- a/aws-glue-datacatalog-spark-client/pom.xml
+++ b/aws-glue-datacatalog-spark-client/pom.xml
@@ -13,13 +13,13 @@
     <artifactId>aws-glue-datacatalog-spark-client</artifactId>

     <dependencies>
         <dependency>
-            <groupId>org.spark-project.hive</groupId>
+            <groupId>org.apache.hive</groupId>
             <artifactId>hive-metastore</artifactId>
             <version>${spark-hive.version}</version>
             <scope>provided</scope>
         </dependency>
         <dependency>
-            <groupId>org.spark-project.hive</groupId>
+            <groupId>org.apache.hive</groupId>
             <artifactId>hive-exec</artifactId>
             <version>${spark-hive.version}</version>
             <scope>provided</scope>

diff --git a/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java b/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java
index 34590b34..ab29cd96 100644
--- a/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java
+++ b/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java
@@ -20,7 +20,6 @@
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
-
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -30,60 +29,16 @@
 import org.apache.hadoop.hive.common.ValidTxnList;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
-import org.apache.hadoop.hive.metastore.HiveMetaHookLoader;
-import org.apache.hadoop.hive.metastore.IMetaStoreClient;
-import org.apache.hadoop.hive.metastore.MetaStoreUtils;
-import org.apache.hadoop.hive.metastore.PartitionDropOptions;
-import org.apache.hadoop.hive.metastore.TableType;
-import org.apache.hadoop.hive.metastore.Warehouse;
-import org.apache.hadoop.hive.metastore.api.AggrStats;
-import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
-import org.apache.hadoop.hive.metastore.api.CompactionType;
-import org.apache.hadoop.hive.metastore.api.ConfigValSecurityException;
-import org.apache.hadoop.hive.metastore.api.CurrentNotificationEventId;
-import org.apache.hadoop.hive.metastore.api.Database;
-import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
-import org.apache.hadoop.hive.metastore.api.FieldSchema;
-import org.apache.hadoop.hive.metastore.api.FireEventRequest;
-import org.apache.hadoop.hive.metastore.api.FireEventResponse;
-import org.apache.hadoop.hive.metastore.api.GetOpenTxnsInfoResponse;
-import org.apache.hadoop.hive.metastore.api.GetRoleGrantsForPrincipalRequest;
-import org.apache.hadoop.hive.metastore.api.GetRoleGrantsForPrincipalResponse;
-import org.apache.hadoop.hive.metastore.api.HeartbeatTxnRangeResponse;
-import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege;
-import org.apache.hadoop.hive.metastore.api.HiveObjectRef;
-import org.apache.hadoop.hive.metastore.api.HiveObjectType;
-import org.apache.hadoop.hive.metastore.api.Index;
-import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
-import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
-import org.apache.hadoop.hive.metastore.api.InvalidPartitionException;
-import org.apache.hadoop.hive.metastore.api.LockRequest;
-import org.apache.hadoop.hive.metastore.api.LockResponse;
-import org.apache.hadoop.hive.metastore.api.MetaException;
-import org.apache.hadoop.hive.metastore.api.NoSuchLockException;
-import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
-import org.apache.hadoop.hive.metastore.api.NoSuchTxnException;
-import org.apache.hadoop.hive.metastore.api.NotificationEventResponse;
-import org.apache.hadoop.hive.metastore.api.OpenTxnsResponse;
-import org.apache.hadoop.hive.metastore.api.PartitionEventType;
-import org.apache.hadoop.hive.metastore.api.ShowCompactResponse;
-import org.apache.hadoop.hive.metastore.api.ShowLocksResponse;
-import org.apache.hadoop.hive.metastore.api.TxnAbortedException;
-import org.apache.hadoop.hive.metastore.api.TxnOpenException;
-import org.apache.hadoop.hive.metastore.api.UnknownDBException;
-import org.apache.hadoop.hive.metastore.api.UnknownPartitionException;
-import org.apache.hadoop.hive.metastore.api.UnknownTableException;
-import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.metastore.*;
+import org.apache.hadoop.hive.metastore.api.*;
 import org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy;
 import org.apache.log4j.Logger;
 import org.apache.thrift.TException;
 
 import java.io.IOException;
 import java.net.URI;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.nio.ByteBuffer;
+import java.util.*;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -366,6 +321,27 @@ public void alter_partition(String dbName, String tblName, org.apache.hadoop.hiv
     glueMetastoreClientDelegate.alterPartitions(dbName, tblName, Lists.newArrayList(partition));
   }
 
+  @Override
+  public void alter_partition(
+      String dbName,
+      String tblName,
+      org.apache.hadoop.hive.metastore.api.Partition partition,
+      EnvironmentContext environmentContext
+  ) throws InvalidOperationException, MetaException, TException {
+    List<org.apache.hadoop.hive.metastore.api.Partition> p = new ArrayList<>();
+    p.add(partition);
+    glueMetastoreClientDelegate.alterPartitions(dbName, tblName, p);
+  }
+
+  @Override
+  public void alter_partitions(
+      String dbName, String tblName,
+      List<org.apache.hadoop.hive.metastore.api.Partition> partitions,
+      EnvironmentContext environmentContext
+  ) throws InvalidOperationException, MetaException, TException {
+    glueMetastoreClientDelegate.alterPartitions(dbName, tblName, partitions);
+  }
+
   @Override
   public void alter_partitions(String dbName, String tblName, List<org.apache.hadoop.hive.metastore.api.Partition> partitions)
       throws InvalidOperationException, MetaException, TException {
@@ -396,6 +372,16 @@ public void alter_table(
     glueMetastoreClientDelegate.alterTable(dbName, tblName, table, environmentContext);
   }
 
+  @Override
+  public void alter_table_with_environmentContext(
+      String dbName,
+      String tblName,
+      org.apache.hadoop.hive.metastore.api.Table table,
+      EnvironmentContext environmentContext
+  ) throws InvalidOperationException, MetaException, TException {
+    glueMetastoreClientDelegate.alterTable(dbName, tblName, table, environmentContext);
+  }
+
   @Override
   public org.apache.hadoop.hive.metastore.api.Partition appendPartition(
       String dbName,
@@ -477,6 +463,46 @@ public String getTokenStrForm() throws IOException {
     return glueMetastoreClientDelegate.getTokenStrForm();
   }
 
+  @Override
+  public boolean addToken(String tokenIdentifier, String delegationToken) throws TException {
+    return glueMetastoreClientDelegate.addToken(tokenIdentifier, delegationToken);
+  }
+
+  @Override
+  public boolean removeToken(String tokenIdentifier) throws TException {
+    return glueMetastoreClientDelegate.removeToken(tokenIdentifier);
+  }
+
+  @Override
+  public String getToken(String tokenIdentifier) throws TException {
+    return glueMetastoreClientDelegate.getToken(tokenIdentifier);
+  }
+
+  @Override
+  public List<String> getAllTokenIdentifiers() throws TException {
+    return glueMetastoreClientDelegate.getAllTokenIdentifiers();
+  }
+
+  @Override
+  public int addMasterKey(String key) throws MetaException, TException {
+    return glueMetastoreClientDelegate.addMasterKey(key);
+  }
+
+  @Override
+  public void updateMasterKey(Integer seqNo, String key) throws NoSuchObjectException, MetaException, TException {
+    glueMetastoreClientDelegate.updateMasterKey(seqNo, key);
+  }
+
+  @Override
+  public boolean removeMasterKey(Integer keySeq) throws TException {
+    return glueMetastoreClientDelegate.removeMasterKey(keySeq);
+  }
+
+  @Override
+  public String[] getMasterKeys() throws TException {
+    return glueMetastoreClientDelegate.getMasterKeys();
+  }
+
   @Override
   public LockResponse checkLock(long lockId)
       throws NoSuchTxnException, TxnAbortedException, NoSuchLockException, TException {
@@ -493,6 +519,11 @@ public void commitTxn(long txnId) throws NoSuchTxnException, TxnAbortedException
     glueMetastoreClientDelegate.commitTxn(txnId);
   }
 
+  @Override
+  public void abortTxns(List<Long> txnIds) throws TException {
+    glueMetastoreClientDelegate.abortTxns(txnIds);
+  }
+
   @Override
   public void compact(
       String dbName,
@@ -503,6 +534,30 @@ public void compact(
     glueMetastoreClientDelegate.compact(dbName, tblName, partitionName, compactionType);
   }
 
+  @Override
+  public void compact(
+      String dbName,
+      String tblName,
+      String partitionName,
+      CompactionType compactionType,
+      Map<String, String> tblProperties
+  ) throws TException {
+    glueMetastoreClientDelegate.compact(dbName, tblName, partitionName, compactionType, tblProperties);
+  }
+
+  @Override
+  public CompactionResponse compact2(
+      String dbName,
+      String tblName,
+      String partitionName,
+      CompactionType compactionType,
+      Map<String, String> tblProperties
+  ) throws TException {
+    return glueMetastoreClientDelegate.compact2(
+        dbName, tblName, partitionName, compactionType, tblProperties);
+  }
+
   @Override
   public void createFunction(org.apache.hadoop.hive.metastore.api.Function function) throws InvalidObjectException, MetaException, TException {
     glueMetastoreClientDelegate.createFunction(function);
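For context, a hedged example of how a caller drives the new compaction overloads through the stock Hive 2.3 `IMetaStoreClient` contract. The database, table, partition, and property values are illustrative, and whether the Glue-backed delegate actually services compaction depends on the backend:

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.CompactionResponse;
import org.apache.hadoop.hive.metastore.api.CompactionType;
import org.apache.thrift.TException;

// Requests a major compaction and inspects the thrift CompactionResponse.
public final class CompactionExample {

  public static void requestMajorCompaction(IMetaStoreClient client) throws TException {
    Map<String, String> props = new HashMap<>();
    // "compactor."-prefixed table properties override compactor job settings
    // in stock Hive; this particular key/value is just an illustration.
    props.put("compactor.mapreduce.map.memory.mb", "2048");

    CompactionResponse resp = client.compact2("sales_db", "orders", "ds=2022-06-13",
        CompactionType.MAJOR, props);
    System.out.println("Compaction id=" + resp.getId()
        + " state=" + resp.getState()
        + " accepted=" + resp.isAccepted());
  }
}
```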
@@ -623,6 +678,17 @@ public boolean dropPartition(String dbName, String tblName, List<String> values,
     return glueMetastoreClientDelegate.dropPartition(dbName, tblName, values, options.ifExists, options.deleteData, options.purgeData);
   }
 
+  @Override
+  public List<org.apache.hadoop.hive.metastore.api.Partition> dropPartitions(
+      String dbName,
+      String tblName,
+      List<ObjectPair<Integer, byte[]>> partExprs,
+      boolean deleteData,
+      boolean ifExists
+  ) throws NoSuchObjectException, MetaException, TException {
+    throw new UnsupportedOperationException("dropPartitions is not supported");
+  }
+
   @Override
   public boolean dropPartition(String dbName, String tblName, String partitionName, boolean deleteData)
       throws NoSuchObjectException, MetaException, TException {
@@ -638,18 +704,6 @@ public List<org.apache.hadoop.hive.metastore.api.Partition> dropPartitions(
     return dropPartitions_core(dbName, tblName, partExprs, deleteData, false);
   }
 
-  @Override
-  public List<org.apache.hadoop.hive.metastore.api.Partition> dropPartitions(
-      String dbName,
-      String tblName, List<ObjectPair<Integer, byte[]>> partExprs,
-      boolean deleteData,
-      boolean ignoreProtection,
-      boolean ifExists,
-      boolean needResult)
-      throws NoSuchObjectException, MetaException, TException {
-    return dropPartitions_core(dbName, tblName, partExprs, deleteData, false);
-  }
-
   @Override
   public List<org.apache.hadoop.hive.metastore.api.Partition> dropPartitions(
       String dbName,
@@ -807,6 +861,18 @@ public org.apache.hadoop.hive.metastore.api.Partition exchange_partition(
     return glueMetastoreClientDelegate.exchangePartition(partitionSpecs, srcDb, srcTbl, dstDb, dstTbl);
   }
 
+  // https://github.com/ismailsimsek/aws-glue-data-catalog-client-for-apache-hive-metastore/blob/1f819a8502c388c0717c1b353c9e8216a03f1ce8/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClient.java#L834
+  @Override
+  public List<org.apache.hadoop.hive.metastore.api.Partition> exchange_partitions(
+      Map<String, String> partitionSpecs,
+      String sourceDb, String sourceTable, String destdb,
+      String destTableName
+  ) throws MetaException, NoSuchObjectException, InvalidObjectException, TException {
+    List<org.apache.hadoop.hive.metastore.api.Partition> p = new ArrayList<>();
+    p.add(glueMetastoreClientDelegate.exchangePartition(partitionSpecs, sourceDb, sourceTable, destdb, destTableName));
+    return p;
+  }
 
   @Override
   public AggrStats getAggrColStatsFor(String dbName, String tblName, List<String> colNames, List<String> partName)
       throws NoSuchObjectException, MetaException, TException {
@@ -868,6 +934,11 @@ public List<String> getFunctions(String dbName, String pattern) throws MetaException {
     return glueMetastoreClientDelegate.getFunctions(dbName, pattern);
   }
 
+  @Override
+  public GetAllFunctionsResponse getAllFunctions() throws MetaException, TException {
+    throw new UnsupportedOperationException("getAllFunctions is not supported");
+  }
+
   @Override
   public Index getIndex(String dbName, String tblName, String indexName)
       throws MetaException, UnknownTableException, NoSuchObjectException, TException {
@@ -983,6 +1054,16 @@ public List<String> getTables(String dbname, String tablePattern) throws MetaException {
     return glueMetastoreClientDelegate.getTables(dbname, tablePattern);
   }
 
+  @Override
+  public List<String> getTables(String dbname, String tablePattern, TableType tableType) throws MetaException, TException, UnknownDBException {
+    return glueMetastoreClientDelegate.getTables(dbname, tablePattern, tableType);
+  }
+
+  @Override
+  public List<TableMeta> getTableMeta(String dbPatterns, String tablePatterns, List<String> tableTypes) throws MetaException, TException, UnknownDBException {
+    return glueMetastoreClientDelegate.getTableMeta(dbPatterns, tablePatterns, tableTypes);
+  }
+
   @Override
   public ValidTxnList getValidTxns() throws TException {
     return glueMetastoreClientDelegate.getValidTxns();
@@ -1104,6 +1185,18 @@ public List<String> listPartitionNames(String databaseName, String tableName,
     return glueMetastoreClientDelegate.listPartitionNames(databaseName, tableName, values, max);
   }
 
+  @Override
+  public PartitionValuesResponse listPartitionValues(PartitionValuesRequest partitionValuesRequest)
+      throws MetaException, TException, NoSuchObjectException {
+    throw new UnsupportedOperationException("listPartitionValues is not supported");
+  }
+
+  @Override
+  public int getNumPartitionsByFilter(String dbName, String tableName, String filter)
+      throws MetaException, NoSuchObjectException, TException {
+    return glueMetastoreClientDelegate.getNumPartitionsByFilter(dbName, tableName, filter);
+  }
+
   @Override
   public PartitionSpecProxy listPartitionSpecs(String dbName, String tblName, int max) throws TException {
     return glueMetastoreClientDelegate.listPartitionSpecs(dbName, tblName, max);
@@ -1420,6 +1513,86 @@ public boolean setPartitionColumnStatistics(org.apache.hadoop.hive.metastore.api
     return glueMetastoreClientDelegate.setPartitionColumnStatistics(request);
   }
 
+  @Override
+  public boolean isLocalMetaStore() {
+    return false;
+  }
+
+  @Override
+  public void flushCache() {
+    throw new UnsupportedOperationException("flushCache is not supported");
+  }
+
+  @Override
+  public Iterable<Map.Entry<Long, ByteBuffer>> getFileMetadata(List<Long> fileIds) throws TException {
+    return glueMetastoreClientDelegate.getFileMetadata(fileIds);
+  }
+
+  @Override
+  public Iterable<Map.Entry<Long, MetadataPpdResult>> getFileMetadataBySarg(
+      List<Long> fileIds,
+      ByteBuffer sarg,
+      boolean doGetFooters
+  ) throws TException {
+    return glueMetastoreClientDelegate.getFileMetadataBySarg(fileIds, sarg, doGetFooters);
+  }
+
+  @Override
+  public void clearFileMetadata(List<Long> fileIds) throws TException {
+    glueMetastoreClientDelegate.clearFileMetadata(fileIds);
+  }
+
+  @Override
+  public void putFileMetadata(List<Long> fileIds, List<ByteBuffer> metadata) throws TException {
+    glueMetastoreClientDelegate.putFileMetadata(fileIds, metadata);
+  }
+
+  @Override
+  public boolean isSameConfObj(HiveConf hiveConf) {
+    throw new UnsupportedOperationException("isSameConfObj is not supported");
+  }
+
+  @Override
+  public boolean cacheFileMetadata(
+      String dbName,
+      String tblName,
+      String partName,
+      boolean allParts) throws TException {
+    return glueMetastoreClientDelegate.cacheFileMetadata(dbName, tblName, partName, allParts);
+  }
+
+  @Override
+  public List<SQLPrimaryKey> getPrimaryKeys(PrimaryKeysRequest primaryKeysRequest) throws MetaException, NoSuchObjectException, TException {
+    throw new UnsupportedOperationException("getPrimaryKeys is unsupported");
+  }
+
+  @Override
+  public List<SQLForeignKey> getForeignKeys(ForeignKeysRequest foreignKeysRequest) throws MetaException, NoSuchObjectException, TException {
+    throw new UnsupportedOperationException("getForeignKeys is unsupported");
+  }
+
+  @Override
+  public void createTableWithConstraints(org.apache.hadoop.hive.metastore.api.Table table, List<SQLPrimaryKey> primaryKeys, List<SQLForeignKey> foreignKeys)
+      throws AlreadyExistsException, InvalidObjectException, MetaException, NoSuchObjectException, TException {
+    this.createTable(table);
+    glueMetastoreClientDelegate.createTableWithConstraints(table, primaryKeys, foreignKeys);
+  }
+
+  @Override
+  public void dropConstraint(String dbName, String tblName, String constraintName) throws MetaException, NoSuchObjectException, TException {
+    glueMetastoreClientDelegate.dropConstraint(dbName, tblName, constraintName);
+  }
+
+  @Override
+  public void addPrimaryKey(List<SQLPrimaryKey> primaryKeyCols) throws MetaException, NoSuchObjectException, TException {
+    glueMetastoreClientDelegate.addPrimaryKey(primaryKeyCols);
+  }
+
+  @Override
+  public void addForeignKey(List<SQLForeignKey> foreignKeyCols) throws MetaException, NoSuchObjectException, TException {
+    glueMetastoreClientDelegate.addForeignKey(foreignKeyCols);
+  }
+
   @Override
   public ShowCompactResponse showCompactions() throws TException {
     return glueMetastoreClientDelegate.showCompactions();
@@ -1435,6 +1608,22 @@ public void addDynamicPartitions(
     glueMetastoreClientDelegate.addDynamicPartitions(txnId, dbName, tblName, partNames);
   }
 
+  @Override
+  public void addDynamicPartitions(
+      long txnId,
+      String dbName,
+      String tblName,
+      List<String> partNames,
+      DataOperationType operationType
+  ) throws TException {
+    glueMetastoreClientDelegate.addDynamicPartitions(txnId, dbName, tblName, partNames, operationType);
+  }
+
+  @Override
+  public void insertTable(org.apache.hadoop.hive.metastore.api.Table table, boolean overwrite) throws MetaException {
+    glueMetastoreClientDelegate.insertTable(table, overwrite);
+  }
+
   @Override
   public NotificationEventResponse getNextNotification(
       long lastEventId, int maxEvents, NotificationFilter notificationFilter) throws TException {
@@ -1456,6 +1645,11 @@ public ShowLocksResponse showLocks() throws TException {
     return glueMetastoreClientDelegate.showLocks();
   }
 
+  @Override
+  public ShowLocksResponse showLocks(ShowLocksRequest showLocksRequest) throws TException {
+    return glueMetastoreClientDelegate.showLocks(showLocksRequest);
+  }
+
   @Override
   public GetOpenTxnsInfoResponse showTxns() throws TException {
     return glueMetastoreClientDelegate.showTxns();

diff --git a/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSGlueDataCatalogHiveClientFactory.java b/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSGlueDataCatalogHiveClientFactory.java
index 713a78ad..29f225a7 100644
--- a/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSGlueDataCatalogHiveClientFactory.java
+++ b/aws-glue-datacatalog-spark-client/src/main/java/com/amazonaws/glue/catalog/metastore/AWSGlueDataCatalogHiveClientFactory.java
@@ -6,15 +6,18 @@
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.ql.metadata.HiveMetaStoreClientFactory;
 
+import java.util.concurrent.ConcurrentHashMap;
+
 public class AWSGlueDataCatalogHiveClientFactory implements HiveMetaStoreClientFactory {
 
   @Override
   public IMetaStoreClient createMetaStoreClient(
       HiveConf conf,
-      HiveMetaHookLoader hookLoader
+      HiveMetaHookLoader hookLoader,
+      boolean allowEmbedded,
+      ConcurrentHashMap<String, Long> metaCallTimeMap
   ) throws MetaException {
-    AWSCatalogMetastoreClient client = new AWSCatalogMetastoreClient(conf, hookLoader);
-    return client;
+    return new AWSCatalogMetastoreClient(conf, hookLoader);
   }
 }
\ No newline at end of file
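The new factory signature matches the pluggable `HiveMetaStoreClientFactory` interface exposed by the patched Hive build used later in CI. A hedged wiring sketch; the property key follows the EMR-style convention and should be verified against that build:

```java
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;

import com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory;

// Sketch: obtain a Glue-backed client through the factory. The property key is
// the one EMR-style Hive forks consult (an assumption here); passing a null
// hook loader is illustrative only.
public final class GlueClientWiring {

  public static IMetaStoreClient newGlueClient() throws Exception {
    HiveConf conf = new HiveConf();
    conf.set("hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory");
    return new AWSGlueDataCatalogHiveClientFactory()
        .createMetaStoreClient(conf, null, false, new ConcurrentHashMap<>());
  }
}
```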
diff --git a/aws-glue-datacatalog-spark-client/src/test/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClientTest.java b/aws-glue-datacatalog-spark-client/src/test/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClientTest.java
index 62653f07..26cd82de 100644
--- a/aws-glue-datacatalog-spark-client/src/test/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClientTest.java
+++ b/aws-glue-datacatalog-spark-client/src/test/java/com/amazonaws/glue/catalog/metastore/AWSCatalogMetastoreClientTest.java
@@ -155,7 +155,8 @@ public void setUp() throws Exception {
     clientFactory = mock(GlueClientFactory.class);
     metastoreFactory = mock(AWSGlueMetastoreFactory.class);
     when(clientFactory.newClient()).thenReturn(glueClient);
-    when(metastoreFactory.newMetastore(conf)).thenReturn(new DefaultAWSGlueMetastore(conf, glueClient));
+    DefaultAWSGlueMetastore defaultMetastore = new DefaultAWSGlueMetastore(conf, glueClient);
+    when(metastoreFactory.newMetastore(conf)).thenReturn(defaultMetastore);
     metastoreClient = new AWSCatalogMetastoreClient.Builder().withClientFactory(clientFactory)
         .withMetastoreFactory(metastoreFactory).withWarehouse(wh).createDefaults(false).withHiveConf(conf).build();
   }

diff --git a/pom.xml b/pom.xml
index d32a20f5..1126424e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -19,9 +19,9 @@
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <guava.version>14.0.1</guava.version>
-        <hive2.version>2.3.3</hive2.version>
-        <spark-hive.version>1.2.1</spark-hive.version>
-        <aws.sdk.version>1.11.267</aws.sdk.version>
+        <hive2.version>2.3.7</hive2.version>
+        <spark-hive.version>2.3.7</spark-hive.version>
+        <aws.sdk.version>1.11.901</aws.sdk.version>
        <junit.version>4.11</junit.version>
        <mockito.version>1.10.19</mockito.version>

diff --git a/shims/spark-hive-shims/pom.xml b/shims/spark-hive-shims/pom.xml
index d88ee48f..91cf027b 100644
--- a/shims/spark-hive-shims/pom.xml
+++ b/shims/spark-hive-shims/pom.xml
@@ -16,13 +16,13 @@
     <dependencies>
         <dependency>
-            <groupId>org.spark-project.hive</groupId>
+            <groupId>org.apache.hive</groupId>
             <artifactId>hive-exec</artifactId>
             <version>${spark-hive.version}</version>
             <scope>provided</scope>
         </dependency>
         <dependency>
-            <groupId>org.spark-project.hive</groupId>
+            <groupId>org.apache.hive</groupId>
             <artifactId>hive-metastore</artifactId>
             <version>${spark-hive.version}</version>
             <scope>provided</scope>

diff --git a/shims/spark-hive-shims/src/main/java/com/amazonaws/glue/shims/AwsGlueSparkHiveShims.java b/shims/spark-hive-shims/src/main/java/com/amazonaws/glue/shims/AwsGlueSparkHiveShims.java
index 775b947f..44def06b 100644
--- a/shims/spark-hive-shims/src/main/java/com/amazonaws/glue/shims/AwsGlueSparkHiveShims.java
+++ b/shims/spark-hive-shims/src/main/java/com/amazonaws/glue/shims/AwsGlueSparkHiveShims.java
@@ -1,19 +1,28 @@
 package com.amazonaws.glue.shims;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.metastore.MetaStoreUtils;
 import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.Partition;
 import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
 import org.apache.hadoop.hive.metastore.Warehouse;
+import org.apache.log4j.Logger;
+
+import java.util.HashMap;
+import java.util.Map;
 
 class AwsGlueSparkHiveShims implements AwsGlueHiveShims {
 
+  private static final Logger logger = Logger.getLogger(AwsGlueSparkHiveShims.class);
+
   private static final String SPARK_HIVE_VERSION = "1.2.";
 
   static boolean supportsVersion(String version) {
@@ -22,22 +31,22 @@ static boolean supportsVersion(String version) {
 
   @Override
   public ExprNodeGenericFuncDesc getDeserializeExpression(byte[] exprBytes) {
-    return Utilities.deserializeExpressionFromKryo(exprBytes);
+    return SerializationUtilities.deserializeExpressionFromKryo(exprBytes);
   }
 
   @Override
   public byte[] getSerializeExpression(ExprNodeGenericFuncDesc expr) {
-    return Utilities.serializeExpressionToKryo(expr);
+    return SerializationUtilities.serializeExpressionToKryo(expr);
   }
 
   @Override
   public Path getDefaultTablePath(Database db, String tableName, Warehouse warehouse) throws MetaException {
-    return warehouse.getTablePath(db, tableName);
+    return warehouse.getDefaultTablePath(db, tableName);
   }
 
   @Override
   public boolean validateTableName(String name, Configuration conf) {
-    return MetaStoreUtils.validateName(name);
+    return MetaStoreUtils.validateName(name, conf);
   }
 
   @Override
@@ -47,7 +56,7 @@ public boolean requireCalStats(
       Partition newPart,
       Table tbl,
       EnvironmentContext environmentContext) {
-    return MetaStoreUtils.requireCalStats(conf, oldPart, newPart, tbl);
+    return MetaStoreUtils.requireCalStats(conf, oldPart, newPart, tbl, environmentContext);
   }
 
   @Override
@@ -59,7 +68,66 @@ public boolean updateTableStatsFast(
       boolean forceRecompute,
       EnvironmentContext environmentContext
   ) throws MetaException {
-    return MetaStoreUtils.updateUnpartitionedTableStatsFast(db, tbl, wh, madeDir, forceRecompute);
+    return updateUnpartitionedTableStatsFast(db, tbl, wh, madeDir, forceRecompute);
   }
 
+  /**
+   * Ref: https://github.com/facebookarchive/swift-hive-metastore/blob/9c89d3ce58cfdb3b13e4f5fde8ec28a701ef6885/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java#L1295
+   *
+   * Updates the numFiles and totalSize parameters for the passed unpartitioned Table by querying
+   * the warehouse if the passed Table does not already have values for these parameters.
+   *
+   * @param db the database the table belongs to
+   * @param tbl the unpartitioned table whose quick stats should be refreshed
+   * @param wh warehouse used to list the table's files
+   * @param madeDir if true, the directory was just created and can be assumed to be empty
+   * @param forceRecompute recompute stats even if the passed Table already has these parameters set
+   * @return true if the stats were updated, false otherwise
+   */
+  public static boolean updateUnpartitionedTableStatsFast(
+      Database db,
+      Table tbl,
+      Warehouse wh,
+      boolean madeDir,
+      boolean forceRecompute
+  ) throws MetaException {
+    FileStatus[] fileStatus = wh.getFileStatusesForUnpartitionedTable(db, tbl);
+    Map<String, String> params = tbl.getParameters();
+
+    if ((params != null) && params.containsKey(StatsSetupConst.DO_NOT_UPDATE_STATS)) {
+      boolean doNotUpdateStats = Boolean.parseBoolean(params.get(StatsSetupConst.DO_NOT_UPDATE_STATS));
+      params.remove(StatsSetupConst.DO_NOT_UPDATE_STATS);
+      tbl.setParameters(params); // to make sure we remove this marker property
+      if (doNotUpdateStats) {
+        return false;
+      }
+    }
+
+    boolean updated = false;
+    if (forceRecompute || params == null || !MetaStoreUtils.containsAllFastStats(params)) {
+      if (params == null) {
+        params = new HashMap<>();
+      }
+      if (!madeDir) {
+        // The table location already exists and may contain data.
+        // Let's try to populate those stats that don't require a full scan.
+        logger.info("Updating table stats fast for " + tbl.getTableName());
+        MetaStoreUtils.populateQuickStats(fileStatus, params);
+        logger.info("Updated size of table " + tbl.getTableName() + " to " + params.get(StatsSetupConst.TOTAL_SIZE));
+        if (!params.containsKey(StatsSetupConst.STATS_GENERATED)) {
+          // invalidate stats requiring scan since this is a regular ddl alter case
+          for (String stat : StatsSetupConst.statsRequireCompute) {
+            params.put(stat, "-1");
+          }
+          params.put(StatsSetupConst.COLUMN_STATS_ACCURATE, StatsSetupConst.FALSE);
+        } else {
+          params.remove(StatsSetupConst.STATS_GENERATED);
+          params.put(StatsSetupConst.COLUMN_STATS_ACCURATE, StatsSetupConst.TRUE);
+        }
+      }
+      tbl.setParameters(params);
+      updated = true;
+    }
+    return updated;
+  }
 }
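Callers can bypass the fast-path entirely via the `DO_NOT_UPDATE_STATS` marker the method consumes above. A small illustrative sketch, assuming Hive 2.3's `StatsSetupConst` (the class and method names are not part of the patch):

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.metastore.api.Table;

// Marks a table so the next ALTER skips the quick-stats update: the shim reads
// the marker, removes it, and returns without touching numFiles/totalSize.
public final class SkipStatsExample {

  public static void markSkipStats(Table tbl) {
    Map<String, String> params =
        tbl.getParameters() != null ? tbl.getParameters() : new HashMap<>();
    params.put(StatsSetupConst.DO_NOT_UPDATE_STATS, "true");
    tbl.setParameters(params);
  }
}
```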
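The reworked `dropPartition` overload turns a partition name into its value list before delegating. A hedged illustration using Hive 2.3's `Warehouse` utility, assuming `makeValsFromName` behaves as in stock Hive (where `partitionNameToVals` performs the equivalent conversion):

```java
import java.util.ArrayList;

import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.MetaException;

// Each "key=value" segment of the partition name contributes one value,
// in partition-key order.
public final class PartitionNameToValsExample {

  public static void main(String[] args) throws MetaException {
    ArrayList<String> vals = Warehouse.makeValsFromName("ds=2022-06-13/hr=00", null);
    System.out.println(vals); // expected: [2022-06-13, 00]
  }
}
```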
From 1a809bbde419fd2d078333f3f80f67f5f0f02b36 Mon Sep 17 00:00:00 2001
From: Khwunchai Jaengsawang
Date: Sat, 19 Nov 2022 16:05:17 +0700
Subject: [PATCH 3/3] Add CI

---
 .github/workflows/build.yaml               | 125 +++++++++++++++++++++
 docker/delta-hadoop-cloud-dependencies.xml |   0
 pom.xml                                    |   8 +-
 3 files changed, 129 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/build.yaml
 create mode 100644 docker/delta-hadoop-cloud-dependencies.xml

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
new file mode 100644
index 00000000..decee1d7
--- /dev/null
+++ b/.github/workflows/build.yaml
@@ -0,0 +1,125 @@
+name: core
+on:
+  push:
+
+env:
+  # Disable keepAlive and pooling
+  # https://github.com/actions/virtual-environments/issues/1499#issuecomment-689467080
+  MAVEN_OPTS: >-
+    -Xms1024M -Xmx2048M -XX:MaxMetaspaceSize=1024m -XX:-UseGCOverheadLimit
+    -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn
+    -Dhttp.keepAlive=false
+    -Dmaven.wagon.http.pool=false
+    -Dmaven.wagon.http.retryHandler.count=3
+  CONTAINER_REGISTRY: ghcr.io/khwj
+
+# Use the bash login shell, because we are using miniconda
+defaults:
+  run:
+    shell: bash -l {0}
+
+permissions:
+  contents: read  # to fetch code (actions/checkout)
+
+jobs:
+  build:
+    runs-on: ubuntu-20.04
+    permissions:
+      packages: write
+    strategy:
+      fail-fast: false
+      matrix:
+        hadoop: [spark3.3]
+    env:
+      SPARK_VERSION: 3.3.1
+      HADOOP_VERSION: 3.3.2
+      HIVE_VERSION: 2.3.9
+      HIVE_REF: rel/release-2.3.9-imetastore
+      SCALA_VERSION: 2.12
+      AWS_SDK_VERSION: 1.12.206
+    steps:
+      - name: Checkout Hive
+        uses: actions/checkout@v3
+        with:
+          repository: khwj/hive
+          ref: rel/release-2.3.9-imetastore
+          path: hive
+      # - name: Set up JDK 11
+      #   uses: actions/setup-java@v3
+      #   with:
+      #     java-version: '11'
+      #     distribution: 'adopt'
+      - name: Set up JDK 8
+        uses: actions/setup-java@v3
+        with:
+          java-version: "8"
+          distribution: "zulu"
+      - name: Cache local Maven repository
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/.m2/repository
+            !~/.m2/repository/org/apache/hive/
+            ~/.spark-dist
+            ~/.cache
+          key: ${{ runner.os }}-hive-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-hive-
+      - name: Build Hive
+        run: |
+          cd hive
+          mvn --batch-mode -DskipTests clean install
+      - name: Checkout Glue Data Catalog client
+        uses: actions/checkout@v3
+      - name: Build Glue Data Catalog client
+        run: |
+          mvn clean install package -DskipTests -Dhive2.version=$HIVE_VERSION -Dspark-hive.version=$HIVE_VERSION -Dhadoop.version=$HADOOP_VERSION -Daws.sdk.version=$AWS_SDK_VERSION
+          mkdir artifacts
+          find . -not -path "./spark/**" -not -path "./hive/**" -name "*.jar" -exec cp {} artifacts/ \;
+      - name: Archive Glue Data Catalog client binary
+        uses: actions/upload-artifact@v3
+        with:
+          name: aws-glue-datacatalog-hive2-client
+          path: |
+            artifacts/*.jar
+      - name: Checkout Spark
+        uses: actions/checkout@v3
+        with:
+          repository: apache/spark
+          ref: refs/tags/v3.3.1
+          path: spark
+      - name: Set up JDK 11
+        uses: actions/setup-java@v3
+        with:
+          java-version: "11"
+          distribution: "adopt"
+      - name: Build Spark
+        env:
+          MAVEN_OPTS: -Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g
+        run: |
+          cd spark
+          ./dev/make-distribution.sh --name hadoop3.2-glue-thriftserver -Dhadoop-3.2 -Phive -Phive-thriftserver -Pkubernetes
+      - name: Archive Spark binary
+        uses: actions/upload-artifact@v3
+        with:
+          name: spark-${{ env.SPARK_VERSION }}-bin-hadoop3.2-glue-thriftserver
+        path: |
+            spark/dist/*
+      - name: Log in to the Container registry
+        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
+        with:
+          registry: ${{ env.CONTAINER_REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          platforms: linux/amd64
+      - name: Build Spark container images
+        run: |
+          cp artifacts/*.jar spark/dist/jars/
+          cd spark/dist
+          ./bin/docker-image-tool.sh -nX -r $CONTAINER_REGISTRY -t ${SPARK_VERSION}-hadoop${HADOOP_VERSION}-glue-thriftserver build
+          ./bin/docker-image-tool.sh -nX -r $CONTAINER_REGISTRY -t ${SPARK_VERSION}-hadoop${HADOOP_VERSION}-glue-thriftserver -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile build
+          ./bin/docker-image-tool.sh -nX -r $CONTAINER_REGISTRY -t ${SPARK_VERSION}-hadoop${HADOOP_VERSION}-glue-thriftserver push

diff --git a/docker/delta-hadoop-cloud-dependencies.xml b/docker/delta-hadoop-cloud-dependencies.xml
new file mode 100644
index 00000000..e69de29b

diff --git a/pom.xml b/pom.xml
index 1126424e..450e4401 100644
--- a/pom.xml
+++ b/pom.xml
@@ -19,14 +19,14 @@
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <guava.version>14.0.1</guava.version>
-        <hive2.version>2.3.7</hive2.version>
-        <spark-hive.version>2.3.7</spark-hive.version>
-        <aws.sdk.version>1.11.901</aws.sdk.version>
+        <hive2.version>2.3.9</hive2.version>
+        <spark-hive.version>2.3.9</spark-hive.version>
+        <aws.sdk.version>1.12.206</aws.sdk.version>
        <junit.version>4.11</junit.version>
        <mockito.version>1.10.19</mockito.version>
-        <hadoop.version>2.8.3</hadoop.version>
+        <hadoop.version>2.10.1</hadoop.version>
        <hamcrest.version>1.3</hamcrest.version>
        <httpclient.version>4.5.3</httpclient.version>
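For completeness, a hedged smoke test for the resulting Spark build: with the client jars copied into `jars/` as the last workflow step does, a Hive-enabled session should resolve databases from the Glue Data Catalog. The factory property key mirrors the EMR-style convention and is an assumption, as are the class and app names:

```java
import org.apache.spark.sql.SparkSession;

// Minimal check that Spark's Hive catalog is served by Glue. Run with
// spark-submit against the image built by this workflow.
public final class GlueCatalogSmokeTest {

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("glue-catalog-smoke-test")
        // Assumed property key; verify against the patched Hive used in CI.
        .config("spark.hadoop.hive.metastore.client.factory.class",
            "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
        .enableHiveSupport()
        .getOrCreate();

    spark.sql("SHOW DATABASES").show(); // should list Glue databases
    spark.stop();
  }
}
```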