@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.common.schema;

import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;

/**
* A global cache for {@link HoodieSchema} instances that ensures only one canonical
* instance of each schema exists within the JVM lifetime.
*
* <p>The cache is global and lives for the whole JVM lifecycle; it maintains
* the collection of interned schema instances.
*
* <p>NOTE: Schemas that are used frequently should be interned through this cache.
*/
public class HoodieSchemaCache {

// Ensures only one canonical instance of each schema exists per JVM; weak values allow unused schemas to be garbage collected
private static final LoadingCache<HoodieSchema, HoodieSchema> SCHEMA_CACHE =
Caffeine.newBuilder().weakValues().maximumSize(1024).build(k -> k);

/**
* Gets the canonical schema instance from the global cache. If absent, the given schema is cached and returned.
*
* @param schema schema to intern
* @return the cached schema instance if one exists, otherwise the given schema itself
*/
public static HoodieSchema intern(HoodieSchema schema) {
return SCHEMA_CACHE.get(schema);
}
}
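For context, a minimal usage sketch of the cache. It assumes HoodieSchema.fromAvroSchema, which is used elsewhere in this PR, and relies on HoodieSchema implementing value-based equals/hashCode, which the keyed cache requires.

import org.apache.avro.Schema;
import org.apache.hudi.common.schema.HoodieSchema;
import org.apache.hudi.common.schema.HoodieSchemaCache;

public class HoodieSchemaCacheExample {

  private static final String SCHEMA_JSON =
      "{\"type\":\"record\",\"name\":\"Example\",\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}";

  public static void main(String[] args) {
    // Two structurally equal schemas built independently ...
    HoodieSchema first = HoodieSchema.fromAvroSchema(new Schema.Parser().parse(SCHEMA_JSON));
    HoodieSchema second = HoodieSchema.fromAvroSchema(new Schema.Parser().parse(SCHEMA_JSON));
    // ... intern to the same canonical instance, so identity comparison holds.
    System.out.println(HoodieSchemaCache.intern(first) == HoodieSchemaCache.intern(second)); // true
  }
}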
@@ -20,12 +20,12 @@

import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.schema.HoodieSchema;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.HadoopConfigurations;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;

import org.apache.avro.Schema;
import org.apache.flink.configuration.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -61,20 +61,20 @@ public static class Config {
.withDocumentation("The schema of the target you are writing to");
}

private final Schema sourceSchema;
private final HoodieSchema sourceSchema;

private Schema targetSchema;
private HoodieSchema targetSchema;

@Deprecated
public FilebasedSchemaProvider(TypedProperties props) {
checkRequiredConfigProperties(props, Collections.singletonList(Config.SOURCE_SCHEMA_FILE));
String sourceSchemaFile = getStringWithAltKeys(props, Config.SOURCE_SCHEMA_FILE);
FileSystem fs = HadoopFSUtils.getFs(sourceSchemaFile, HadoopConfigurations.getHadoopConf(new Configuration()));
try {
this.sourceSchema = new Schema.Parser().parse(fs.open(new Path(sourceSchemaFile)));
this.sourceSchema = new HoodieSchema.Parser().parse(fs.open(new Path(sourceSchemaFile)));

Contributor:
I know you did not set this up this way but currently this code is failing to close the input stream provided to the parsers. Let's make sure we close these while we're updating this code.
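A minimal sketch of that suggestion for the constructor above, assuming HoodieSchema.Parser#parse accepts an InputStream as the diff implies (plus an import for java.io.InputStream); the target-schema parse would be wrapped the same way.

try (InputStream in = fs.open(new Path(sourceSchemaFile))) {
  // try-with-resources guarantees the stream is closed even if parsing fails
  this.sourceSchema = new HoodieSchema.Parser().parse(in);
} catch (IOException ioe) {
  throw new HoodieIOException("Error reading schema", ioe);
}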

if (containsConfigProperty(props, Config.TARGET_SCHEMA_FILE)) {
this.targetSchema =
new Schema.Parser().parse(fs.open(new Path(getStringWithAltKeys(props, Config.TARGET_SCHEMA_FILE))));
new HoodieSchema.Parser().parse(fs.open(new Path(getStringWithAltKeys(props, Config.TARGET_SCHEMA_FILE))));
}
} catch (IOException ioe) {
throw new HoodieIOException("Error reading schema", ioe);
@@ -85,19 +85,19 @@ public FilebasedSchemaProvider(Configuration conf) {
final String sourceSchemaPath = conf.get(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH);
final FileSystem fs = HadoopFSUtils.getFs(sourceSchemaPath, HadoopConfigurations.getHadoopConf(conf));
try {
this.sourceSchema = new Schema.Parser().parse(fs.open(new Path(sourceSchemaPath)));
this.sourceSchema = new HoodieSchema.Parser().parse(fs.open(new Path(sourceSchemaPath)));
} catch (IOException ioe) {
throw new HoodieIOException("Error reading schema", ioe);
}
}

@Override
public Schema getSourceSchema() {
public HoodieSchema getSourceSchema() {
return sourceSchema;
}

@Override
public Schema getTargetSchema() {
public HoodieSchema getTargetSchema() {
if (targetSchema != null) {
return targetSchema;
} else {
@@ -18,7 +18,7 @@

package org.apache.hudi.schema;

import org.apache.avro.Schema;
import org.apache.hudi.common.schema.HoodieSchema;

Contributor:
Based on discussion for the Hudi-Utilities SchemaProviders, we should preserve the existing APIs for now to ensure users have an easy upgrade path. See my suggestions on how to proceed on this other PR: #14382 (comment)


Member:
I've made changes based on the comments. You can reference that (please check whether it's correct).
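Purely as an illustration of the upgrade-path idea the contributor describes, one possible shape is sketched below; the method name getSourceHoodieSchema and the default-wrapping approach are hypothetical, and the linked PR discussion is authoritative.

import org.apache.avro.Schema;
import org.apache.hudi.common.schema.HoodieSchema;

import java.io.Serializable;

public abstract class SchemaProvider implements Serializable {

  private static final long serialVersionUID = 1L;

  // Existing Avro-based API preserved so current user implementations keep compiling.
  public abstract Schema getSourceSchema();

  // New HoodieSchema accessor layered on top; the name here is illustrative only.
  public HoodieSchema getSourceHoodieSchema() {
    return HoodieSchema.fromAvroSchema(getSourceSchema());
  }
}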


import java.io.Serializable;

@@ -29,9 +29,9 @@ public abstract class SchemaProvider implements Serializable {

private static final long serialVersionUID = 1L;

public abstract Schema getSourceSchema();
public abstract HoodieSchema getSourceSchema();

public Schema getTargetSchema() {
public HoodieSchema getTargetSchema() {
// by default, use source schema as target for hoodie table as well
return getSourceSchema();
}
@@ -20,11 +20,11 @@

import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.schema.HoodieSchema;
import org.apache.hudi.exception.HoodieIOException;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.avro.Schema;

import java.io.IOException;
import java.io.InputStream;
@@ -110,12 +110,12 @@ public SchemaRegistryProvider(TypedProperties props) {
checkRequiredConfigProperties(props, Collections.singletonList(Config.SRC_SCHEMA_REGISTRY_URL));
}

private Schema getSchema(String registryUrl) throws IOException {
return new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl));
private HoodieSchema getSchema(String registryUrl) throws IOException {
return new HoodieSchema.Parser().parse(fetchSchemaFromRegistry(registryUrl));
}

@Override
public Schema getSourceSchema() {
public HoodieSchema getSourceSchema() {
String registryUrl = getStringWithAltKeys(config, Config.SRC_SCHEMA_REGISTRY_URL);
try {
return getSchema(registryUrl);
@@ -125,7 +125,7 @@ public Schema getSourceSchema() {
}

@Override
public Schema getTargetSchema() {
public HoodieSchema getTargetSchema() {
String registryUrl = getStringWithAltKeys(config, Config.SRC_SCHEMA_REGISTRY_URL);
String targetRegistryUrl = getStringWithAltKeys(
config, Config.TARGET_SCHEMA_REGISTRY_URL, registryUrl);
@@ -22,6 +22,7 @@
import org.apache.hudi.client.model.HoodieFlinkInternalRow;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.schema.HoodieSchema;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.read.HoodieFileGroupReader;
@@ -44,7 +45,6 @@
import org.apache.hudi.util.StreamerUtil;
import org.apache.hudi.utils.RuntimeContextUtils;

import org.apache.avro.Schema;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
@@ -217,7 +217,8 @@ protected void loadRecords(String partitionPath) throws Exception {
Option<HoodieInstant> latestCommitTime = commitsTimeline.filterCompletedAndCompactionInstants().lastInstant();

if (latestCommitTime.isPresent()) {
Schema schema = new TableSchemaResolver(this.hoodieTable.getMetaClient()).getTableAvroSchema();
HoodieSchema schema = HoodieSchema.fromAvroSchema(
new TableSchemaResolver(this.hoodieTable.getMetaClient()).getTableAvroSchema());

List<FileSlice> fileSlices = this.hoodieTable.getSliceView()
.getLatestMergedFileSlicesBeforeOrOn(partitionPath, latestCommitTime.get().requestedTime())
@@ -250,7 +251,7 @@ protected void loadRecords(String partitionPath) throws Exception {
*
* @return A record key iterator for the file slice.
*/
private ClosableIterator<String> getRecordKeyIterator(FileSlice fileSlice, Schema tableSchema) throws IOException {
private ClosableIterator<String> getRecordKeyIterator(FileSlice fileSlice, HoodieSchema tableSchema) throws IOException {
FileSlice scanFileSlice = new FileSlice(fileSlice.getPartitionPath(), fileSlice.getBaseInstantTime(), fileSlice.getFileId());
// filter out the corrupted base file
fileSlice.getBaseFile().map(f -> isValidFile(f.getPathInfo()) ? f : null).ifPresent(scanFileSlice::setBaseFile);
@@ -20,7 +20,6 @@

import org.apache.hudi.adapter.MaskingOutputAdapter;
import org.apache.hudi.adapter.Utils;
import org.apache.hudi.avro.AvroSchemaUtils;
import org.apache.hudi.client.HoodieFlinkWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.utils.CloseableConcatenatingIterator;
@@ -32,6 +31,7 @@
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.schema.HoodieSchema;
import org.apache.hudi.common.schema.HoodieSchemaUtils;
import org.apache.hudi.common.table.read.HoodieFileGroupReader;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option;
@@ -59,7 +59,6 @@
import org.apache.hudi.util.FlinkWriteClients;
import org.apache.hudi.utils.RuntimeContextUtils;

import org.apache.avro.Schema;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.Gauge;
@@ -107,8 +106,8 @@ public class ClusteringOperator extends TableStreamOperator<ClusteringCommitEven
private int taskID;
private transient HoodieWriteConfig writeConfig;
private transient HoodieFlinkTable<?> table;
private transient Schema schema;
private transient Schema readerSchema;
private transient HoodieSchema schema;
private transient HoodieSchema readerSchema;
private transient HoodieFlinkWriteClient writeClient;
private transient StreamRecordCollector<ClusteringCommitEvent> collector;
private transient BinaryRowDataSerializer binarySerializer;
@@ -170,11 +169,12 @@ public void open() throws Exception {
this.writeClient = FlinkWriteClients.createWriteClient(conf, getRuntimeContext());
this.table = writeClient.getHoodieTable();

this.schema = AvroSchemaConverter.convertToSchema(rowType);
// TODO: introduce a HoodieSchemaConverter to convert the RowType directly to HoodieSchema
this.schema = HoodieSchema.fromAvroSchema(AvroSchemaConverter.convertToSchema(rowType));
// Since there are discrepancies between how Flink and Spark handle nullability of the primary key field,
// and some files may have been written by Spark, force the reader schema to be nullable so the clustering
// scan succeeds without schema validation exceptions.
this.readerSchema = AvroSchemaUtils.asNullable(schema);
this.readerSchema = HoodieSchemaUtils.createNullableSchema(schema);

this.binarySerializer = new BinaryRowDataSerializer(rowType.getFieldCount());
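For reference, a tiny sketch of what the TODO above could look like. The class name comes from the TODO itself, but its location, exact API, and import paths (taken from the surrounding Flink module) are assumptions; it simply wraps the RowType-to-HoodieSchema conversion currently done inline.

import org.apache.hudi.common.schema.HoodieSchema;
import org.apache.hudi.util.AvroSchemaConverter;

import org.apache.flink.table.types.logical.RowType;

public final class HoodieSchemaConverter {

  private HoodieSchemaConverter() {
  }

  // Converts a Flink RowType to a HoodieSchema, currently by round-tripping through Avro.
  public static HoodieSchema convertToSchema(RowType rowType) {
    return HoodieSchema.fromAvroSchema(AvroSchemaConverter.convertToSchema(rowType));
  }
}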

@@ -312,8 +312,7 @@ private Iterator<RowData> readRecordsForGroupBaseFiles(List<ClusteringOperation>
HoodieRowDataParquetReader fileReader = (HoodieRowDataParquetReader) fileReaderFactory.getFileReader(
table.getConfig(), new StoragePath(clusteringOp.getDataFilePath()));

//TODO boundary to revisit in later pr to use HoodieSchema directly
return new CloseableMappingIterator<>(fileReader.getRecordIterator(HoodieSchema.fromAvroSchema(readerSchema)), HoodieRecord::getData);
return new CloseableMappingIterator<>(fileReader.getRecordIterator(readerSchema), HoodieRecord::getData);
} catch (IOException e) {
throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath()
+ " and " + clusteringOp.getDeltaFilePaths(), e);
@@ -21,6 +21,7 @@
import org.apache.hudi.async.HoodieAsyncTableService;
import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.client.HoodieFlinkWriteClient;
import org.apache.hudi.common.schema.HoodieSchema;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -39,7 +40,6 @@
import org.apache.hudi.util.StreamerUtil;

import com.beust.jcommander.JCommander;
import org.apache.avro.Schema;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.client.deployment.application.ApplicationExecutionException;
@@ -319,8 +319,8 @@ private void cluster() throws Exception {
// Mark instant as clustering inflight
ClusteringUtils.transitionClusteringOrReplaceRequestedToInflight(instant, Option.empty(), table.getActiveTimeline());

final Schema tableAvroSchema = StreamerUtil.getTableAvroSchema(table.getMetaClient(), false);
final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
final HoodieSchema tableSchema = StreamerUtil.getTableAvroSchema(table.getMetaClient(), false);
final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableSchema.getAvroSchema());
final RowType rowType = (RowType) rowDataType.getLogicalType();

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
@@ -73,7 +73,7 @@ public static void main(String[] args) throws Exception {
Configuration conf = FlinkStreamerConfig.toFlinkConfig(cfg);
// Read from kafka source
RowType rowType =
(RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf))
(RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf).getAvroSchema())
.getLogicalType();

long ckpTimeout = env.getCheckpointConfig().getCheckpointTimeout();