apache · emkornfield · May 25, 2024 · May 27, 2024 · May 28, 2024 · May 28, 2024
diff --git a/README.md b/README.md
@@ -118,6 +118,51 @@ chunks they are interested in.  The columns chunks should then be read sequentia
 
  ![File Layout](https://raw.github.com/apache/parquet-format/master/doc/images/FileLayout.gif)
 
+ ### PAR3 File Footers
+
+ The PAR3 file footer format is designed to better support wider schemas and to give more control
+ over the various footer size vs compute trade-offs.  Its format is as follows:
+   - Serialized Thrift FileMetadata Structure
+   - (Optional) 4 byte CRC32 of the serialized Thrift FileMetadata.
+   - 4-byte length in bytes (little endian) of all preceding elements in the footer.
+   - 4-byte little-endian flag field to indicate features that require special parsing of the footer.
+     Readers MUST raise an error if there is an unrecognized flag.  Current flags:
+
+     * 0x01 - Footer encryption enabled (when set, the encryption information is written before the
+        FileMetaData structure, as in the PAR1 footer).
+     * 0x02 - CRC32 of FileMetadata Footer.
+
+   - 4-byte magic number "PAR3"
+
+  When parsing the footer, implementations SHOULD read at least the last 12 bytes of the footer, then
+  read in the entirety of the footer based on the length of all preceding elements. This prevents further
+  I/O cost for accessing metadata stored in the data pages. PAR3 footers can fully replace PAR1 footers.
+  If a file is written with only a PAR3 footer, implementations MUST write "PAR3" as the first four bytes of
+  the file. PAR3 footers can also be written in a backwards compatible way after PAR1 metadata
+  (see next section for details).
+
+  #### Dual Mode PAR1 and PAR3 footers
+
+  The following section defines a layout that allows PAR1
+  and PAR3 footers to co-exist in a single logical footer
+  while still allowing legacy readers to read files.
+
+  The layout consists of the following:
+    - Serialized PAR1 FileMetadata Thrift object
+    - PAR3 footer as described above
+    - 4 byte little-endian length in bytes of all 
+      preceding elements.
+    - 4-byte magic number "PAR1"
+
+  Readers aware of PAR3 can check for the "PAR3" magic number
+  beginning 12 bytes from the end of the file (this should
+  be unambiguous because Thrift serialization of structs
+  uses 0x00 as a field end delimiter).
+  (TODO: decide whether one of the alternatives of embedding
+  the footer as an unknown field in FileMetaData is desirable, as discussed in [Alkis's doc](https://docs.google.com/document/d/1PQpY418LkIDHMFYCY8ne_G-CFpThK15LLpzWYbc7rFU/edit))
+
+  When embedded into a PAR1 file no modification to the magic number at the beginning of the file is mandated.
+
 ## Metadata
 There are three types of metadata: file metadata, column (chunk) metadata and page
 header metadata.  All thrift structures are serialized using the TCompactProtocol.

diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
@@ -770,17 +770,31 @@ struct PageEncodingStats {
 
 /**
  * Description for column metadata
+ * Next-Id: 20
  */
 struct ColumnMetaData {
-  /** Type of this column **/
-  1: required Type type
+  /** Type of this column 
+    * 
+    * Available from schema via efficient lookup with schema_index.
+    * 
+    * PAR1: Required.
+    * PAR3: Don't populate.
+    **/
+  1: optional Type type
 
   /** Set of all encodings used for this column. The purpose is to validate
-   * whether we can decode those pages. **/
-  2: required list<Encoding> encodings
+   * whether we can decode those pages. 
+   * 
+   * PAR1: Required. 
+   * PAR3: Don't populate (redundant with column page stats).
+   **/
+  2: optional list<Encoding> encodings
 
-  /** Path in schema **/
-  3: required list<string> path_in_schema
+  /** Path in schema 
+    *  PAR1 Footer: Required.
+    *  PAR3 Footer: Deprecated (don't populate). Can be inferred from schema element.
+    */
+  3: optional list<string> path_in_schema
 
   /** Compression codec **/
   4: required CompressionCodec codec
@@ -792,12 +806,23 @@ struct ColumnMetaData {
   6: required i64 total_uncompressed_size
 
   /** total byte size of all compressed, and potentially encrypted, pages 
-   *  in this column chunk (including the headers) **/
+   *  in this column chunk (including the headers) 
+   * 
+   *  Fetching total_compressed_size bytes starting at
+   *  min(dictionary_page_offset, data_page_offset) should fetch all data
+   *  in the given column chunk.
+   */
   7: required i64 total_compressed_size
 
-  /** Optional key/value metadata **/
+  /** Optional key/value metadata 
+    * PAR1: Optional. 
+    * PAR3: Don't write; use key_value_metadata_page instead.
+    **/
   8: optional list<KeyValue> key_value_metadata
 
+  /** See description on FileMetaData.key_value_metadata_page **/
+  19: optional MetadataPage key_value_metadata_page
+
   /** Byte offset from beginning of file to first data page **/
   9: required i64 data_page_offset
 
@@ -812,8 +837,20 @@ struct ColumnMetaData {
 
   /** Set of all encodings used for pages in this column chunk.
    * This information can be used to determine if all data pages are
-   * dictionary encoded for example **/
+   * dictionary encoded for example  
+   *
+   *  PAR1: Optional. May be deprecated in a future release in favor of
+   *        serialized_encoding_stats.
+   *  PAR3: Don't populate.  Write serialized_encoding_stats.
+   **/
   13: optional list<PageEncodingStats> encoding_stats;
+  /** 
+    * Serialized page encoding stats.
+    *
+    * PAR1: Start populating after encoding_stats is deprecated.
+    * PAR3: Populate instead of encoding_stats.
+    */
+  17: optional binary serialized_encoding_stats
 
   /** Byte offset from beginning of file to Bloom filter data. **/
   14: optional i64 bloom_filter_offset;
@@ -831,8 +868,13 @@ struct ColumnMetaData {
    * representations. The histograms contained in these statistics can
    * also be useful in some cases for more fine-grained nullability/list length
    * filter pushdown.
+   * 
+   *  PAR1: Optional.
+   *  PAR3: Populate serialized_size_statistics.
    */
   16: optional SizeStatistics size_statistics;
+  /** Thrift serialized SizeStatistics **/
+  18: optional binary serialized_size_statistics; 
 }
 
 struct EncryptionWithFooterKey {
@@ -854,6 +896,9 @@ union ColumnCryptoMetaData {
 struct ColumnChunk {
   /** File where column data is stored.  If not set, assumed to be same file as
     * metadata.  This path is relative to the current file.
+    *
+    * DEPRECATED. The one known use-case for this is metadata cache files.
+    * These have been superseded by open source table formats; prefer those.
     **/
   1: optional string file_path
 
@@ -883,13 +928,42 @@ struct ColumnChunk {
 
   /** Encrypted column metadata for this chunk **/
   9: optional binary encrypted_column_metadata
+  /** 
+    * The column order for this chunk.
+    * 
+    * If not set readers should check FileMetadata.column_orders
+    * instead.
+    *
+    * Populated in both PAR1 and PAR3
+    */
+  10: optional ColumnOrder column_order
+  /** Set to true if all pages in the column chunk are dictionary 
+    * encoded 
+    */
+  11: optional bool all_pages_dictionary_encoded
+ /** 
+   * The index to the SchemaElement in FileMetadata for this 
+   * column.
+   */
+  12: optional i32 schema_index 
 }
 
 struct RowGroup {
   /** Metadata for each column chunk in this row group.
    * This list must have the same order as the SchemaElement list in FileMetaData.
+   *
+   *  PAR1: Required
+   *  PAR3: Not populated. Use columns_page.
    **/
-  1: required list<ColumnChunk> columns
+  1: optional list<ColumnChunk> columns
+
+  /** Page has BYTE_ARRAY data where each element is REQUIRED. 
+    *
+    * Each element is a Thrift Serialized ColumnChunk
+    *
+    * PAR1: Don't include
+    * PAR3: Required **/
+  8: optional MetadataPage columns_page
 
   /** Total byte size of all the uncompressed column data in this row group **/
   2: required i64 total_byte_size
@@ -1115,6 +1189,34 @@ union EncryptionAlgorithm {
   2: AesGcmCtrV1 AES_GCM_CTR_V1
 }
 
+/**
+ * Embedded metadata page.
+ * 
+ * A metadata page is a data page used to store metadata about
+ * the data stored in the file. This is a key feature of PAR3
+ * footers, which allows for deferred decoding of metadata.
+ *
+ * For common use cases the current recommendation is to use
+ * an encoding that supports random access, but implementations may choose
+ * other configuration parameters if necessary. Implementations
+ * SHOULD consider allowing configurability per page to allow end-users
+ * to optimize size vs compute trade-offs that make sense for their use-case.
+ *
+ * Statistics for metadata pages SHOULD NOT be written.
+ *
+ * Structs of this type should never be written in PAR1.
+ */
+struct MetadataPage {
+   // A serialized page including metadata thrift header and data.
+   1: required binary page
+   // Optional compression applied to the page.
+   2: optional CompressionCodec compression
+   // Number of elements stored.  This is duplicated here to help in 
+   // use-cases where knowing the total number of elements up front for
+   // computation would be useful.
+   //
+   // NOTE(review): the original declaration had no type (`3: num_values`),
+   // which is not valid Thrift. `optional i32` is assumed here because the
+   // value is described as a duplicate of a page-level element count;
+   // confirm the intended type and requiredness.
+   3: optional i32 num_values
+}
+
+
 /**
  * Description for file metadata
  */
@@ -1127,18 +1229,48 @@ struct FileMetaData {
    * are flattened to a list by doing a depth-first traversal.
    * The column metadata contains the path in the schema for that column which can be
    * used to map columns to nodes in the schema.
-   * The first element is the root **/
-  2: required list<SchemaElement> schema;
+   * The first element is the root
+   *
+   * PAR1: Required
+   * PAR3: Use schema_page
+   **/
+  2: optional list<SchemaElement> schema;
+
+  /** Page has BYTE_ARRAY data where each element is REQUIRED. 
+    *
+    * Each element is a serialized SchemaElement.  The order and content should
+    * have a one to one correspondence with schema.
+    */
+  10: optional MetadataPage schema_page;
 
   /** Number of rows in this file **/
   3: required i64 num_rows
 
-  /** Row groups in this file **/
-  4: required list<RowGroup> row_groups
+  /** Row groups in this file 
+    *
+    * PAR1: Required
+    * PAR3: Use row_groups_page
+    **/
+  4: optional list<RowGroup> row_groups
+  /** Page has BYTE_ARRAY data where each element is REQUIRED. 
+    * 
+    * Each element is a thrift serialized RowGroup.
+    *
+    * NOTE(review): this field was declared with id 10, which collides with
+    * schema_page (also id 10) in this struct; Thrift field ids must be
+    * unique within a struct. 11 is assumed to be unused -- confirm.
+    */
+  11: optional MetadataPage row_groups_page
 
-  /** Optional key/value metadata **/
+  /** Optional key/value metadata 
+    *
+    * PAR1: optional
+    * PAR3: Use key_value_metadata_page
+    **/
   5: optional list<KeyValue> key_value_metadata
 
+  /** Page has BYTE_ARRAY data where each element is REQUIRED.
+    *
+    * Each element in the page is a serialized KeyValue struct.
+    */ 
+  13: optional MetadataPage key_value_metadata_page
+
   /** String for application that wrote this file.  This should be in the format
    * <Application> version <App Version> (build <App Build Hash>).
    * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
@@ -1160,6 +1292,10 @@ struct FileMetaData {
    *
    * The obsolete min and max fields in the Statistics object are always sorted
    * by signed comparison regardless of column_orders.
+   * 
+   * PAR1: Optional; may be deprecated in the future in favor of
+   *       ColumnChunk.column_order.
+   * PAR3: Not written; use ColumnChunk.column_order.
    */
   7: optional list<ColumnOrder> column_orders;