Async metadata fetching

2026-02-21 03:41:33 +00:00 · 2024-01-15 11:10:26 -08:00 · 2024-01-15 11:10:26 -08:00 · be7f2a8c77
commit be7f2a8c77
parent 1da38f040d
4 changed files with 76 additions and 9 deletions
--- a/src/hyparquet.d.ts
+++ b/src/hyparquet.d.ts
@@ -1,4 +1,4 @@
-export { FileMetaData } from './types'
+export { AsyncBuffer, FileMetaData } from './types'

 /**
 * Read parquet data rows from a file
@@ -8,11 +8,28 @@ export { FileMetaData } from './types'
 */
 export function parquetRead(arrayBuffer: ArrayBuffer): any[][]

+/**
+ * Read parquet metadata from an async buffer.
+ *
+ * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
+ * asynchronously, possibly over the network.
+ *
+ * To make this efficient, we initially request the last 512kb of the file,
+ * which is likely to contain the metadata. If the metadata length exceeds the
+ * initial fetch size, we request the rest of the metadata from the AsyncBuffer.
+ *
+ * This ensures that we either make one 512kb initial request for the metadata,
+ * or two requests for exactly the metadata size.
+ *
+ * @param {AsyncBuffer} asyncBuffer parquet file contents
+ * @param {number} [initialFetchSize] initial fetch size in bytes (default 512kb, i.e. 1 << 19)
+ * @returns {Promise<FileMetaData>} metadata object
+ */
+export function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetchSize?: number): Promise<FileMetaData>

 /**
- * Read parquet header, metadata, and schema information from a file
+ * Read parquet metadata from a buffer
 *
- * @typedef {import("./hyparquet.js").FileMetaData} FileMetaData
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {FileMetaData} metadata object
 */
--- a/src/hyparquet.js
+++ b/src/hyparquet.js
@@ -1,5 +1,5 @@
-import { parquetMetadata } from './metadata.js'
-export { parquetMetadata }
+import { parquetMetadata, parquetMetadataAsync } from './metadata.js'
+export { parquetMetadata, parquetMetadataAsync }

 import { snappyUncompress } from './snappy.js'
 export { snappyUncompress }
--- a/src/metadata.js
+++ b/src/metadata.js
@@ -1,9 +1,50 @@
 import { deserializeTCompactProtocol } from './thrift.js'

 /**
- * Read parquet header, metadata, and schema information from a file
+ * Read parquet metadata from an async buffer.
 *
+ * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
+ * asynchronously, possibly over the network.
+ *
+ * To make this efficient, we initially request the last 512kb of the file,
+ * which is likely to contain the metadata. If the metadata length exceeds the
+ * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer.
+ *
+ * This ensures that we either make one 512kb initial request for the metadata,
+ * or two requests for exactly the metadata size.
+ *
+ * @typedef {import("./types.d.ts").AsyncBuffer} AsyncBuffer
 * @typedef {import("./types.d.ts").FileMetaData} FileMetaData
+ * @param {AsyncBuffer} asyncBuffer parquet file contents
+ * @param {number} initialFetchSize initial fetch size in bytes
+ * @returns {Promise<FileMetaData>} metadata object
+ */
+export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
+  // fetch last bytes (footer) of the file
+  // clamp to 0: for files smaller than initialFetchSize a negative start
+  // would be interpreted relative to the end and return a truncated slice
+  const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
+  const footerBuffer = await asyncBuffer.slice(footerOffset)
+  // layout at end of file: {metadata}, metadata_length (4 bytes LE), PAR1
+  const footerView = new DataView(footerBuffer)
+  const metadataLength = footerView.getUint32(footerBuffer.byteLength - 8, true)
+  // reject lengths that cannot fit in the file before computing offsets,
+  // otherwise metadataOffset below would be negative
+  if (metadataLength > asyncBuffer.byteLength - 8) {
+    throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${asyncBuffer.byteLength - 8}`)
+  }
+  // check if metadata size fits inside the initial fetch
+  if (metadataLength + 8 > initialFetchSize) {
+    // fetch the rest of the metadata (only the part preceding the footer)
+    const metadataOffset = asyncBuffer.byteLength - metadataLength - 8
+    const metadataBuffer = await asyncBuffer.slice(metadataOffset, footerOffset)
+    // combine the buffers: [missing metadata prefix][already fetched footer]
+    const combinedBuffer = new ArrayBuffer(metadataLength + 8)
+    const combinedView = new Uint8Array(combinedBuffer)
+    combinedView.set(new Uint8Array(metadataBuffer), 0)
+    combinedView.set(new Uint8Array(footerBuffer), footerOffset - metadataOffset)
+    return parquetMetadata(combinedBuffer)
+  } else {
+    // parse metadata from the footer
+    return parquetMetadata(footerBuffer)
+  }
+}
+
+/**
+ * Read parquet metadata from a buffer
+ *
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {FileMetaData} metadata object
 */
@@ -22,12 +63,13 @@ export function parquetMetadata(arrayBuffer) {
  // Parquet files store metadata at the end of the file
  // Metadata length is 4 bytes before the last PAR1
  const metadataLengthOffset = view.byteLength - 8
-  const metadataLength = view.getUint32(view.byteLength - 8, true)
+  const metadataLength = view.getUint32(metadataLengthOffset, true)
  if (metadataLength <= 0) {
-    throw new Error('parquet invalid metadata length')
+    throw new Error(`parquet invalid metadata length ${metadataLength}`)
  }
  if (metadataLength > view.byteLength - 8) {
-    throw new Error('parquet metadata length exceeds buffer size')
+    // {metadata}, metadata_length, PAR1
+    throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`)
  }

  const metadataOffset = metadataLengthOffset - metadataLength
--- a/src/types.d.ts
+++ b/src/types.d.ts
@@ -1,3 +1,11 @@
+/**
+ * File-like object that can read slices of a file asynchronously.
+ *
+ * Used by parquetMetadataAsync, which reads `byteLength` to locate the
+ * footer and calls `slice` to fetch byte ranges.
+ */
+export interface AsyncBuffer {
+  // total size of the underlying file in bytes
+  byteLength: number
+  // resolves with the bytes from `start` up to `end`; callers omit `end` to
+  // read through the end of the file
+  // NOTE(review): presumably mirrors ArrayBuffer.slice semantics (end exclusive) — confirm
+  slice(start: number, end?: number): Promise<ArrayBuffer>
+}
+
 /**
 * Just like an ArrayBuffer, but an interface
 */