diff --git a/src/hyparquet.d.ts b/src/hyparquet.d.ts index 465a395..d3d612b 100644 --- a/src/hyparquet.d.ts +++ b/src/hyparquet.d.ts @@ -1,4 +1,4 @@ -export { FileMetaData } from './types' +export { AsyncBuffer, FileMetaData } from './types' /** * Read parquet data rows from a file @@ -8,11 +8,28 @@ export { FileMetaData } from './types' */ export function parquetRead(arrayBuffer: ArrayBuffer): any[][] +/** + * Read parquet metadata from an async buffer. + * + * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded + * asynchronously, possibly over the network. + * + * To make this efficient, we initially request the last 512kb of the file, + * which is likely to contain the metadata. If the metadata length exceeds the + * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer. + * + * This ensures that we either make one 512kb initial request for the metadata, + * or two requests for exactly the metadata size. + * + * @param {AsyncBuffer} asyncBuffer parquet file contents + * @param {number} initialFetchSize initial fetch size in bytes (default 512kb) + * @returns {Promise} metadata object + */ +export async function parquetMetadataAsync(asyncBuffer: ArrayBuffer, initialFetchSize: number = 1 << 19 /* 512kb */): Promise /** - * Read parquet header, metadata, and schema information from a file + * Read parquet metadata from a buffer * - * @typedef {import("./hyparquet.js").FileMetaData} FileMetaData * @param {ArrayBuffer} arrayBuffer parquet file contents * @returns {FileMetaData} metadata object */ diff --git a/src/hyparquet.js b/src/hyparquet.js index 2144943..1a80a59 100644 --- a/src/hyparquet.js +++ b/src/hyparquet.js @@ -1,5 +1,5 @@ -import { parquetMetadata } from './metadata.js' -export { parquetMetadata } +import { parquetMetadata, parquetMetadataAsync } from './metadata.js' +export { parquetMetadata, parquetMetadataAsync } import { snappyUncompress } from './snappy.js' export { snappyUncompress } diff --git a/src/metadata.js b/src/metadata.js index 9cf61d2..a4374ef 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,9 +1,50 @@ import { deserializeTCompactProtocol } from './thrift.js' /** - * Read parquet header, metadata, and schema information from a file + * Read parquet metadata from an async buffer. * + * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded + * asynchronously, possibly over the network. + * + * To make this efficient, we initially request the last 512kb of the file, + * which is likely to contain the metadata. If the metadata length exceeds the + * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer. + * + * This ensures that we either make one 512kb initial request for the metadata, + * or two requests for exactly the metadata size. + * + * @typedef {import("./types.d.ts").AsyncBuffer} AsyncBuffer * @typedef {import("./types.d.ts").FileMetaData} FileMetaData + * @param {AsyncBuffer} asyncBuffer parquet file contents + * @param {number} initialFetchSize initial fetch size in bytes + * @returns {Promise} metadata object + */ +export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) { + // fetch last bytes (footer) of the file + const footerOffset = asyncBuffer.byteLength - initialFetchSize + const footerBuffer = await asyncBuffer.slice(footerOffset) + // check if metadata size fits inside the initial fetch + const footerView = new DataView(footerBuffer) + const metadataLength = footerView.getUint32(footerBuffer.byteLength - 8, true) + if (metadataLength + 8 > initialFetchSize) { + // fetch the rest of the metadata + const metadataOffset = asyncBuffer.byteLength - metadataLength - 8 + const metadataBuffer = await asyncBuffer.slice(metadataOffset, footerOffset) + // combine the buffers + const combinedBuffer = new ArrayBuffer(metadataLength + 8) + const combinedView = new Uint8Array(combinedBuffer) + combinedView.set(new Uint8Array(metadataBuffer), 0) + combinedView.set(new Uint8Array(footerBuffer), footerOffset - metadataOffset) + return parquetMetadata(combinedBuffer) + } else { + // parse metadata from the footer + return parquetMetadata(footerBuffer) + } +} + +/** + * Read parquet metadata from a buffer + * * @param {ArrayBuffer} arrayBuffer parquet file contents * @returns {FileMetaData} metadata object */ @@ -22,12 +63,13 @@ export function parquetMetadata(arrayBuffer) { // Parquet files store metadata at the end of the file // Metadata length is 4 bytes before the last PAR1 const metadataLengthOffset = view.byteLength - 8 - const metadataLength = view.getUint32(view.byteLength - 8, true) + const metadataLength = view.getUint32(metadataLengthOffset, true) if (metadataLength <= 0) { - throw new Error('parquet invalid metadata length') + throw new Error(`parquet invalid metadata length ${metadataLength}`) } if (metadataLength > view.byteLength - 8) { - throw new Error('parquet metadata length exceeds buffer size') + // {metadata}, metadata_length, PAR1 + throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`) } const metadataOffset = metadataLengthOffset - metadataLength diff --git a/src/types.d.ts b/src/types.d.ts index a47095d..d9c5236 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -1,3 +1,11 @@ +/** + * File-like object that can read slices of a file asynchronously. + */ +export interface AsyncBuffer { + byteLength: number + slice(start: number, end?: number): Promise +} + /** * Just like an ArrayBuffer, but an interface */