From 800b39b57537faa56aa11c69af669fbe261d5f4e Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Thu, 1 Feb 2024 20:12:39 -0800 Subject: [PATCH] Fix initial fetch size larger than file size --- src/column.js | 5 +++-- src/metadata.js | 13 ++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/column.js b/src/column.js index 3d289a0..0725da2 100644 --- a/src/column.js +++ b/src/column.js @@ -45,6 +45,7 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) { columnOffset + byteOffset + header.compressed_page_size )) // decompress bytes + /** @type {Uint8Array | undefined} */ let page const uncompressed_page_size = Number(header.uncompressed_page_size) const { codec } = columnMetadata @@ -56,8 +57,8 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) { } else if (codec === CompressionCodec.LZO) { throw new Error('parquet lzo compression not supported') } - if (!page || page.length !== uncompressed_page_size) { - throw new Error('parquet decompressed page size does not match header') + if (page?.length !== uncompressed_page_size) { + throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`) } // parse page data by type diff --git a/src/metadata.js b/src/metadata.js index fc7ea3a..cc93458 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -7,12 +7,19 @@ import { deserializeTCompactProtocol } from './thrift.js' * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded * asynchronously, possibly over the network. * + * You must provide the byteLength of the buffer, typically from a HEAD request. + * + * In theory, you could use suffix-range requests to fetch the end of the file, + * and save a round trip. But in practice, this doesn't work because Chrome + * deems suffix-range requests to use a not-safe-listed header, and will require + * a pre-flight. So the byteLength is required. 
+ * * To make this efficient, we initially request the last 512kb of the file, * which is likely to contain the metadata. If the metadata length exceeds the * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer. * * This ensures that we either make one 512kb initial request for the metadata, - * or two requests for exactly the metadata size. + * or a second request for up to the metadata size. * * @typedef {import("./types.d.ts").AsyncBuffer} AsyncBuffer * @typedef {import("./types.d.ts").FileMetaData} FileMetaData @@ -22,7 +29,7 @@ import { deserializeTCompactProtocol } from './thrift.js' */ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) { // fetch last bytes (footer) of the file - const footerOffset = asyncBuffer.byteLength - initialFetchSize + const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize) const footerBuffer = await asyncBuffer.slice(footerOffset) // check if metadata size fits inside the initial fetch const footerView = new DataView(footerBuffer) @@ -31,7 +38,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << // fetch the rest of the metadata const metadataOffset = asyncBuffer.byteLength - metadataLength - 8 const metadataBuffer = await asyncBuffer.slice(metadataOffset, footerOffset) - // combine the buffers + // combine initial fetch with the new slice const combinedBuffer = new ArrayBuffer(metadataLength + 8) const combinedView = new Uint8Array(combinedBuffer) combinedView.set(new Uint8Array(metadataBuffer), 0)