From 79021427124bb1cc7c6391571ffacbd8423adf9f Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 4 Feb 2024 23:28:59 -0800 Subject: [PATCH] Demo from URL --- demo.js | 57 ++++++++++++++++++++++++++++++++++++--------------- index.html | 2 +- src/column.js | 4 +++- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/demo.js b/demo.js index 32595fc..b00c546 100644 --- a/demo.js +++ b/demo.js @@ -1,4 +1,4 @@ -import { parquetMetadata, toJson } from './src/hyparquet.js' +import { parquetMetadata, parquetMetadataAsync, toJson } from './src/hyparquet.js' const dropzone = document.getElementById('dropzone') const layout = document.getElementById('layout') @@ -36,14 +36,37 @@ dropzone.addEventListener('drop', e => { } }) -function processUrl(url) { - fetch(url) - .then(response => response.arrayBuffer()) - .then(arrayBuffer => renderSidebar(arrayBuffer, url)) - .catch(e => { - dropzone.innerHTML = `${url}` - dropzone.innerHTML += `
Error fetching file\n${e}
` - }) +async function processUrl(url) { + // Check if file is accessible and get its size + const head = await fetch(url, { method: 'HEAD' }) + if (!head.ok) { + dropzone.innerHTML = `${url}` + dropzone.innerHTML += `
Error fetching file\n${head.status} ${head.statusText}
` + return + } + const size = head.headers.get('content-length') + if (!size) { + dropzone.innerHTML = `${url}` + dropzone.innerHTML += '
Error fetching file\nNo content-length header
' + return + } + const asyncBuffer = { + byteLength: Number(size), + slice: async (start, end) => { + const res = await fetch(url, { + headers: { Range: `bytes=${start}-${end - 1}` }, + }) + return res.arrayBuffer() + }, + } + try { + const metadata = await parquetMetadataAsync(asyncBuffer) + renderSidebar(asyncBuffer, metadata, url) + } catch (e) { + console.error('Error fetching file', e) + dropzone.innerHTML = `${url}` + dropzone.innerHTML += `
Error fetching file\n${e}
` + } } function processFile(file) { @@ -51,7 +74,8 @@ function processFile(file) { reader.onload = e => { try { const arrayBuffer = e.target.result - renderSidebar(arrayBuffer, file.name) + const metadata = parquetMetadata(arrayBuffer) + renderSidebar(arrayBuffer, metadata, file.name) } catch (e) { console.error('Error parsing file', e) dropzone.innerHTML = `${file.name}` @@ -65,11 +89,10 @@ function processFile(file) { reader.readAsArrayBuffer(file) } -function renderSidebar(asyncBuffer, name) { - const metadata = parquetMetadata(asyncBuffer) +function renderSidebar(asyncBuffer, metadata, name) { layout.innerHTML = `${name}` // render file layout - layout.appendChild(fileLayout(metadata, asyncBuffer)) + layout.appendChild(fileLayout(metadata, asyncBuffer.byteLength)) // display metadata metadataDiv.innerHTML = '' metadataDiv.appendChild(fileMetadata(toJson(metadata))) @@ -86,7 +109,7 @@ fileInput.addEventListener('change', () => { }) // Render file layout -function fileLayout(metadata, arrayBuffer) { +function fileLayout(metadata, byteLength) { let html = '

File layout

' html += cell('PAR1', 0, 4, 4) // magic number for (const rowGroupIndex in metadata.row_groups) { @@ -106,9 +129,9 @@ function fileLayout(metadata, arrayBuffer) { } html += '' } - const metadataStart = arrayBuffer.byteLength - metadata.metadata_length - 4 - html += cell('Metadata', metadataStart, metadata.metadata_length, arrayBuffer.byteLength - 4) - html += cell('PAR1', arrayBuffer.byteLength - 4, 4, arrayBuffer.byteLength) // magic number + const metadataStart = byteLength - metadata.metadata_length - 4 + html += cell('Metadata', metadataStart, metadata.metadata_length, byteLength - 4) + html += cell('PAR1', byteLength - 4, 4, byteLength) // magic number const div = document.createElement('div') div.innerHTML = html div.classList.add('collapsed') // start collapsed diff --git a/index.html b/index.html index e6d0e32..41524d1 100644 --- a/index.html +++ b/index.html @@ -11,7 +11,7 @@

hyparquet

parquet file reader

- This is a online demo of hyparquet: a parser for apache parquet files. + Online demo of hyparquet: a parser for apache parquet files.

Drag and drop a parquet file onto the dropzone to see parquet data. diff --git a/src/column.js b/src/column.js index 54967f2..3f3b523 100644 --- a/src/column.js +++ b/src/column.js @@ -37,7 +37,9 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) { // parse column header const { value: header, byteLength: headerLength } = parquetHeader(arrayBuffer, columnOffset + byteOffset) byteOffset += headerLength - if (!header || header.compressed_page_size === undefined) throw new Error('parquet header is undefined') + if (header.compressed_page_size === undefined) { + throw new Error(`parquet compressed page size is undefined in column '${columnMetadata.path_in_schema}'`) + } // read compressed_page_size bytes starting at offset const compressedBytes = new Uint8Array(arrayBuffer.slice(