diff --git a/demo.css b/demo/demo.css similarity index 96% rename from demo.css rename to demo/demo.css index de7d141..54efbec 100644 --- a/demo.css +++ b/demo/demo.css @@ -136,16 +136,17 @@ th, td { font-size: 10pt; margin-top: 20px; } -#layout { +.sidebar { word-break: break-all; } -.layout a { +.sidebar a { color: #445; text-decoration: none; } -.layout a:hover { +.sidebar a:hover { text-decoration: underline; } +.layout, .layout div { background-color: rgba(100, 80, 180, 0.05); border: 1px solid #ccc; @@ -190,11 +191,11 @@ nav ul, margin: 0 4px; vertical-align: middle; } -.layout .collapsed h2::before { +.layout.collapsed h2::before { content: "▶"; } -#metadata pre { +.layout pre { white-space: pre-wrap; word-break: break-all; } diff --git a/demo.js b/demo/demo.js similarity index 60% rename from demo.js rename to demo/demo.js index 531de86..fd70262 100644 --- a/demo.js +++ b/demo/demo.js @@ -1,18 +1,23 @@ -import { parquetMetadata, parquetMetadataAsync, parquetRead, parquetSchema, toJson } from './src/hyparquet.js' +import { + parquetMetadata, parquetMetadataAsync, parquetRead, parquetSchema, toJson, +} from '../src/hyparquet.js' +import { fileLayout, fileMetadata } from './layout.js' -const dropzone = document.getElementById('dropzone') -const fileInput = document.getElementById('file-input') -const content = document.getElementById('content') -const welcome = document.getElementById('welcome') -const label = document.getElementById('filename') +/** + * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer + * @typedef {import('../src/types.js').FileMetaData} FileMetaData + */ -const layout = document.getElementById('layout') -const metadataDiv = document.getElementById('metadata') +/* eslint-disable no-extra-parens */ +const dropzone = /** @type {HTMLElement} */ (document.getElementById('dropzone')) +const fileInput = /** @type {HTMLInputElement} */ (document.getElementById('file-input')) +const content = document.querySelectorAll('#content')[0] 
+const welcome = document.querySelectorAll('#welcome')[0] let enterCount = 0 dropzone.addEventListener('dragenter', e => { - e.dataTransfer.dropEffect = 'copy' + if (e.dataTransfer) e.dataTransfer.dropEffect = 'copy' dropzone.classList.add('over') enterCount++ }) @@ -30,6 +35,7 @@ dropzone.addEventListener('drop', e => { e.preventDefault() // prevent dropped file from being "downloaded" dropzone.classList.remove('over') + if (!e.dataTransfer) throw new Error('Missing dataTransfer') const { files, items } = e.dataTransfer if (files.length > 0) { const file = files[0] @@ -47,6 +53,9 @@ dropzone.addEventListener('drop', e => { } }) +/** + * @param {string} url + */ async function processUrl(url) { content.innerHTML = '' try { @@ -66,6 +75,11 @@ async function processUrl(url) { // Construct an AsyncBuffer that fetches file chunks const asyncBuffer = { byteLength: Number(size), + /** + * @param {number} start + * @param {number} end + * @returns {Promise} + */ slice: async (start, end) => { const rangeEnd = end === undefined ? '' : end - 1 console.log(`Fetch ${url} bytes=${start}-${rangeEnd}`) @@ -84,12 +98,16 @@ async function processUrl(url) { } } +/** + * @param {File} file + */ function processFile(file) { content.innerHTML = '' const reader = new FileReader() reader.onload = async e => { try { - const arrayBuffer = e.target.result + const arrayBuffer = e.target?.result + if (!(arrayBuffer instanceof ArrayBuffer)) throw new Error('Missing arrayBuffer') const metadata = parquetMetadata(arrayBuffer) await render(arrayBuffer, metadata, file.name) } catch (e) { @@ -101,11 +119,16 @@ function processFile(file) { reader.onerror = e => { console.error('Error reading file', e) content.innerHTML = `${file.name}` - content.innerHTML += `
Error reading file\n${e.target.error}
` + content.innerHTML += `
Error reading file\n${e.target?.error}
` } reader.readAsArrayBuffer(file) } +/** + * @param {AsyncBuffer} asyncBuffer + * @param {FileMetaData} metadata + * @param {string} name + */ async function render(asyncBuffer, metadata, name) { renderSidebar(asyncBuffer, metadata, name) @@ -116,7 +139,7 @@ async function render(asyncBuffer, metadata, name) { await parquetRead({ file: asyncBuffer, rowEnd: 1000, - onComplete(data) { + onComplete(/** @type {any[][]} */ data) { const ms = performance.now() - startTime console.log(`parsed ${name} in ${ms.toFixed(0)} ms`) content.appendChild(renderTable(header, data)) @@ -124,86 +147,33 @@ async function render(asyncBuffer, metadata, name) { }) } +/** + * @param {AsyncBuffer} asyncBuffer + * @param {FileMetaData} metadata + * @param {string} name + */ function renderSidebar(asyncBuffer, metadata, name) { - label.innerText = name - // render file layout - layout.innerHTML = '' - layout.appendChild(fileLayout(metadata, asyncBuffer.byteLength)) - // display metadata - metadataDiv.innerHTML = '' - metadataDiv.appendChild(fileMetadata(toJson(metadata))) + const sidebar = /** @type {HTMLElement} */ (document.getElementById('sidebar')) + sidebar.innerHTML = `
${name}
` + sidebar.appendChild(fileMetadata(toJson(metadata))) + sidebar.appendChild(fileLayout(metadata, asyncBuffer.byteLength)) } welcome.addEventListener('click', () => { - fileInput.click() + fileInput?.click() }) -fileInput.addEventListener('change', () => { - if (fileInput.files.length > 0) { +fileInput?.addEventListener('change', () => { + if (fileInput.files?.length) { processFile(fileInput.files[0]) } }) -// Render file layout -function fileLayout(metadata, byteLength) { - let html = '

File layout

' - html += cell('PAR1', 0, 4, 4) // magic number - for (const rowGroupIndex in metadata.row_groups) { - const rowGroup = metadata.row_groups[rowGroupIndex] - html += group(`Row group ${rowGroupIndex} (${rowGroup.total_byte_size.toLocaleString()} bytes)`) - for (const column of rowGroup.columns) { - const columnName = column.meta_data.path_in_schema.join('.') - - let columnOffset = column.meta_data.dictionary_page_offset - if (!columnOffset || column.meta_data.data_page_offset < columnOffset) { - columnOffset = column.meta_data.data_page_offset - } - columnOffset = Number(columnOffset) - const bytes = Number(column.meta_data.total_compressed_size) - const end = columnOffset + bytes - html += cell(`Column ${columnName}`, columnOffset, bytes, end) - } - html += '' - } - const metadataStart = byteLength - metadata.metadata_length - 4 - html += cell('Metadata', metadataStart, metadata.metadata_length, byteLength - 4) - html += cell('PAR1', byteLength - 4, 4, byteLength) // magic number - const div = document.createElement('div') - div.innerHTML = html - div.classList.add('collapsed') // start collapsed - div.children[0].addEventListener('click', () => { - div.classList.toggle('collapsed') - }) - return div -} -function group(name) { - return `
${name}` -} -function cell(name, start, bytes, end) { - return ` -
- -
    -
  • start ${start.toLocaleString()}
  • -
  • bytes ${bytes.toLocaleString()}
  • -
  • end ${end.toLocaleString()}
  • -
-
` -} - -// Render metadata -function fileMetadata(metadata) { - let html = '

Metadata

' - html += `
${JSON.stringify(metadata, null, 2)}
` - const div = document.createElement('div') - div.innerHTML = html - div.classList.add('collapsed') // start collapsed - div.children[0].addEventListener('click', () => { - div.classList.toggle('collapsed') - }) - return div -} - +/** + * @param {string[]} header + * @param {any[][]} data + * @returns {HTMLTableElement} + */ function renderTable(header, data) { const table = document.createElement('table') const thead = document.createElement('thead') @@ -229,6 +199,11 @@ function renderTable(header, data) { return table } +/** + * @param {any} value + * @param {number} depth + * @returns {string} + */ function stringify(value, depth = 0) { if (value === null) return depth ? 'null' : '' if (value === undefined) return depth ? 'undefined' : '' diff --git a/demo/layout.js b/demo/layout.js new file mode 100644 index 0000000..317ed94 --- /dev/null +++ b/demo/layout.js @@ -0,0 +1,111 @@ +/** + * @typedef {import('../src/types.js').FileMetaData} FileMetaData + */ + +import { getColumnRange } from '../src/column.js' + +/** + * @param {FileMetaData} metadata + * @returns {HTMLDivElement} + */ +export function fileMetadata(metadata) { + let html = '

Metadata

' + html += `
${JSON.stringify(metadata, null, 2)}
` + const div = document.createElement('div') + div.innerHTML = html + div.classList.add('layout', 'collapsed') // start collapsed + div.children[0].addEventListener('click', () => { + div.classList.toggle('collapsed') + }) + return div +} + +/** + * Render parquet file layout. + * + * @param {FileMetaData} metadata + * @param {number} byteLength + * @returns {HTMLDivElement} + */ +export function fileLayout(metadata, byteLength) { + let html = '

File layout

' + html += cell('PAR1', 0n, 4n) // magic number + /** @type {[string, bigint, bigint][]} */ + const indexPages = [] + for (const rowGroupIndex in metadata.row_groups) { + const rowGroup = metadata.row_groups[rowGroupIndex] + html += group(`RowGroup ${rowGroupIndex} (${rowGroup.total_byte_size.toLocaleString()} bytes)`) + for (const column of rowGroup.columns) { + const columnName = column.meta_data?.path_in_schema.join('.') + html += group(`Column ${columnName}`) + if (column.meta_data) { + const end = getColumnRange(column.meta_data)[1] + /* eslint-disable no-extra-parens */ + const pages = (/** @type {[string, bigint][]} */ + ([ + ['Dictionary', column.meta_data.dictionary_page_offset], + ['Data', column.meta_data.data_page_offset], + ['Index', column.meta_data.index_page_offset], + ['End', end], + ])) + .filter(([, offset]) => offset !== undefined) + .sort((a, b) => Number(a[1]) - Number(b[1])) + + for (let i = 0; i < pages.length - 1; i++) { + const [name, start] = pages[i] + const end = pages[i + 1][1] + html += cell(name, start, end) + } + } + if (column.column_index_offset) { + indexPages.push([`ColumnIndex RowGroup${rowGroupIndex} ${columnName}`, column.column_index_offset, BigInt(column.column_index_length || 0)]) + } + if (column.offset_index_offset) { + indexPages.push([`OffsetIndex RowGroup${rowGroupIndex} ${columnName}`, column.offset_index_offset, BigInt(column.offset_index_length || 0)]) + } + html += '
' + } + html += '' + } + for (const [name, start, length] of indexPages) { + html += cell(name, start, start + length) + } + const metadataStart = BigInt(byteLength - metadata.metadata_length - 4) + const metadataEnd = BigInt(byteLength - 4) + html += cell('Metadata', metadataStart, metadataEnd) + html += cell('PAR1', metadataEnd, BigInt(byteLength)) // magic number + const div = document.createElement('div') + div.innerHTML = html + div.classList.add('layout', 'collapsed') // start collapsed + div.children[0].addEventListener('click', () => { + div.classList.toggle('collapsed') + }) + return div +} + +/** + * @param {string} name + * @returns {string} + */ +function group(name) { + return `
${name}` +} + +/** + * @param {string} name + * @param {bigint} start + * @param {bigint} end + * @returns {string} + */ +function cell(name, start, end) { + const bytes = end - start + return ` +
+ +
    +
  • start ${start.toLocaleString()}
  • +
  • bytes ${bytes.toLocaleString()}
  • +
  • end ${end.toLocaleString()}
  • +
+
` +} diff --git a/index.html b/index.html index 08c0a97..67a311f 100644 --- a/index.html +++ b/index.html @@ -3,7 +3,7 @@ hyparquet parquet file parser - + @@ -24,9 +24,7 @@
  • github
  • npm
  • -
    -
    -
    +
    @@ -36,6 +34,6 @@
    - + diff --git a/src/column.js b/src/column.js index 5e856e1..7b2ae17 100644 --- a/src/column.js +++ b/src/column.js @@ -110,12 +110,12 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr * Find the start byte offset for a column chunk. * * @param {ColumnMetaData} columnMetadata - * @returns {number} byte offset + * @returns {[bigint, bigint]} byte offset range */ -export function getColumnOffset({ dictionary_page_offset, data_page_offset }) { +export function getColumnRange({ dictionary_page_offset, data_page_offset, total_compressed_size }) { let columnOffset = dictionary_page_offset - if (!dictionary_page_offset || data_page_offset < dictionary_page_offset) { + if (!columnOffset || data_page_offset < columnOffset) { columnOffset = data_page_offset } - return Number(columnOffset) + return [columnOffset, columnOffset + total_compressed_size] } diff --git a/src/hyparquet.d.ts b/src/hyparquet.d.ts index 7979c56..bf57406 100644 --- a/src/hyparquet.d.ts +++ b/src/hyparquet.d.ts @@ -86,7 +86,7 @@ export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean * @param {any} obj object to convert * @returns {unknown} converted object */ -export function toJson(obj: any): unknown +export function toJson(obj: any): any /** * Parquet query options for reading data diff --git a/src/read.js b/src/read.js index c1dedd1..aa5baf5 100644 --- a/src/read.js +++ b/src/read.js @@ -1,6 +1,6 @@ import { assembleNested } from './assemble.js' -import { getColumnOffset, readColumn } from './column.js' +import { getColumnRange, readColumn } from './column.js' import { parquetMetadataAsync } from './metadata.js' import { getSchemaPath } from './schema.js' import { concat } from './utils.js' @@ -91,10 +91,9 @@ async function readRowGroup(options, rowGroup, groupStart) { // skip columns that are not requested if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return - const startByte = getColumnOffset(columnMetadata) - 
const endByte = startByte + Number(columnMetadata.total_compressed_size) - groupStartByte = Math.min(groupStartByte, startByte) - groupEndByte = Math.max(groupEndByte, endByte) + const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number) + groupStartByte = Math.min(groupStartByte, columnStartByte) + groupEndByte = Math.max(groupEndByte, columnEndByte) }) if (groupStartByte >= groupEndByte && columns?.length) { // TODO: should throw if any column is missing @@ -124,8 +123,7 @@ async function readRowGroup(options, rowGroup, groupStart) { const columnName = columnMetadata.path_in_schema[0] if (columns && !columns.includes(columnName)) continue - const columnStartByte = getColumnOffset(columnMetadata) - const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size) + const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number) const columnBytes = columnEndByte - columnStartByte // skip columns larger than 1gb diff --git a/tsconfig.json b/tsconfig.json index d4f8118..7e282e7 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -10,5 +10,5 @@ "strict": true, "target": "esnext", }, - "include": ["src", "test"] + "include": ["src", "test", "demo"] }