Demo: move to folder, typecheck, and render column indices

This commit is contained in:
Kenny Daniel 2024-05-31 19:40:44 -07:00
parent 941c6633a0
commit 6d769a4336
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
8 changed files with 188 additions and 105 deletions

@ -136,16 +136,17 @@ th, td {
font-size: 10pt;
margin-top: 20px;
}
#layout {
.sidebar {
word-break: break-all;
}
.layout a {
.sidebar a {
color: #445;
text-decoration: none;
}
.layout a:hover {
.sidebar a:hover {
text-decoration: underline;
}
.layout,
.layout div {
background-color: rgba(100, 80, 180, 0.05);
border: 1px solid #ccc;
@ -190,11 +191,11 @@ nav ul,
margin: 0 4px;
vertical-align: middle;
}
.layout .collapsed h2::before {
.layout.collapsed h2::before {
content: "▶";
}
#metadata pre {
.layout pre {
white-space: pre-wrap;
word-break: break-all;
}

@ -1,18 +1,23 @@
import { parquetMetadata, parquetMetadataAsync, parquetRead, parquetSchema, toJson } from './src/hyparquet.js'
import {
parquetMetadata, parquetMetadataAsync, parquetRead, parquetSchema, toJson,
} from '../src/hyparquet.js'
import { fileLayout, fileMetadata } from './layout.js'
const dropzone = document.getElementById('dropzone')
const fileInput = document.getElementById('file-input')
const content = document.getElementById('content')
const welcome = document.getElementById('welcome')
const label = document.getElementById('filename')
/**
* @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer
* @typedef {import('../src/types.js').FileMetaData} FileMetaData
*/
const layout = document.getElementById('layout')
const metadataDiv = document.getElementById('metadata')
/* eslint-disable no-extra-parens */
// Look up the page elements that the demo attaches its handlers to.
const dropzone = /** @type {HTMLElement} */ (document.getElementById('dropzone'))
// getElementById expects a bare id, not a CSS selector: with a leading '#'
// it never matches and the file picker would silently stay unwired.
const fileInput = /** @type {HTMLInputElement} */ (document.getElementById('file-input'))
const content = document.querySelectorAll('#content')[0]
const welcome = document.querySelectorAll('#welcome')[0]
let enterCount = 0
dropzone.addEventListener('dragenter', e => {
e.dataTransfer.dropEffect = 'copy'
if (e.dataTransfer) e.dataTransfer.dropEffect = 'copy'
dropzone.classList.add('over')
enterCount++
})
@ -30,6 +35,7 @@ dropzone.addEventListener('drop', e => {
e.preventDefault() // prevent dropped file from being "downloaded"
dropzone.classList.remove('over')
if (!e.dataTransfer) throw new Error('Missing dataTransfer')
const { files, items } = e.dataTransfer
if (files.length > 0) {
const file = files[0]
@ -47,6 +53,9 @@ dropzone.addEventListener('drop', e => {
}
})
/**
* @param {string} url
*/
async function processUrl(url) {
content.innerHTML = ''
try {
@ -66,6 +75,11 @@ async function processUrl(url) {
// Construct an AsyncBuffer that fetches file chunks
const asyncBuffer = {
byteLength: Number(size),
/**
* @param {number} start
* @param {number} end
* @returns {Promise<ArrayBuffer>}
*/
slice: async (start, end) => {
const rangeEnd = end === undefined ? '' : end - 1
console.log(`Fetch ${url} bytes=${start}-${rangeEnd}`)
@ -84,12 +98,16 @@ async function processUrl(url) {
}
}
/**
* @param {File} file
*/
function processFile(file) {
content.innerHTML = ''
const reader = new FileReader()
reader.onload = async e => {
try {
const arrayBuffer = e.target.result
const arrayBuffer = e.target?.result
if (!(arrayBuffer instanceof ArrayBuffer)) throw new Error('Missing arrayBuffer')
const metadata = parquetMetadata(arrayBuffer)
await render(arrayBuffer, metadata, file.name)
} catch (e) {
@ -101,11 +119,16 @@ function processFile(file) {
reader.onerror = e => {
console.error('Error reading file', e)
content.innerHTML = `<strong>${file.name}</strong>`
content.innerHTML += `<div class="error">Error reading file\n${e.target.error}</div>`
content.innerHTML += `<div class="error">Error reading file\n${e.target?.error}</div>`
}
reader.readAsArrayBuffer(file)
}
/**
* @param {AsyncBuffer} asyncBuffer
* @param {FileMetaData} metadata
* @param {string} name
*/
async function render(asyncBuffer, metadata, name) {
renderSidebar(asyncBuffer, metadata, name)
@ -116,7 +139,7 @@ async function render(asyncBuffer, metadata, name) {
await parquetRead({
file: asyncBuffer,
rowEnd: 1000,
onComplete(data) {
onComplete(/** @type {any[][]} */ data) {
const ms = performance.now() - startTime
console.log(`parsed ${name} in ${ms.toFixed(0)} ms`)
content.appendChild(renderTable(header, data))
@ -124,86 +147,33 @@ async function render(asyncBuffer, metadata, name) {
})
}
/**
* @param {AsyncBuffer} asyncBuffer
* @param {FileMetaData} metadata
* @param {string} name
*/
function renderSidebar(asyncBuffer, metadata, name) {
label.innerText = name
// render file layout
layout.innerHTML = ''
layout.appendChild(fileLayout(metadata, asyncBuffer.byteLength))
// display metadata
metadataDiv.innerHTML = ''
metadataDiv.appendChild(fileMetadata(toJson(metadata)))
const sidebar = /** @type {HTMLElement} */ (document.getElementById('sidebar'))
sidebar.innerHTML = `<div id="filename">${name}</div>`
sidebar.appendChild(fileMetadata(toJson(metadata)))
sidebar.appendChild(fileLayout(metadata, asyncBuffer.byteLength))
}
welcome.addEventListener('click', () => {
fileInput.click()
fileInput?.click()
})
fileInput.addEventListener('change', () => {
if (fileInput.files.length > 0) {
fileInput?.addEventListener('change', () => {
if (fileInput.files?.length) {
processFile(fileInput.files[0])
}
})
// Render file layout
function fileLayout(metadata, byteLength) {
let html = '<h2>File layout</h2>'
html += cell('PAR1', 0, 4, 4) // magic number
for (const rowGroupIndex in metadata.row_groups) {
const rowGroup = metadata.row_groups[rowGroupIndex]
html += group(`Row group ${rowGroupIndex} (${rowGroup.total_byte_size.toLocaleString()} bytes)`)
for (const column of rowGroup.columns) {
const columnName = column.meta_data.path_in_schema.join('.')
let columnOffset = column.meta_data.dictionary_page_offset
if (!columnOffset || column.meta_data.data_page_offset < columnOffset) {
columnOffset = column.meta_data.data_page_offset
}
columnOffset = Number(columnOffset)
const bytes = Number(column.meta_data.total_compressed_size)
const end = columnOffset + bytes
html += cell(`Column ${columnName}`, columnOffset, bytes, end)
}
html += '</div>'
}
const metadataStart = byteLength - metadata.metadata_length - 4
html += cell('Metadata', metadataStart, metadata.metadata_length, byteLength - 4)
html += cell('PAR1', byteLength - 4, 4, byteLength) // magic number
const div = document.createElement('div')
div.innerHTML = html
div.classList.add('collapsed') // start collapsed
div.children[0].addEventListener('click', () => {
div.classList.toggle('collapsed')
})
return div
}
function group(name) {
return `<div>${name}`
}
function cell(name, start, bytes, end) {
return `
<div class="cell">
<label>${name}</label>
<ul>
<li>start ${start.toLocaleString()}</li>
<li>bytes ${bytes.toLocaleString()}</li>
<li>end ${end.toLocaleString()}</li>
</ul>
</div>`
}
// Render metadata
function fileMetadata(metadata) {
let html = '<h2>Metadata</h2>'
html += `<pre>${JSON.stringify(metadata, null, 2)}</pre>`
const div = document.createElement('div')
div.innerHTML = html
div.classList.add('collapsed') // start collapsed
div.children[0].addEventListener('click', () => {
div.classList.toggle('collapsed')
})
return div
}
/**
* @param {string[]} header
* @param {any[][]} data
* @returns {HTMLTableElement}
*/
function renderTable(header, data) {
const table = document.createElement('table')
const thead = document.createElement('thead')
@ -229,6 +199,11 @@ function renderTable(header, data) {
return table
}
/**
* @param {any} value
* @param {number} depth
* @returns {string}
*/
function stringify(value, depth = 0) {
if (value === null) return depth ? 'null' : ''
if (value === undefined) return depth ? 'undefined' : ''

111
demo/layout.js Normal file

@ -0,0 +1,111 @@
/**
* @typedef {import('../src/types.js').FileMetaData} FileMetaData
*/
import { getColumnRange } from '../src/column.js'
/**
* @param {FileMetaData} metadata
* @returns {HTMLDivElement}
*/
export function fileMetadata(metadata) {
let html = '<h2>Metadata</h2>'
html += `<pre>${JSON.stringify(metadata, null, 2)}</pre>`
const div = document.createElement('div')
div.innerHTML = html
div.classList.add('layout', 'collapsed') // start collapsed
div.children[0].addEventListener('click', () => {
div.classList.toggle('collapsed')
})
return div
}
/**
 * Render parquet file layout.
 *
 * Builds a collapsible panel that lists, in this order: the leading magic
 * number, each row group with its column chunks broken into pages, the
 * column-index and offset-index sections collected from all columns, the
 * metadata section, and the trailing magic number. Clicking the heading
 * toggles the `collapsed` class on the panel.
 *
 * @param {FileMetaData} metadata parsed file metadata
 * @param {number} byteLength total length of the file in bytes
 * @returns {HTMLDivElement} panel element, initially collapsed
 */
export function fileLayout(metadata, byteLength) {
  let html = '<h2>File layout</h2>'
  html += cell('PAR1', 0n, 4n) // magic number
  /** @type {[string, bigint, bigint][]} */
  const indexPages = []
  // entries() yields numeric indices and, unlike for...in, never visits
  // extra enumerable properties on the array
  for (const [rowGroupIndex, rowGroup] of metadata.row_groups.entries()) {
    html += group(`RowGroup ${rowGroupIndex} (${rowGroup.total_byte_size.toLocaleString()} bytes)`)
    for (const column of rowGroup.columns) {
      const columnName = column.meta_data?.path_in_schema.join('.')
      html += group(`Column ${columnName}`)
      if (column.meta_data) {
        const columnEnd = getColumnRange(column.meta_data)[1]
        /* eslint-disable no-extra-parens */
        const pages = (/** @type {[string, bigint][]} */
          ([
            ['Dictionary', column.meta_data.dictionary_page_offset],
            ['Data', column.meta_data.data_page_offset],
            ['Index', column.meta_data.index_page_offset],
            ['End', columnEnd],
          ]))
          .filter(([, offset]) => offset !== undefined)
          .sort((a, b) => Number(a[1]) - Number(b[1]))
        // each page runs from its own offset up to the next boundary; the
        // final entry only serves as the closing boundary and gets no cell
        for (let i = 0; i < pages.length - 1; i++) {
          const [pageName, pageStart] = pages[i]
          const pageEnd = pages[i + 1][1]
          html += cell(pageName, pageStart, pageEnd)
        }
      }
      // index sections are rendered after all row groups, so only collect them here
      if (column.column_index_offset) {
        indexPages.push([`ColumnIndex RowGroup${rowGroupIndex} ${columnName}`, column.column_index_offset, BigInt(column.column_index_length || 0)])
      }
      if (column.offset_index_offset) {
        indexPages.push([`OffsetIndex RowGroup${rowGroupIndex} ${columnName}`, column.offset_index_offset, BigInt(column.offset_index_length || 0)])
      }
      html += '</div>' // close the column group
    }
    html += '</div>' // close the row group
  }
  for (const [name, start, length] of indexPages) {
    html += cell(name, start, start + length)
  }
  const metadataStart = BigInt(byteLength - metadata.metadata_length - 4)
  const metadataEnd = BigInt(byteLength - 4)
  html += cell('Metadata', metadataStart, metadataEnd)
  html += cell('PAR1', metadataEnd, BigInt(byteLength)) // magic number
  const div = document.createElement('div')
  div.innerHTML = html
  div.classList.add('layout', 'collapsed') // start collapsed
  // the first child is the <h2> heading, which acts as the toggle
  div.children[0].addEventListener('click', () => {
    div.classList.toggle('collapsed')
  })
  return div
}
/**
 * Open a layout group: a `<div>` that begins with the group's label.
 *
 * The returned fragment is intentionally left unclosed; the caller appends
 * the group's children and then the closing `</div>`.
 *
 * @param {string} name label text, HTML-escaped before insertion
 * @returns {string} opening HTML fragment
 */
function group(name) {
  // labels include column names read from the file: treat them as text, not markup
  const label = String(name).replace(/[&<>"']/g, c => `&#${c.charCodeAt(0)};`)
  return `<div>${label}`
}
/**
 * Render one byte range of the file as a labelled cell listing its start
 * offset, its size in bytes (`end - start`), and its end offset.
 *
 * @param {string} name label text, HTML-escaped before insertion
 * @param {bigint} start offset of the first byte of the range
 * @param {bigint} end offset just past the range
 * @returns {string} HTML fragment for the cell
 */
function cell(name, start, end) {
  const bytes = end - start
  // labels include column names read from the file: treat them as text, not markup
  const label = String(name).replace(/[&<>"']/g, c => `&#${c.charCodeAt(0)};`)
  return `
<div class="cell">
<label>${label}</label>
<ul>
<li>start ${start.toLocaleString()}</li>
<li>bytes ${bytes.toLocaleString()}</li>
<li>end ${end.toLocaleString()}</li>
</ul>
</div>`
}

@ -3,7 +3,7 @@
<head>
<meta charset="UTF-8">
<title>hyparquet parquet file parser</title>
<link rel="stylesheet" href="demo.css">
<link rel="stylesheet" href="demo/demo.css">
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Mulish:wght@400;600&display=swap"/>
</head>
<body>
@ -24,9 +24,7 @@
<li><a href="https://github.com/hyparam/hyparquet">github</a></li>
<li><a href="https://www.npmjs.com/package/hyparquet">npm</a></li>
</ul>
<div id="filename"></div>
<div id="metadata" class="layout"></div>
<div id="layout" class="layout"></div>
<div id="sidebar"></div>
</nav>
<div id="content">
<div id="welcome">
@ -36,6 +34,6 @@
</div>
<input id="file-input" type="file">
<script type="module" src="demo.js"></script>
<script type="module" src="demo/demo.js"></script>
</body>
</html>

@ -110,12 +110,12 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
* Find the start byte offset for a column chunk.
*
* @param {ColumnMetaData} columnMetadata
* @returns {number} byte offset
* @returns {[bigint, bigint]} byte offset range
*/
export function getColumnOffset({ dictionary_page_offset, data_page_offset }) {
export function getColumnRange({ dictionary_page_offset, data_page_offset, total_compressed_size }) {
let columnOffset = dictionary_page_offset
if (!dictionary_page_offset || data_page_offset < dictionary_page_offset) {
if (!columnOffset || data_page_offset < columnOffset) {
columnOffset = data_page_offset
}
return Number(columnOffset)
return [columnOffset, columnOffset + total_compressed_size]
}

2
src/hyparquet.d.ts vendored

@ -86,7 +86,7 @@ export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean
* @param {any} obj object to convert
* @returns {unknown} converted object
*/
export function toJson(obj: any): unknown
export function toJson(obj: any): any
/**
* Parquet query options for reading data

@ -1,6 +1,6 @@
import { assembleNested } from './assemble.js'
import { getColumnOffset, readColumn } from './column.js'
import { getColumnRange, readColumn } from './column.js'
import { parquetMetadataAsync } from './metadata.js'
import { getSchemaPath } from './schema.js'
import { concat } from './utils.js'
@ -91,10 +91,9 @@ async function readRowGroup(options, rowGroup, groupStart) {
// skip columns that are not requested
if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return
const startByte = getColumnOffset(columnMetadata)
const endByte = startByte + Number(columnMetadata.total_compressed_size)
groupStartByte = Math.min(groupStartByte, startByte)
groupEndByte = Math.max(groupEndByte, endByte)
const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
groupStartByte = Math.min(groupStartByte, columnStartByte)
groupEndByte = Math.max(groupEndByte, columnEndByte)
})
if (groupStartByte >= groupEndByte && columns?.length) {
// TODO: should throw if any column is missing
@ -124,8 +123,7 @@ async function readRowGroup(options, rowGroup, groupStart) {
const columnName = columnMetadata.path_in_schema[0]
if (columns && !columns.includes(columnName)) continue
const columnStartByte = getColumnOffset(columnMetadata)
const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size)
const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
const columnBytes = columnEndByte - columnStartByte
// skip columns larger than 1gb

@ -10,5 +10,5 @@
"strict": true,
"target": "esnext",
},
"include": ["src", "test"]
"include": ["src", "test", "demo"]
}