mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-05 22:41:55 +00:00
Demo: move to folder, typecheck, and render column indices
This commit is contained in:
parent
941c6633a0
commit
6d769a4336
@ -136,16 +136,17 @@ th, td {
|
||||
font-size: 10pt;
|
||||
margin-top: 20px;
|
||||
}
|
||||
#layout {
|
||||
.sidebar {
|
||||
word-break: break-all;
|
||||
}
|
||||
.layout a {
|
||||
.sidebar a {
|
||||
color: #445;
|
||||
text-decoration: none;
|
||||
}
|
||||
.layout a:hover {
|
||||
.sidebar a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.layout,
|
||||
.layout div {
|
||||
background-color: rgba(100, 80, 180, 0.05);
|
||||
border: 1px solid #ccc;
|
||||
@ -190,11 +191,11 @@ nav ul,
|
||||
margin: 0 4px;
|
||||
vertical-align: middle;
|
||||
}
|
||||
.layout .collapsed h2::before {
|
||||
.layout.collapsed h2::before {
|
||||
content: "▶";
|
||||
}
|
||||
|
||||
#metadata pre {
|
||||
.layout pre {
|
||||
white-space: pre-wrap;
|
||||
word-break: break-all;
|
||||
}
|
||||
@ -1,18 +1,23 @@
|
||||
import { parquetMetadata, parquetMetadataAsync, parquetRead, parquetSchema, toJson } from './src/hyparquet.js'
|
||||
import {
|
||||
parquetMetadata, parquetMetadataAsync, parquetRead, parquetSchema, toJson,
|
||||
} from '../src/hyparquet.js'
|
||||
import { fileLayout, fileMetadata } from './layout.js'
|
||||
|
||||
const dropzone = document.getElementById('dropzone')
|
||||
const fileInput = document.getElementById('file-input')
|
||||
const content = document.getElementById('content')
|
||||
const welcome = document.getElementById('welcome')
|
||||
const label = document.getElementById('filename')
|
||||
/**
|
||||
* @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer
|
||||
* @typedef {import('../src/types.js').FileMetaData} FileMetaData
|
||||
*/
|
||||
|
||||
const layout = document.getElementById('layout')
|
||||
const metadataDiv = document.getElementById('metadata')
|
||||
/* eslint-disable no-extra-parens */
|
||||
const dropzone = /** @type {HTMLElement} */ (document.getElementById('dropzone'))
|
||||
const fileInput = /** @type {HTMLInputElement} */ (document.getElementById('#file-input'))
|
||||
const content = document.querySelectorAll('#content')[0]
|
||||
const welcome = document.querySelectorAll('#welcome')[0]
|
||||
|
||||
let enterCount = 0
|
||||
|
||||
dropzone.addEventListener('dragenter', e => {
|
||||
e.dataTransfer.dropEffect = 'copy'
|
||||
if (e.dataTransfer) e.dataTransfer.dropEffect = 'copy'
|
||||
dropzone.classList.add('over')
|
||||
enterCount++
|
||||
})
|
||||
@ -30,6 +35,7 @@ dropzone.addEventListener('drop', e => {
|
||||
e.preventDefault() // prevent dropped file from being "downloaded"
|
||||
dropzone.classList.remove('over')
|
||||
|
||||
if (!e.dataTransfer) throw new Error('Missing dataTransfer')
|
||||
const { files, items } = e.dataTransfer
|
||||
if (files.length > 0) {
|
||||
const file = files[0]
|
||||
@ -47,6 +53,9 @@ dropzone.addEventListener('drop', e => {
|
||||
}
|
||||
})
|
||||
|
||||
/**
|
||||
* @param {string} url
|
||||
*/
|
||||
async function processUrl(url) {
|
||||
content.innerHTML = ''
|
||||
try {
|
||||
@ -66,6 +75,11 @@ async function processUrl(url) {
|
||||
// Construct an AsyncBuffer that fetches file chunks
|
||||
const asyncBuffer = {
|
||||
byteLength: Number(size),
|
||||
/**
|
||||
* @param {number} start
|
||||
* @param {number} end
|
||||
* @returns {Promise<ArrayBuffer>}
|
||||
*/
|
||||
slice: async (start, end) => {
|
||||
const rangeEnd = end === undefined ? '' : end - 1
|
||||
console.log(`Fetch ${url} bytes=${start}-${rangeEnd}`)
|
||||
@ -84,12 +98,16 @@ async function processUrl(url) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {File} file
|
||||
*/
|
||||
function processFile(file) {
|
||||
content.innerHTML = ''
|
||||
const reader = new FileReader()
|
||||
reader.onload = async e => {
|
||||
try {
|
||||
const arrayBuffer = e.target.result
|
||||
const arrayBuffer = e.target?.result
|
||||
if (!(arrayBuffer instanceof ArrayBuffer)) throw new Error('Missing arrayBuffer')
|
||||
const metadata = parquetMetadata(arrayBuffer)
|
||||
await render(arrayBuffer, metadata, file.name)
|
||||
} catch (e) {
|
||||
@ -101,11 +119,16 @@ function processFile(file) {
|
||||
reader.onerror = e => {
|
||||
console.error('Error reading file', e)
|
||||
content.innerHTML = `<strong>${file.name}</strong>`
|
||||
content.innerHTML += `<div class="error">Error reading file\n${e.target.error}</div>`
|
||||
content.innerHTML += `<div class="error">Error reading file\n${e.target?.error}</div>`
|
||||
}
|
||||
reader.readAsArrayBuffer(file)
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {AsyncBuffer} asyncBuffer
|
||||
* @param {FileMetaData} metadata
|
||||
* @param {string} name
|
||||
*/
|
||||
async function render(asyncBuffer, metadata, name) {
|
||||
renderSidebar(asyncBuffer, metadata, name)
|
||||
|
||||
@ -116,7 +139,7 @@ async function render(asyncBuffer, metadata, name) {
|
||||
await parquetRead({
|
||||
file: asyncBuffer,
|
||||
rowEnd: 1000,
|
||||
onComplete(data) {
|
||||
onComplete(/** @type {any[][]} */ data) {
|
||||
const ms = performance.now() - startTime
|
||||
console.log(`parsed ${name} in ${ms.toFixed(0)} ms`)
|
||||
content.appendChild(renderTable(header, data))
|
||||
@ -124,86 +147,33 @@ async function render(asyncBuffer, metadata, name) {
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {AsyncBuffer} asyncBuffer
|
||||
* @param {FileMetaData} metadata
|
||||
* @param {string} name
|
||||
*/
|
||||
function renderSidebar(asyncBuffer, metadata, name) {
|
||||
label.innerText = name
|
||||
// render file layout
|
||||
layout.innerHTML = ''
|
||||
layout.appendChild(fileLayout(metadata, asyncBuffer.byteLength))
|
||||
// display metadata
|
||||
metadataDiv.innerHTML = ''
|
||||
metadataDiv.appendChild(fileMetadata(toJson(metadata)))
|
||||
const sidebar = /** @type {HTMLElement} */ (document.getElementById('sidebar'))
|
||||
sidebar.innerHTML = `<div id="filename">${name}</div>`
|
||||
sidebar.appendChild(fileMetadata(toJson(metadata)))
|
||||
sidebar.appendChild(fileLayout(metadata, asyncBuffer.byteLength))
|
||||
}
|
||||
|
||||
welcome.addEventListener('click', () => {
|
||||
fileInput.click()
|
||||
fileInput?.click()
|
||||
})
|
||||
|
||||
fileInput.addEventListener('change', () => {
|
||||
if (fileInput.files.length > 0) {
|
||||
fileInput?.addEventListener('change', () => {
|
||||
if (fileInput.files?.length) {
|
||||
processFile(fileInput.files[0])
|
||||
}
|
||||
})
|
||||
|
||||
// Render file layout
|
||||
function fileLayout(metadata, byteLength) {
|
||||
let html = '<h2>File layout</h2>'
|
||||
html += cell('PAR1', 0, 4, 4) // magic number
|
||||
for (const rowGroupIndex in metadata.row_groups) {
|
||||
const rowGroup = metadata.row_groups[rowGroupIndex]
|
||||
html += group(`Row group ${rowGroupIndex} (${rowGroup.total_byte_size.toLocaleString()} bytes)`)
|
||||
for (const column of rowGroup.columns) {
|
||||
const columnName = column.meta_data.path_in_schema.join('.')
|
||||
|
||||
let columnOffset = column.meta_data.dictionary_page_offset
|
||||
if (!columnOffset || column.meta_data.data_page_offset < columnOffset) {
|
||||
columnOffset = column.meta_data.data_page_offset
|
||||
}
|
||||
columnOffset = Number(columnOffset)
|
||||
const bytes = Number(column.meta_data.total_compressed_size)
|
||||
const end = columnOffset + bytes
|
||||
html += cell(`Column ${columnName}`, columnOffset, bytes, end)
|
||||
}
|
||||
html += '</div>'
|
||||
}
|
||||
const metadataStart = byteLength - metadata.metadata_length - 4
|
||||
html += cell('Metadata', metadataStart, metadata.metadata_length, byteLength - 4)
|
||||
html += cell('PAR1', byteLength - 4, 4, byteLength) // magic number
|
||||
const div = document.createElement('div')
|
||||
div.innerHTML = html
|
||||
div.classList.add('collapsed') // start collapsed
|
||||
div.children[0].addEventListener('click', () => {
|
||||
div.classList.toggle('collapsed')
|
||||
})
|
||||
return div
|
||||
}
|
||||
function group(name) {
|
||||
return `<div>${name}`
|
||||
}
|
||||
function cell(name, start, bytes, end) {
|
||||
return `
|
||||
<div class="cell">
|
||||
<label>${name}</label>
|
||||
<ul>
|
||||
<li>start ${start.toLocaleString()}</li>
|
||||
<li>bytes ${bytes.toLocaleString()}</li>
|
||||
<li>end ${end.toLocaleString()}</li>
|
||||
</ul>
|
||||
</div>`
|
||||
}
|
||||
|
||||
// Render metadata
|
||||
function fileMetadata(metadata) {
|
||||
let html = '<h2>Metadata</h2>'
|
||||
html += `<pre>${JSON.stringify(metadata, null, 2)}</pre>`
|
||||
const div = document.createElement('div')
|
||||
div.innerHTML = html
|
||||
div.classList.add('collapsed') // start collapsed
|
||||
div.children[0].addEventListener('click', () => {
|
||||
div.classList.toggle('collapsed')
|
||||
})
|
||||
return div
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string[]} header
|
||||
* @param {any[][]} data
|
||||
* @returns {HTMLTableElement}
|
||||
*/
|
||||
function renderTable(header, data) {
|
||||
const table = document.createElement('table')
|
||||
const thead = document.createElement('thead')
|
||||
@ -229,6 +199,11 @@ function renderTable(header, data) {
|
||||
return table
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} value
|
||||
* @param {number} depth
|
||||
* @returns {string}
|
||||
*/
|
||||
function stringify(value, depth = 0) {
|
||||
if (value === null) return depth ? 'null' : ''
|
||||
if (value === undefined) return depth ? 'undefined' : ''
|
||||
111
demo/layout.js
Normal file
111
demo/layout.js
Normal file
@ -0,0 +1,111 @@
|
||||
/**
|
||||
* @typedef {import('../src/types.js').FileMetaData} FileMetaData
|
||||
*/
|
||||
|
||||
import { getColumnRange } from '../src/column.js'
|
||||
|
||||
/**
|
||||
* @param {FileMetaData} metadata
|
||||
* @returns {HTMLDivElement}
|
||||
*/
|
||||
export function fileMetadata(metadata) {
|
||||
let html = '<h2>Metadata</h2>'
|
||||
html += `<pre>${JSON.stringify(metadata, null, 2)}</pre>`
|
||||
const div = document.createElement('div')
|
||||
div.innerHTML = html
|
||||
div.classList.add('layout', 'collapsed') // start collapsed
|
||||
div.children[0].addEventListener('click', () => {
|
||||
div.classList.toggle('collapsed')
|
||||
})
|
||||
return div
|
||||
}
|
||||
|
||||
/**
|
||||
* Render parquet file layout.
|
||||
*
|
||||
* @param {FileMetaData} metadata
|
||||
* @param {number} byteLength
|
||||
* @returns {HTMLDivElement}
|
||||
*/
|
||||
export function fileLayout(metadata, byteLength) {
|
||||
let html = '<h2>File layout</h2>'
|
||||
html += cell('PAR1', 0n, 4n) // magic number
|
||||
/** @type {[string, bigint, bigint][]} */
|
||||
const indexPages = []
|
||||
for (const rowGroupIndex in metadata.row_groups) {
|
||||
const rowGroup = metadata.row_groups[rowGroupIndex]
|
||||
html += group(`RowGroup ${rowGroupIndex} (${rowGroup.total_byte_size.toLocaleString()} bytes)`)
|
||||
for (const column of rowGroup.columns) {
|
||||
const columnName = column.meta_data?.path_in_schema.join('.')
|
||||
html += group(`Column ${columnName}`)
|
||||
if (column.meta_data) {
|
||||
const end = getColumnRange(column.meta_data)[1]
|
||||
/* eslint-disable no-extra-parens */
|
||||
const pages = (/** @type {[string, bigint][]} */
|
||||
([
|
||||
['Dictionary', column.meta_data.dictionary_page_offset],
|
||||
['Data', column.meta_data.data_page_offset],
|
||||
['Index', column.meta_data.index_page_offset],
|
||||
['End', end],
|
||||
]))
|
||||
.filter(([, offset]) => offset !== undefined)
|
||||
.sort((a, b) => Number(a[1]) - Number(b[1]))
|
||||
|
||||
for (let i = 0; i < pages.length - 1; i++) {
|
||||
const [name, start] = pages[i]
|
||||
const end = pages[i + 1][1]
|
||||
html += cell(name, start, end)
|
||||
}
|
||||
}
|
||||
if (column.column_index_offset) {
|
||||
indexPages.push([`ColumnIndex RowGroup${rowGroupIndex} ${columnName}`, column.column_index_offset, BigInt(column.column_index_length || 0)])
|
||||
}
|
||||
if (column.offset_index_offset) {
|
||||
indexPages.push([`OffsetIndex RowGroup${rowGroupIndex} ${columnName}`, column.offset_index_offset, BigInt(column.offset_index_length || 0)])
|
||||
}
|
||||
html += '</div>'
|
||||
}
|
||||
html += '</div>'
|
||||
}
|
||||
for (const [name, start, length] of indexPages) {
|
||||
html += cell(name, start, start + length)
|
||||
}
|
||||
const metadataStart = BigInt(byteLength - metadata.metadata_length - 4)
|
||||
const metadataEnd = BigInt(byteLength - 4)
|
||||
html += cell('Metadata', metadataStart, metadataEnd)
|
||||
html += cell('PAR1', metadataEnd, BigInt(byteLength)) // magic number
|
||||
const div = document.createElement('div')
|
||||
div.innerHTML = html
|
||||
div.classList.add('layout', 'collapsed') // start collapsed
|
||||
div.children[0].addEventListener('click', () => {
|
||||
div.classList.toggle('collapsed')
|
||||
})
|
||||
return div
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} name
|
||||
* @returns {string}
|
||||
*/
|
||||
function group(name) {
|
||||
return `<div>${name}`
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} name
|
||||
* @param {bigint} start
|
||||
* @param {bigint} end
|
||||
* @returns {string}
|
||||
*/
|
||||
function cell(name, start, end) {
|
||||
const bytes = end - start
|
||||
return `
|
||||
<div class="cell">
|
||||
<label>${name}</label>
|
||||
<ul>
|
||||
<li>start ${start.toLocaleString()}</li>
|
||||
<li>bytes ${bytes.toLocaleString()}</li>
|
||||
<li>end ${end.toLocaleString()}</li>
|
||||
</ul>
|
||||
</div>`
|
||||
}
|
||||
@ -3,7 +3,7 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>hyparquet parquet file parser</title>
|
||||
<link rel="stylesheet" href="demo.css">
|
||||
<link rel="stylesheet" href="demo/demo.css">
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Mulish:wght@400;600&display=swap"/>
|
||||
</head>
|
||||
<body>
|
||||
@ -24,9 +24,7 @@
|
||||
<li><a href="https://github.com/hyparam/hyparquet">github</a></li>
|
||||
<li><a href="https://www.npmjs.com/package/hyparquet">npm</a></li>
|
||||
</ul>
|
||||
<div id="filename"></div>
|
||||
<div id="metadata" class="layout"></div>
|
||||
<div id="layout" class="layout"></div>
|
||||
<div id="sidebar"></div>
|
||||
</nav>
|
||||
<div id="content">
|
||||
<div id="welcome">
|
||||
@ -36,6 +34,6 @@
|
||||
</div>
|
||||
<input id="file-input" type="file">
|
||||
|
||||
<script type="module" src="demo.js"></script>
|
||||
<script type="module" src="demo/demo.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -110,12 +110,12 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
|
||||
* Find the start byte offset for a column chunk.
|
||||
*
|
||||
* @param {ColumnMetaData} columnMetadata
|
||||
* @returns {number} byte offset
|
||||
* @returns {[bigint, bigint]} byte offset range
|
||||
*/
|
||||
export function getColumnOffset({ dictionary_page_offset, data_page_offset }) {
|
||||
export function getColumnRange({ dictionary_page_offset, data_page_offset, total_compressed_size }) {
|
||||
let columnOffset = dictionary_page_offset
|
||||
if (!dictionary_page_offset || data_page_offset < dictionary_page_offset) {
|
||||
if (!columnOffset || data_page_offset < columnOffset) {
|
||||
columnOffset = data_page_offset
|
||||
}
|
||||
return Number(columnOffset)
|
||||
return [columnOffset, columnOffset + total_compressed_size]
|
||||
}
|
||||
|
||||
2
src/hyparquet.d.ts
vendored
2
src/hyparquet.d.ts
vendored
@ -86,7 +86,7 @@ export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean
|
||||
* @param {any} obj object to convert
|
||||
* @returns {unknown} converted object
|
||||
*/
|
||||
export function toJson(obj: any): unknown
|
||||
export function toJson(obj: any): any
|
||||
|
||||
/**
|
||||
* Parquet query options for reading data
|
||||
|
||||
12
src/read.js
12
src/read.js
@ -1,6 +1,6 @@
|
||||
|
||||
import { assembleNested } from './assemble.js'
|
||||
import { getColumnOffset, readColumn } from './column.js'
|
||||
import { getColumnRange, readColumn } from './column.js'
|
||||
import { parquetMetadataAsync } from './metadata.js'
|
||||
import { getSchemaPath } from './schema.js'
|
||||
import { concat } from './utils.js'
|
||||
@ -91,10 +91,9 @@ async function readRowGroup(options, rowGroup, groupStart) {
|
||||
// skip columns that are not requested
|
||||
if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return
|
||||
|
||||
const startByte = getColumnOffset(columnMetadata)
|
||||
const endByte = startByte + Number(columnMetadata.total_compressed_size)
|
||||
groupStartByte = Math.min(groupStartByte, startByte)
|
||||
groupEndByte = Math.max(groupEndByte, endByte)
|
||||
const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
|
||||
groupStartByte = Math.min(groupStartByte, columnStartByte)
|
||||
groupEndByte = Math.max(groupEndByte, columnEndByte)
|
||||
})
|
||||
if (groupStartByte >= groupEndByte && columns?.length) {
|
||||
// TODO: should throw if any column is missing
|
||||
@ -124,8 +123,7 @@ async function readRowGroup(options, rowGroup, groupStart) {
|
||||
const columnName = columnMetadata.path_in_schema[0]
|
||||
if (columns && !columns.includes(columnName)) continue
|
||||
|
||||
const columnStartByte = getColumnOffset(columnMetadata)
|
||||
const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size)
|
||||
const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
|
||||
const columnBytes = columnEndByte - columnStartByte
|
||||
|
||||
// skip columns larger than 1gb
|
||||
|
||||
@ -10,5 +10,5 @@
|
||||
"strict": true,
|
||||
"target": "esnext",
|
||||
},
|
||||
"include": ["src", "test"]
|
||||
"include": ["src", "test", "demo"]
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user