import { assembleNested } from './assemble.js'
import { getColumnRange, readColumn } from './column.js'
import { parquetMetadataAsync } from './metadata.js'
import { getSchemaPath } from './schema.js'
import { concat } from './utils.js'

/**
 * Read parquet data rows from a file-like object.
 * Reads the minimal number of row groups and columns to satisfy the request.
 *
 * Returns a void promise when complete; any errors are thrown from that promise.
 * Data is returned via onComplete, not the return promise: if onComplete is
 * undefined, the data is still parsed and chunks are emitted, but the row view
 * is never computed. This saves allocations when the caller wants to cache the
 * full chunks and build their own view of the data from the chunks.
 *
 * @param {ParquetReadOptions} options read options
 * @returns {Promise<void>} resolves when all requested rows and columns are parsed, all errors are thrown here
 */
export async function parquetRead(options) {
  if (!options.file || !(options.file.byteLength >= 0)) {
    throw new Error('parquetRead expected file AsyncBuffer')
  }

  // load metadata if not provided
  options.metadata ||= await parquetMetadataAsync(options.file)
  if (!options.metadata) throw new Error('parquet metadata not found')

  const { metadata, onComplete, rowEnd } = options
  const rowStart = options.rowStart || 0
  /** @type {any[][]} */
  const rowData = []

  // find which row groups to read
  let groupStart = 0 // first row index of the current group
  for (const rowGroup of metadata.row_groups) {
    // number of rows in this row group
    const groupRows = Number(rowGroup.num_rows)
    // if row group overlaps with row range, read it
    if (groupStart + groupRows >= rowStart && (rowEnd === undefined || groupStart < rowEnd)) {
      // read row group
      const rowLimit = rowEnd && rowEnd - groupStart
      const groupData = await readRowGroup(options, rowGroup, groupStart, rowLimit)
      if (onComplete) {
        // filter to rows in range
        const start = Math.max(rowStart - groupStart, 0)
        const end = rowEnd === undefined ? undefined : rowEnd - groupStart
        concat(rowData, groupData.slice(start, end))
      }
    }
    groupStart += groupRows
  }

  if (onComplete) onComplete(rowData)
}
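
// Usage sketch (illustrative, not part of this module). parquetRead expects an
// AsyncBuffer: any object with a byteLength and a slice(start, end) method that
// returns an ArrayBuffer or a promise of one. The module path, column names,
// and row range below are hypothetical.
//
//   import { parquetRead } from './read.js' // assuming this module is read.js
//
//   // given an ArrayBuffer `arrayBuffer` of parquet bytes, obtained elsewhere:
//   const file = {
//     byteLength: arrayBuffer.byteLength,
//     slice: (start, end) => arrayBuffer.slice(start, end),
//   }
//   await parquetRead({
//     file,
//     columns: ['id', 'name'], // hypothetical column names
//     rowStart: 0,
//     rowEnd: 100,
//     rowFormat: 'object', // rows as objects keyed by column name; omit for arrays
//     onComplete: rows => console.log(rows),
//   })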

/**
 * Read a row group from a file-like object.
 *
 * @param {ParquetReadOptions} options read options
 * @param {RowGroup} rowGroup row group to read
 * @param {number} groupStart row index of the first row in the group
 * @param {number} [rowLimit] max rows to read from this group
 * @returns {Promise<any[][]>} resolves to row data
 */
export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
  const { file, metadata, columns } = options
  if (!metadata) throw new Error('parquet metadata not found')
  if (rowLimit === undefined || rowLimit > rowGroup.num_rows) rowLimit = Number(rowGroup.num_rows)

  // loop through metadata to find min/max bytes to read
  let [groupStartByte, groupEndByte] = [file.byteLength, 0]
  for (const { meta_data } of rowGroup.columns) {
    if (!meta_data) throw new Error('parquet column metadata is undefined')
    // skip columns that are not requested
    if (columns && !columns.includes(meta_data.path_in_schema[0])) continue

    const [columnStartByte, columnEndByte] = getColumnRange(meta_data).map(Number)
    groupStartByte = Math.min(groupStartByte, columnStartByte)
    groupEndByte = Math.max(groupEndByte, columnEndByte)
  }
  if (groupStartByte >= groupEndByte && columns?.length) {
    throw new Error(`parquet columns not found: ${columns.join(', ')}`)
  }
  // if row group size is less than 32mb, pre-load in one read
  let groupBuffer
  if (groupEndByte - groupStartByte <= 1 << 25) {
    // pre-load row group byte data in one big read,
    // otherwise read column data individually
    groupBuffer = await file.slice(groupStartByte, groupEndByte)
  }

  const promises = []
  // Top-level columns to assemble
  const { children } = getSchemaPath(metadata.schema, [])[0]
  const subcolumnNames = new Map(children.map(child => [child.element.name, getSubcolumns(child)]))
  /** @type {Map<string, DecodedArray[]>} */
  const subcolumnData = new Map() // columns to assemble as maps
  // read column data
  for (let i = 0; i < rowGroup.columns.length; i++) {
    const columnMetadata = rowGroup.columns[i].meta_data
    if (!columnMetadata) throw new Error('parquet column metadata is undefined')

    // skip columns that are not requested
    const columnName = columnMetadata.path_in_schema[0]
    if (columns && !columns.includes(columnName)) continue

    const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
    const columnBytes = columnEndByte - columnStartByte

    // skip columns larger than 1gb
    // TODO: stream process the data, returning only the requested rows
    if (columnBytes > 1 << 30) {
      console.warn(`parquet skipping huge column "${columnMetadata.path_in_schema}" ${columnBytes.toLocaleString()} bytes`)
      // TODO: set column to new Error('parquet column too large')
      continue
    }

    // use pre-loaded row group byte data if available, else read column data
    /** @type {Promise<ArrayBuffer>} */
    let buffer
    let bufferOffset = 0
    if (groupBuffer) {
      buffer = Promise.resolve(groupBuffer)
      bufferOffset = columnStartByte - groupStartByte
    } else {
      // wrap awaitable to ensure it's a promise
      buffer = Promise.resolve(file.slice(columnStartByte, columnEndByte))
    }

    // read column data async
    promises.push(buffer.then(arrayBuffer => {
      const schemaPath = getSchemaPath(metadata.schema, columnMetadata.path_in_schema)
      const reader = { view: new DataView(arrayBuffer), offset: bufferOffset }
      const columnData = readColumn(reader, rowLimit, columnMetadata, schemaPath, options)
      /** @type {DecodedArray[] | undefined} */
      let chunks = columnData

      // TODO: fast path for non-nested columns
      // Save column data for assembly
      const subcolumn = columnMetadata.path_in_schema.join('.')
      subcolumnData.set(subcolumn, chunks)
      chunks = undefined

      const subcolumns = subcolumnNames.get(columnName)
      if (subcolumns?.every(name => subcolumnData.has(name))) {
        // For every subcolumn, flatten and assemble the column
        const flatData = new Map(subcolumns.map(name => [name, flatten(subcolumnData.get(name))]))
        assembleNested(flatData, schemaPath[1])
        const flatColumn = flatData.get(columnName)
        if (!flatColumn) throw new Error(`parquet column data not assembled: ${columnName}`)
        chunks = [flatColumn]
        subcolumns.forEach(name => subcolumnData.delete(name))
        subcolumnData.set(columnName, chunks)
      }

      // do not emit column data until structs are fully parsed
      if (!chunks) return
      // notify caller of column data
      for (const chunk of chunks) {
        options.onChunk?.({
          columnName,
          columnData: chunk,
          rowStart: groupStart,
          rowEnd: groupStart + rowLimit,
        })
      }
    }))
  }

  await Promise.all(promises)
  if (options.onComplete) {
    const includedColumnNames = children
      .map(child => child.element.name)
      .filter(name => !columns || columns.includes(name))
    const columnOrder = columns || includedColumnNames
    const includedColumns = columnOrder
      .map(name => includedColumnNames.includes(name) ? flatten(subcolumnData.get(name)) : undefined)

    // transpose columns into rows
    const groupData = new Array(rowLimit)
    for (let row = 0; row < rowLimit; row++) {
      if (options.rowFormat === 'object') {
        // return each row as an object
        /** @type {Record<string, any>} */
        const rowData = {}
        for (let i = 0; i < columnOrder.length; i++) {
          rowData[columnOrder[i]] = includedColumns[i]?.[row]
        }
        groupData[row] = rowData
      } else {
        // return each row as an array
        groupData[row] = includedColumns.map(column => column?.[row])
      }
    }
    return groupData
  }
  return []
}
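
// Chunk streaming sketch (illustrative). Passing onChunk without onComplete
// parses the requested columns and emits per-column chunks, but skips building
// the row view. The column name below is hypothetical; `file` is an AsyncBuffer
// as in the sketch above.
//
//   await parquetRead({
//     file,
//     columns: ['temperature'],
//     onChunk({ columnName, columnData, rowStart, rowEnd }) {
//       // columnData is a DecodedArray covering rows [rowStart, rowEnd)
//       console.log(columnName, rowStart, rowEnd, columnData.length)
//     },
//   })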

/**
 * Flatten a list of lists into a single list.
 *
 * @param {DecodedArray[] | undefined} chunks
 * @returns {DecodedArray}
 */
function flatten(chunks) {
  if (!chunks) return []
  if (chunks.length === 1) return chunks[0]
  /** @type {any[]} */
  const output = []
  for (const chunk of chunks) {
    concat(output, chunk)
  }
  return output
}
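
// Example sketch: flatten([[1, 2], [3]]) returns [1, 2, 3]; a single chunk is
// returned as-is without copying.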

/**
 * Return a list of sub-columns needed to construct a top-level column.
 *
 * @import {DecodedArray, ParquetReadOptions, RowGroup, SchemaTree} from '../src/types.d.ts'
 * @param {SchemaTree} schema
 * @param {string[]} output
 * @returns {string[]}
 */
function getSubcolumns(schema, output = []) {
  if (schema.children.length) {
    for (const child of schema.children) {
      getSubcolumns(child, output)
    }
  } else {
    output.push(schema.path.join('.'))
  }
  return output
}
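
// Example sketch (hypothetical schema trees): a flat column is its own only
// subcolumn, e.g. getSubcolumns({ element: { name: 'id' }, path: ['id'], children: [] })
// returns ['id']; a nested column yields its leaf paths, such as
// ['tags.list.element'] for a standard three-level LIST column named 'tags'.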