// hyparquet/test/column.test.js

import { describe, expect, it } from 'vitest'
import { readColumn } from '../src/column.js'
import { DEFAULT_PARSERS } from '../src/convert.js'
import { parquetMetadata } from '../src/index.js'
import { asyncBufferFromFile } from '../src/node.js'
import { getColumnRange } from '../src/plan.js'
import { getSchemaPath } from '../src/schema.js'

// Expected decoded values for the first column chunk of
// test/files/float16_nonzeros_and_nans.parquet (asserted by the first test below).
const values = [null, 1, -2, NaN, 0, -1, -0, 2]

/**
 * Decode one column chunk from the first row group of a parquet test file.
 *
 * Loads the whole file into memory, parses its metadata, slices out the bytes
 * of the requested column chunk and hands them to readColumn, selecting rows
 * from 0 up to `selectEnd`.
 *
 * @param {string} testFile path of the parquet file to read
 * @param {number} columnIndex index of the column chunk within row group 0
 * @param {number} selectEnd upper bound of the row selection (Infinity for no limit)
 * @returns whatever readColumn returns; the tests below treat it as an array of decoded chunks
 * @throws {Error} if the column chunk carries no metadata
 */
async function readTestColumn(testFile, columnIndex, selectEnd) {
  const file = await asyncBufferFromFile(testFile)
  const arrayBuffer = await file.slice(0)
  const metadata = parquetMetadata(arrayBuffer)

  const columnMeta = metadata.row_groups[0].columns[columnIndex].meta_data
  if (!columnMeta) throw new Error(`No column metadata for ${testFile}`)

  // The reader only sees the bytes of this column chunk, starting at offset 0.
  const { startByte, endByte } = getColumnRange(columnMeta)
  const reader = { view: new DataView(arrayBuffer.slice(startByte, endByte)), offset: 0 }

  const schemaPath = getSchemaPath(metadata.schema, columnMeta.path_in_schema)
  const columnDecoder = {
    columnName: columnMeta.path_in_schema.join('.'),
    type: columnMeta.type,
    element: schemaPath[schemaPath.length - 1].element,
    schemaPath,
    parsers: DEFAULT_PARSERS,
    codec: columnMeta.codec,
  }
  const rowGroupSelect = {
    groupStart: 0,
    selectStart: 0,
    selectEnd,
    // Size of the group taken from the file metadata rather than from the
    // shape of the expected output.
    // NOTE(review): assumes num_values equals the row count for these flat
    // test columns - confirm if nested columns are ever read through this helper.
    groupRows: Number(columnMeta.num_values),
  }

  return readColumn(reader, rowGroupSelect, columnDecoder)
}

describe('readColumn', () => {
  // Each case selects rows [0, selectEnd) and expects the decoded chunks;
  // an empty selection yields no chunks at all.
  it.for([
    { selectEnd: Infinity, expected: [values] },
    { selectEnd: 2, expected: [values.slice(0, 2)] },
    { selectEnd: 0, expected: [] },
  ])('readColumn with selectEnd $selectEnd', async ({ selectEnd, expected }) => {
    const result = await readTestColumn('test/files/float16_nonzeros_and_nans.parquet', 0, selectEnd)
    expect(result).toEqual(expected)
  })

  it('readColumn should return a typed array', async () => {
    // index 1 = second column of the file
    const columnData = await readTestColumn('test/files/datapage_v2.snappy.parquet', 1, Infinity)
    expect(columnData[0]).toBeInstanceOf(Int32Array)
  })
})
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00			`import { describe, expect, it } from 'vitest'`
Parquet Query Planner: plan byte ranges, pre-fetch in parallel (#75) * Parquet Query Planner: plan byte ranges, pre-fetch in parallel. - parquetPlan() that returns lists of byte ranges to fetch. - prefetchAsyncBuffer() pre-fetches all byte ranges in parallel. throws exception if non-pre-fetched slice is requested later. 2025-04-30 07:49:40 +00:00			`import { readColumn } from '../src/column.js'`
Introduce 'custom parsers' option for decoding dates (#87) 2025-06-10 01:02:31 +00:00			`import { DEFAULT_PARSERS } from '../src/convert.js'`
Move hyparquet.js to index.js (#84) 2025-05-30 22:47:02 +00:00			`import { parquetMetadata } from '../src/index.js'`
Node-specific exports for asyncBufferFromFile (#80) * Update README for asyncBufferFromFile * Simplify asyncBufferFromFile 2025-05-30 20:01:20 +00:00			`import { asyncBufferFromFile } from '../src/node.js'`
Parquet Query Planner: plan byte ranges, pre-fetch in parallel (#75) * Parquet Query Planner: plan byte ranges, pre-fetch in parallel. - parquetPlan() that returns lists of byte ranges to fetch. - prefetchAsyncBuffer() pre-fetches all byte ranges in parallel. throws exception if non-pre-fetched slice is requested later. 2025-04-30 07:49:40 +00:00			`import { getColumnRange } from '../src/plan.js'`
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00			`import { getSchemaPath } from '../src/schema.js'`

factor tests with it.for() (#55) 2024-12-20 08:53:56 +00:00			`const values = [null, 1, -2, NaN, 0, -1, -0, 2]`
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00
factor tests with it.for() (#55) 2024-12-20 08:53:56 +00:00			`describe('readColumn', () => {`
			`it.for([`
Group selection of a row group into an object 2025-04-10 22:51:24 +00:00			`{ selectEnd: Infinity, expected: [values] },`
			`{ selectEnd: 2, expected: [values.slice(0, 2)] },`
			`{ selectEnd: 0, expected: [] },`
			`])('readColumn with rowGroupEnd %p', async ({ selectEnd, expected }) => {`
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00			`const testFile = 'test/files/float16_nonzeros_and_nans.parquet'`
Re-order types.d.ts to put important apis up front 2025-04-10 23:27:25 +00:00			`const file = await asyncBufferFromFile(testFile)`
			`const arrayBuffer = await file.slice(0)`
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00			`const metadata = parquetMetadata(arrayBuffer)`

			`const column = metadata.row_groups[0].columns[0]`
			if (!column.meta_data) throw new Error(`No column metadata for ${testFile}`)
Parquet Query Planner: plan byte ranges, pre-fetch in parallel (#75) * Parquet Query Planner: plan byte ranges, pre-fetch in parallel. - parquetPlan() that returns lists of byte ranges to fetch. - prefetchAsyncBuffer() pre-fetches all byte ranges in parallel. throws exception if non-pre-fetched slice is requested later. 2025-04-30 07:49:40 +00:00			`const { startByte, endByte } = getColumnRange(column.meta_data)`
			`const columnArrayBuffer = arrayBuffer.slice(startByte, endByte)`
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00			`const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? [])`
			`const reader = { view: new DataView(columnArrayBuffer), offset: 0 }`
Group column decoding params into an object 2025-04-11 02:18:06 +00:00			`const columnDecoder = {`
			`columnName: column.meta_data.path_in_schema.join('.'),`
			`type: column.meta_data.type,`
			`element: schemaPath[schemaPath.length - 1].element,`
			`schemaPath,`
Introduce 'custom parsers' option for decoding dates (#87) 2025-06-10 01:02:31 +00:00			`parsers: DEFAULT_PARSERS,`
Group column decoding params into an object 2025-04-11 02:18:06 +00:00			`codec: column.meta_data.codec,`
			`}`
Group selection of a row group into an object 2025-04-10 22:51:24 +00:00			`const rowGroupSelect = {`
			`groupStart: 0,`
			`selectStart: 0,`
			`selectEnd,`
Add more details to QueryPlan. (#82) - Add metadata - Add rowStart and rowEnd - Add columns - Add groupStart, selectStart, selectEnd, and groupRows to GroupPlan - Rename ranges to fetches - Rename numRows to groupRows in ColumnDecoder 2025-05-25 22:21:58 +00:00			`groupRows: expected.length,`
Group selection of a row group into an object 2025-04-10 22:51:24 +00:00			`}`
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00
Group selection of a row group into an object 2025-04-10 22:51:24 +00:00			`const result = readColumn(reader, rowGroupSelect, columnDecoder)`
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00			`expect(result).toEqual(expected)`
			`})`
Return typed arrays in onChunk. Change readColumn to return DecodedArray[]. (#67) Refactored readColumn to avoid `concat` operations. This avoids extra copying and allocation. 2025-03-11 06:33:47 +00:00
			`it('readColumn should return a typed array', async () => {`
			`const testFile = 'test/files/datapage_v2.snappy.parquet'`
Re-order types.d.ts to put important apis up front 2025-04-10 23:27:25 +00:00			`const file = await asyncBufferFromFile(testFile)`
			`const arrayBuffer = await file.slice(0)`
Return typed arrays in onChunk. Change readColumn to return DecodedArray[]. (#67) Refactored readColumn to avoid `concat` operations. This avoids extra copying and allocation. 2025-03-11 06:33:47 +00:00			`const metadata = parquetMetadata(arrayBuffer)`

			`const column = metadata.row_groups[0].columns[1] // second column`
			if (!column.meta_data) throw new Error(`No column metadata for ${testFile}`)
Parquet Query Planner: plan byte ranges, pre-fetch in parallel (#75) * Parquet Query Planner: plan byte ranges, pre-fetch in parallel. - parquetPlan() that returns lists of byte ranges to fetch. - prefetchAsyncBuffer() pre-fetches all byte ranges in parallel. throws exception if non-pre-fetched slice is requested later. 2025-04-30 07:49:40 +00:00			`const { startByte, endByte } = getColumnRange(column.meta_data)`
			`const columnArrayBuffer = arrayBuffer.slice(startByte, endByte)`
Return typed arrays in onChunk. Change readColumn to return DecodedArray[]. (#67) Refactored readColumn to avoid `concat` operations. This avoids extra copying and allocation. 2025-03-11 06:33:47 +00:00			`const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? [])`
			`const reader = { view: new DataView(columnArrayBuffer), offset: 0 }`
Group column decoding params into an object 2025-04-11 02:18:06 +00:00			`const columnDecoder = {`
			`columnName: column.meta_data.path_in_schema.join('.'),`
			`type: column.meta_data.type,`
			`element: schemaPath[schemaPath.length - 1].element,`
			`schemaPath,`
Introduce 'custom parsers' option for decoding dates (#87) 2025-06-10 01:02:31 +00:00			`parsers: DEFAULT_PARSERS,`
Group column decoding params into an object 2025-04-11 02:18:06 +00:00			`codec: column.meta_data.codec,`
			`}`
Group selection of a row group into an object 2025-04-10 22:51:24 +00:00			`const rowGroupSelect = {`
			`groupStart: 0,`
			`selectStart: 0,`
			`selectEnd: Infinity,`
Add more details to QueryPlan. (#82) - Add metadata - Add rowStart and rowEnd - Add columns - Add groupStart, selectStart, selectEnd, and groupRows to GroupPlan - Rename ranges to fetches - Rename numRows to groupRows in ColumnDecoder 2025-05-25 22:21:58 +00:00			`groupRows: Number(column.meta_data.num_values),`
Group selection of a row group into an object 2025-04-10 22:51:24 +00:00			`}`
Return typed arrays in onChunk. Change readColumn to return DecodedArray[]. (#67) Refactored readColumn to avoid `concat` operations. This avoids extra copying and allocation. 2025-03-11 06:33:47 +00:00
Group selection of a row group into an object 2025-04-10 22:51:24 +00:00			`const columnData = readColumn(reader, rowGroupSelect, columnDecoder)`
Return typed arrays in onChunk. Change readColumn to return DecodedArray[]. (#67) Refactored readColumn to avoid `concat` operations. This avoids extra copying and allocation. 2025-03-11 06:33:47 +00:00			`expect(columnData[0]).toBeInstanceOf(Int32Array)`
			`})`
Enable readColumn to read all rows (#53) * Enable readColumn to read all rows * Refactor readColumn to use hasRowLimit * Simplify hasRowLimit condition * Check less common condition first * add readColumn test files * implement readColumn tests for undefined rowLimits * remove unused variable * return early if no metadata is present * address tsc warnings * add comparison * clarify that undefined is valid for rowLimit * remove test files * verify edge case works when rowLimit is undefined * add test cases for readColumn --------- Co-authored-by: Brian Park <park-brian@users.noreply.github.com> 2024-12-20 02:08:22 +00:00			`})`