mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-01 01:36:38 +00:00
No utf8 in plain decoding
This commit is contained in:
parent
5291ff1072
commit
e398e66dd4
@ -31,10 +31,7 @@ export function readDataPage(bytes, daph, schemaPath, columnMetadata) {
|
||||
// read values based on encoding
|
||||
const nValues = daph.num_values - numNulls
|
||||
if (daph.encoding === 'PLAIN') {
|
||||
const { element } = schemaPath[schemaPath.length - 1]
|
||||
const utf8 = element.converted_type === 'UTF8'
|
||||
const plainObj = readPlain(reader, columnMetadata.type, nValues, utf8)
|
||||
dataPage = plainObj
|
||||
dataPage = readPlain(reader, columnMetadata.type, nValues)
|
||||
} else if (
|
||||
daph.encoding === 'PLAIN_DICTIONARY' ||
|
||||
daph.encoding === 'RLE_DICTIONARY' ||
|
||||
@ -75,7 +72,7 @@ export function readDataPage(bytes, daph, schemaPath, columnMetadata) {
|
||||
export function readDictionaryPage(bytes, diph, columnMetadata) {
|
||||
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
|
||||
const reader = { view, offset: 0 }
|
||||
return readPlain(reader, columnMetadata.type, diph.num_values, false)
|
||||
return readPlain(reader, columnMetadata.type, diph.num_values)
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -48,15 +48,13 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata,
|
||||
// read values based on encoding
|
||||
const nValues = daph2.num_values - daph2.num_nulls
|
||||
if (daph2.encoding === 'PLAIN') {
|
||||
const { element } = schemaPath[schemaPath.length - 1]
|
||||
const utf8 = element.converted_type === 'UTF8'
|
||||
let page = compressedBytes.slice(reader.offset)
|
||||
if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
|
||||
page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)
|
||||
}
|
||||
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
|
||||
const pageReader = { view: pageView, offset: 0 }
|
||||
dataPage = readPlain(pageReader, columnMetadata.type, nValues, utf8)
|
||||
dataPage = readPlain(pageReader, columnMetadata.type, nValues)
|
||||
} else if (daph2.encoding === 'RLE') {
|
||||
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
|
||||
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
|
||||
|
||||
10
src/plain.js
10
src/plain.js
@ -7,10 +7,9 @@
|
||||
* @param {DataReader} reader - buffer to read data from
|
||||
* @param {ParquetType} type - parquet type of the data
|
||||
* @param {number} count - number of values to read
|
||||
* @param {boolean} utf8 - whether to decode byte arrays as UTF-8
|
||||
* @returns {DecodedArray} array of values
|
||||
*/
|
||||
export function readPlain(reader, type, count, utf8) {
|
||||
export function readPlain(reader, type, count) {
|
||||
if (count === 0) return []
|
||||
if (type === 'BOOLEAN') {
|
||||
return readPlainBoolean(reader, count)
|
||||
@ -25,12 +24,7 @@ export function readPlain(reader, type, count, utf8) {
|
||||
} else if (type === 'DOUBLE') {
|
||||
return readPlainDouble(reader, count)
|
||||
} else if (type === 'BYTE_ARRAY') {
|
||||
const byteArray = readPlainByteArray(reader, count)
|
||||
if (utf8) {
|
||||
const decoder = new TextDecoder()
|
||||
return byteArray.map(bytes => decoder.decode(bytes))
|
||||
}
|
||||
return byteArray
|
||||
return readPlainByteArray(reader, count)
|
||||
} else if (type === 'FIXED_LEN_BYTE_ARRAY') {
|
||||
return readPlainByteArrayFixed(reader, count)
|
||||
} else {
|
||||
|
||||
@ -7,7 +7,7 @@ describe('readPlain', () => {
|
||||
const view = new DataView(new ArrayBuffer(1))
|
||||
view.setUint8(0, 0b00000101) // true, false, true
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'BOOLEAN', 3, false)
|
||||
const result = readPlain(reader, 'BOOLEAN', 3)
|
||||
expect(result).toEqual([true, false, true])
|
||||
expect(reader.offset).toBe(1)
|
||||
})
|
||||
@ -16,7 +16,7 @@ describe('readPlain', () => {
|
||||
const view = new DataView(new ArrayBuffer(4))
|
||||
view.setInt32(0, 123456789, true) // little-endian
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'INT32', 1, false)
|
||||
const result = readPlain(reader, 'INT32', 1)
|
||||
expect(result).toEqual(new Int32Array([123456789]))
|
||||
expect(reader.offset).toBe(4)
|
||||
})
|
||||
@ -25,7 +25,7 @@ describe('readPlain', () => {
|
||||
const view = new DataView(new ArrayBuffer(8))
|
||||
view.setBigInt64(0, BigInt('1234567890123456789'), true)
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'INT64', 1, false)
|
||||
const result = readPlain(reader, 'INT64', 1)
|
||||
expect(result).toEqual(new BigInt64Array([1234567890123456789n]))
|
||||
expect(reader.offset).toBe(8)
|
||||
})
|
||||
@ -40,7 +40,7 @@ describe('readPlain', () => {
|
||||
view.setBigInt64(0, low, true)
|
||||
view.setInt32(8, high, true)
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'INT96', 1, false)
|
||||
const result = readPlain(reader, 'INT96', 1)
|
||||
const expectedValue = (BigInt(high) << BigInt(32)) | low
|
||||
expect(result).toEqual([expectedValue])
|
||||
expect(reader.offset).toBe(12)
|
||||
@ -50,7 +50,7 @@ describe('readPlain', () => {
|
||||
const view = new DataView(new ArrayBuffer(4))
|
||||
view.setFloat32(0, 1234.5, true) // little-endian
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'FLOAT', 1, false)
|
||||
const result = readPlain(reader, 'FLOAT', 1)
|
||||
expect(result).toEqual(new Float32Array([1234.5]))
|
||||
expect(reader.offset).toBe(4)
|
||||
})
|
||||
@ -59,7 +59,7 @@ describe('readPlain', () => {
|
||||
const view = new DataView(new ArrayBuffer(8))
|
||||
view.setFloat64(0, 12345.6789, true) // little-endian
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'DOUBLE', 1, false)
|
||||
const result = readPlain(reader, 'DOUBLE', 1)
|
||||
expect(result).toEqual(new Float64Array([12345.6789]))
|
||||
expect(reader.offset).toBe(8)
|
||||
})
|
||||
@ -71,23 +71,11 @@ describe('readPlain', () => {
|
||||
view.setUint8(5, 2)
|
||||
view.setUint8(6, 3)
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'BYTE_ARRAY', 1, false)
|
||||
const result = readPlain(reader, 'BYTE_ARRAY', 1)
|
||||
expect(result).toEqual([new Uint8Array([1, 2, 3])])
|
||||
expect(reader.offset).toBe(7)
|
||||
})
|
||||
|
||||
it('reads BYTE_ARRAY values as strings', () => {
|
||||
const view = new DataView(new ArrayBuffer(10))
|
||||
view.setInt32(0, 3, true) // length 3
|
||||
view.setUint8(4, 65)
|
||||
view.setUint8(5, 66)
|
||||
view.setUint8(6, 67)
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'BYTE_ARRAY', 1, true)
|
||||
expect(result).toEqual(['ABC'])
|
||||
expect(reader.offset).toBe(7)
|
||||
})
|
||||
|
||||
it('reads FIXED_LEN_BYTE_ARRAY values', () => {
|
||||
const fixedLength = 3
|
||||
const view = new DataView(new ArrayBuffer(fixedLength))
|
||||
@ -95,7 +83,7 @@ describe('readPlain', () => {
|
||||
view.setUint8(1, 5)
|
||||
view.setUint8(2, 6)
|
||||
const reader = { view, offset: 0 }
|
||||
const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength, false)
|
||||
const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength)
|
||||
expect(result).toEqual(new Uint8Array([4, 5, 6]))
|
||||
expect(reader.offset).toBe(fixedLength)
|
||||
})
|
||||
@ -105,7 +93,7 @@ describe('readPlain', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
/** @type any */
|
||||
const invalidType = 'invalidType'
|
||||
expect(() => readPlain(reader, invalidType, 1, false))
|
||||
expect(() => readPlain(reader, invalidType, 1))
|
||||
.toThrow(`parquet unhandled type: ${invalidType}`)
|
||||
})
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user