No utf8 in plain decoding

This commit is contained in:
Kenny Daniel 2024-05-09 16:28:50 -07:00
parent 5291ff1072
commit e398e66dd4
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 14 additions and 37 deletions

@ -31,10 +31,7 @@ export function readDataPage(bytes, daph, schemaPath, columnMetadata) {
// read values based on encoding
const nValues = daph.num_values - numNulls
if (daph.encoding === 'PLAIN') {
const { element } = schemaPath[schemaPath.length - 1]
const utf8 = element.converted_type === 'UTF8'
const plainObj = readPlain(reader, columnMetadata.type, nValues, utf8)
dataPage = plainObj
dataPage = readPlain(reader, columnMetadata.type, nValues)
} else if (
daph.encoding === 'PLAIN_DICTIONARY' ||
daph.encoding === 'RLE_DICTIONARY' ||
@ -75,7 +72,7 @@ export function readDataPage(bytes, daph, schemaPath, columnMetadata) {
export function readDictionaryPage(bytes, diph, columnMetadata) {
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
const reader = { view, offset: 0 }
return readPlain(reader, columnMetadata.type, diph.num_values, false)
return readPlain(reader, columnMetadata.type, diph.num_values)
}
/**

@ -48,15 +48,13 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata,
// read values based on encoding
const nValues = daph2.num_values - daph2.num_nulls
if (daph2.encoding === 'PLAIN') {
const { element } = schemaPath[schemaPath.length - 1]
const utf8 = element.converted_type === 'UTF8'
let page = compressedBytes.slice(reader.offset)
if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)
}
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const pageReader = { view: pageView, offset: 0 }
dataPage = readPlain(pageReader, columnMetadata.type, nValues, utf8)
dataPage = readPlain(pageReader, columnMetadata.type, nValues)
} else if (daph2.encoding === 'RLE') {
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)

@ -7,10 +7,9 @@
* @param {DataReader} reader - buffer to read data from
* @param {ParquetType} type - parquet type of the data
* @param {number} count - number of values to read
* @param {boolean} utf8 - whether to decode byte arrays as UTF-8
* @returns {DecodedArray} array of values
*/
export function readPlain(reader, type, count, utf8) {
export function readPlain(reader, type, count) {
if (count === 0) return []
if (type === 'BOOLEAN') {
return readPlainBoolean(reader, count)
@ -25,12 +24,7 @@ export function readPlain(reader, type, count, utf8) {
} else if (type === 'DOUBLE') {
return readPlainDouble(reader, count)
} else if (type === 'BYTE_ARRAY') {
const byteArray = readPlainByteArray(reader, count)
if (utf8) {
const decoder = new TextDecoder()
return byteArray.map(bytes => decoder.decode(bytes))
}
return byteArray
return readPlainByteArray(reader, count)
} else if (type === 'FIXED_LEN_BYTE_ARRAY') {
return readPlainByteArrayFixed(reader, count)
} else {

@ -7,7 +7,7 @@ describe('readPlain', () => {
const view = new DataView(new ArrayBuffer(1))
view.setUint8(0, 0b00000101) // true, false, true
const reader = { view, offset: 0 }
const result = readPlain(reader, 'BOOLEAN', 3, false)
const result = readPlain(reader, 'BOOLEAN', 3)
expect(result).toEqual([true, false, true])
expect(reader.offset).toBe(1)
})
@ -16,7 +16,7 @@ describe('readPlain', () => {
const view = new DataView(new ArrayBuffer(4))
view.setInt32(0, 123456789, true) // little-endian
const reader = { view, offset: 0 }
const result = readPlain(reader, 'INT32', 1, false)
const result = readPlain(reader, 'INT32', 1)
expect(result).toEqual(new Int32Array([123456789]))
expect(reader.offset).toBe(4)
})
@ -25,7 +25,7 @@ describe('readPlain', () => {
const view = new DataView(new ArrayBuffer(8))
view.setBigInt64(0, BigInt('1234567890123456789'), true)
const reader = { view, offset: 0 }
const result = readPlain(reader, 'INT64', 1, false)
const result = readPlain(reader, 'INT64', 1)
expect(result).toEqual(new BigInt64Array([1234567890123456789n]))
expect(reader.offset).toBe(8)
})
@ -40,7 +40,7 @@ describe('readPlain', () => {
view.setBigInt64(0, low, true)
view.setInt32(8, high, true)
const reader = { view, offset: 0 }
const result = readPlain(reader, 'INT96', 1, false)
const result = readPlain(reader, 'INT96', 1)
const expectedValue = (BigInt(high) << BigInt(32)) | low
expect(result).toEqual([expectedValue])
expect(reader.offset).toBe(12)
@ -50,7 +50,7 @@ describe('readPlain', () => {
const view = new DataView(new ArrayBuffer(4))
view.setFloat32(0, 1234.5, true) // little-endian
const reader = { view, offset: 0 }
const result = readPlain(reader, 'FLOAT', 1, false)
const result = readPlain(reader, 'FLOAT', 1)
expect(result).toEqual(new Float32Array([1234.5]))
expect(reader.offset).toBe(4)
})
@ -59,7 +59,7 @@ describe('readPlain', () => {
const view = new DataView(new ArrayBuffer(8))
view.setFloat64(0, 12345.6789, true) // little-endian
const reader = { view, offset: 0 }
const result = readPlain(reader, 'DOUBLE', 1, false)
const result = readPlain(reader, 'DOUBLE', 1)
expect(result).toEqual(new Float64Array([12345.6789]))
expect(reader.offset).toBe(8)
})
@ -71,23 +71,11 @@ describe('readPlain', () => {
view.setUint8(5, 2)
view.setUint8(6, 3)
const reader = { view, offset: 0 }
const result = readPlain(reader, 'BYTE_ARRAY', 1, false)
const result = readPlain(reader, 'BYTE_ARRAY', 1)
expect(result).toEqual([new Uint8Array([1, 2, 3])])
expect(reader.offset).toBe(7)
})
it('reads BYTE_ARRAY values as strings', () => {
const view = new DataView(new ArrayBuffer(10))
view.setInt32(0, 3, true) // length 3
view.setUint8(4, 65)
view.setUint8(5, 66)
view.setUint8(6, 67)
const reader = { view, offset: 0 }
const result = readPlain(reader, 'BYTE_ARRAY', 1, true)
expect(result).toEqual(['ABC'])
expect(reader.offset).toBe(7)
})
it('reads FIXED_LEN_BYTE_ARRAY values', () => {
const fixedLength = 3
const view = new DataView(new ArrayBuffer(fixedLength))
@ -95,7 +83,7 @@ describe('readPlain', () => {
view.setUint8(1, 5)
view.setUint8(2, 6)
const reader = { view, offset: 0 }
const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength, false)
const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength)
expect(result).toEqual(new Uint8Array([4, 5, 6]))
expect(reader.offset).toBe(fixedLength)
})
@ -105,7 +93,7 @@ describe('readPlain', () => {
const reader = { view, offset: 0 }
/** @type any */
const invalidType = 'invalidType'
expect(() => readPlain(reader, invalidType, 1, false))
expect(() => readPlain(reader, invalidType, 1))
.toThrow(`parquet unhandled type: ${invalidType}`)
})
})