hyparquet-writer/src/unconvert.js

const dayMillis = 24 * 60 * 60 * 1000 // milliseconds in one day

/**
 * Convert from rich to primitive types.
 *
 * The target representation is chosen from the schema element's
 * `converted_type` (or `logical_type` for FLOAT16):
 *  - DECIMAL: number -> unscaled integer, encoded by unconvertDecimal
 *  - DATE: Date -> whole days since the unix epoch
 *  - TIMESTAMP_MILLIS / TIMESTAMP_MICROS: Date -> bigint
 *  - JSON: value -> UTF-8 bytes of its JSON text
 *  - FLOAT16: number -> 2 little-endian bytes
 *  - UTF8: string -> UTF-8 bytes
 * Anything else is returned unchanged. Missing values (null / undefined)
 * are passed through so they can still be recognized as nulls afterwards.
 *
 * @import {DecodedArray, SchemaElement, Statistics} from 'hyparquet'
 * @param {SchemaElement} element
 * @param {DecodedArray} values
 * @returns {DecodedArray}
 * @throws {Error} if a DECIMAL value is not a number, or if JSON / UTF8
 *   values are not given as a plain array
 */
export function unconvert(element, values) {
  const { converted_type: ctype, logical_type: ltype } = element
  if (ctype === 'DECIMAL') {
    const factor = 10 ** (element.scale || 0)
    // copy to a plain array first: a typed array could not store the bigint
    // or byte-array results that unconvertDecimal may produce
    return Array.from(values).map(v => {
      if (v === null || v === undefined) return v
      if (typeof v !== 'number') throw new Error('DECIMAL must be a number')
      return unconvertDecimal(element, BigInt(Math.round(v * factor)))
    })
  }
  if (ctype === 'DATE') {
    // floor so that every instant maps to the day that contains it,
    // including instants before 1970 (matches unconvertMinMax)
    return Array.from(values).map(v => v && Math.floor(v.getTime() / dayMillis))
  }
  if (ctype === 'TIMESTAMP_MILLIS') {
    return Array.from(values).map(v => v && BigInt(v.getTime()))
  }
  if (ctype === 'TIMESTAMP_MICROS') {
    // scale in bigint space: millis * 1000 can exceed the float64 safe range
    return Array.from(values).map(v => v && BigInt(v.getTime()) * 1000n)
  }
  if (ctype === 'JSON') {
    if (!Array.isArray(values)) throw new Error('JSON must be an array')
    const encoder = new TextEncoder()
    return values.map(v => {
      // undefined has no JSON text; encoding it would yield empty, invalid JSON
      if (v === undefined) return v
      return encoder.encode(JSON.stringify(v))
    })
  }
  if (ltype?.type === 'FLOAT16') {
    return Array.from(values).map(unconvertFloat16)
  }
  if (ctype === 'UTF8') {
    if (!Array.isArray(values)) throw new Error('strings must be an array')
    const encoder = new TextEncoder()
    // TextEncoder would turn null into the text "null" and undefined into
    // empty bytes, so keep missing values as they are
    return values.map(v => v === null || v === undefined ? v : encoder.encode(v))
  }
  return values
}

/**
 * Unconvert from rich type to byte array for metadata statistics.
 *
 * Numeric values are written little-endian at the width of the physical
 * type. Byte arrays are truncated to 16 bytes; other values stored in a
 * byte-array column are stringified, cut to 16 UTF-16 code units and then
 * UTF-8 encoded (so multi-byte text may exceed 16 bytes).
 *
 * @param {import('hyparquet/src/types.js').MinMaxType | undefined} value
 * @param {SchemaElement} element
 * @returns {Uint8Array | undefined} encoded bytes, or undefined when value is missing
 * @throws {Error} if a DECIMAL value is not a number, or if the combination
 *   of physical type and value type is not supported
 */
export function unconvertMinMax(value, element) {
  if (value === undefined || value === null) return undefined
  const { type, converted_type } = element

  /**
   * Allocate `size` bytes, fill them through a DataView and return them.
   *
   * @param {number} size
   * @param {(view: DataView) => void} write
   * @returns {Uint8Array}
   */
  const pack = (size, write) => {
    const buffer = new ArrayBuffer(size)
    write(new DataView(buffer))
    return new Uint8Array(buffer)
  }
  /** @param {number} n */
  const int32 = n => pack(4, view => view.setInt32(0, n, true))
  /** @param {bigint} n */
  const int64 = n => pack(8, view => view.setBigInt64(0, n, true))

  if (type === 'BOOLEAN') return new Uint8Array([value ? 1 : 0])
  if (converted_type === 'DECIMAL') {
    if (typeof value !== 'number') throw new Error('DECIMAL must be a number')
    const factor = 10 ** (element.scale || 0)
    const unscaled = BigInt(Math.round(value * factor))
    // integer-backed decimals store the unscaled value as a plain integer
    // (an INT32 decimal is an int32, not a float32)
    if (type === 'INT32') return int32(Number(unscaled))
    if (type === 'INT64') return int64(unscaled)
    // byte-array backed decimals: big-endian two's complement bytes
    const out = unconvertDecimal(element, unscaled)
    if (out instanceof Uint8Array) return out
  }
  if (type === 'BYTE_ARRAY' || type === 'FIXED_LEN_BYTE_ARRAY') {
    // truncate byte arrays to 16 bytes for statistics
    if (value instanceof Uint8Array) return value.slice(0, 16)
    return new TextEncoder().encode(value.toString().slice(0, 16))
  }
  if (type === 'FLOAT' && typeof value === 'number') {
    return pack(4, view => view.setFloat32(0, value, true))
  }
  if (type === 'DOUBLE' && typeof value === 'number') {
    return pack(8, view => view.setFloat64(0, value, true))
  }
  if (type === 'INT32' && typeof value === 'number') return int32(value)
  if (type === 'INT64' && typeof value === 'bigint') return int64(value)
  if (type === 'INT32' && converted_type === 'DATE' && value instanceof Date) {
    return int32(Math.floor(value.getTime() / dayMillis))
  }
  if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS' && value instanceof Date) {
    return int64(BigInt(value.getTime()))
  }
  if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS' && value instanceof Date) {
    // same scaling as unconvert: multiply in bigint space to stay exact
    return int64(BigInt(value.getTime()) * 1000n)
  }
  throw new Error(`unsupported type for statistics: ${type} with value ${value}`)
}

/**
 * Build the thrift field object for column statistics.
 *
 * The four bounds (max, min, max_value, min_value) are encoded with
 * unconvertMinMax against the given schema element; counts and exactness
 * flags are copied over unchanged.
 *
 * @param {Statistics} stats
 * @param {SchemaElement} element
 * @returns {import('../src/types.js').ThriftObject}
 */
export function unconvertStatistics(stats, element) {
  const {
    max,
    min,
    null_count,
    distinct_count,
    max_value,
    min_value,
    is_max_value_exact,
    is_min_value_exact,
  } = stats

  /** @param {import('hyparquet/src/types.js').MinMaxType | undefined} bound */
  const encode = bound => unconvertMinMax(bound, element)

  return {
    field_1: encode(max),
    field_2: encode(min),
    field_3: null_count,
    field_4: distinct_count,
    field_5: encode(max_value),
    field_6: encode(min_value),
    field_7: is_max_value_exact,
    field_8: is_min_value_exact,
  }
}

/**
 * Encode an unscaled decimal integer for the element's physical type.
 *
 * INT32 yields a number and INT64 yields the bigint itself. Every other type
 * yields big-endian two's complement bytes: exactly `type_length` bytes when
 * a length is given (higher-order bytes that do not fit are dropped),
 * otherwise the shortest sequence that still carries the sign, with zero
 * encoded as an empty array.
 *
 * @param {SchemaElement} element
 * @param {bigint} value
 * @returns {number | bigint | Uint8Array}
 */
export function unconvertDecimal({ type, type_length }, value) {
  if (type === 'INT32') return Number(value)
  if (type === 'INT64') return value

  if (!type_length) {
    if (type === 'FIXED_LEN_BYTE_ARRAY') {
      throw new Error('fixed length byte array type_length is required')
    }
    if (!value) return new Uint8Array()
  }

  // gather bytes least-significant first, then flip to big-endian at the end
  const lowFirst = []
  let rest = value
  let done = false
  do {
    const low = Number(rest & 0xffn)
    lowFirst.push(low)
    rest >>= 8n

    if (type_length) {
      // fixed width: emit exactly type_length bytes
      done = lowFirst.length >= type_length
    } else {
      // variable width: finished once the remainder is pure sign extension
      // that agrees with the sign bit of the byte just emitted
      const negative = (low & 0x80) !== 0
      done = negative ? rest === -1n : rest === 0n
    }
  } while (!done)

  return new Uint8Array(lowFirst.reverse())
}

/**
 * Encode a number as an IEEE 754 half-precision float (2 bytes, little-endian).
 *
 * Rounds to nearest, ties to even, straight from the float64 value.
 * Magnitudes too large for a half become ±infinity, magnitudes too small
 * become sub-normals or ±0, and NaN is written as the quiet NaN 0x7e00.
 *
 * @param {number | undefined} value
 * @returns {Uint8Array | undefined} undefined when value is null or undefined
 */
export function unconvertFloat16(value) {
  if (value === undefined || value === null) return
  if (Number.isNaN(value)) return new Uint8Array([0x00, 0x7e])

  const sign = value < 0 || Object.is(value, -0) ? 1 : 0
  const abs = Math.abs(value)

  // read the unbiased binary exponent from the float64 encoding
  // (top 16 bits are sign + 11 exponent bits + 4 mantissa bits; sign is 0 here)
  const view = new DataView(new ArrayBuffer(8))
  view.setFloat64(0, abs)
  const exp64 = (view.getUint16(0) >>> 4) - 1023

  // infinities and anything from 2^16 upwards cannot be represented
  if (exp64 > 15) return new Uint8Array([0x00, sign << 7 | 0x7c])

  // halves below 2^-14 are sub-normal and all share the exponent -14
  const exp = Math.max(exp64, -14)

  // scale so that one unit equals the last mantissa bit of the half;
  // multiplying by a power of two is exact, so no precision is lost here.
  // normal values land in [1024, 2048) (implicit leading bit included),
  // sub-normal values land in [0, 1024)
  const scaled = abs * 2 ** (10 - exp)

  // round to nearest, ties to even
  let mant = Math.floor(scaled)
  const frac = scaled - mant
  if (frac > 0.5 || (frac === 0.5 && (mant & 1) === 1)) mant += 1

  // adding (rather than or-ing) the mantissa lets the implicit bit bump the
  // exponent field by one, and lets a mantissa that rounded up to the next
  // power of two carry into the exponent (up to and including infinity)
  const bits16 = (sign << 15) | (((exp + 14) << 10) + mant)
  return new Uint8Array([bits16 & 0xff, bits16 >> 8])
}