From f7f6361522851b121aad2ee2492d544d74111404 Mon Sep 17 00:00:00 2001 From: Ross Blair Date: Fri, 11 Apr 2025 11:24:16 -0500 Subject: [PATCH 1/2] An attempt to convert parquet files to columnmaps. --- deno.json | 1 + src/files/parquet.test.ts | 18 ++++++++++++++++++ src/files/parquet.ts | 15 +++++++++++++++ tests/data/participants.parquet | Bin 0 -> 2923 bytes 4 files changed, 34 insertions(+) create mode 100644 src/files/parquet.test.ts create mode 100644 src/files/parquet.ts create mode 100644 tests/data/participants.parquet diff --git a/deno.json b/deno.json index 47207157..bae15e16 100644 --- a/deno.json +++ b/deno.json @@ -32,6 +32,7 @@ "@cliffy/command": "jsr:@effigies/cliffy-command@1.0.0-dev.8", "@cliffy/table": "jsr:@effigies/cliffy-table@1.0.0-dev.5", "@hed/validator": "npm:hed-validator@4.0.1", + "@hyparquet": "npm:hyparquet@1.12.0", "@ignore": "npm:ignore@7.0.3", "@libs/xml": "jsr:@libs/xml@6.0.4", "@mango/nifti": "npm:@bids/nifti-reader-js@0.6.9", diff --git a/src/files/parquet.test.ts b/src/files/parquet.test.ts new file mode 100644 index 00000000..d272e54e --- /dev/null +++ b/src/files/parquet.test.ts @@ -0,0 +1,18 @@ +import { assertEquals } from '@std/assert' +import { FileIgnoreRules } from './ignore.ts' +import { BIDSFileDeno } from './deno.ts' + +import { loadParquet } from './parquet.ts' + +Deno.test('Test loading parquet file', async (t) => { + const ignore = new FileIgnoreRules([]) + await t.step('Load participants.parquet', async () => { + const path = 'participants.parquet' + const root = './tests/data/' + const file = new BIDSFileDeno(root, path, ignore) + const participantsMap = await loadParquet(file) + const keys = Object.keys(participantsMap) + assertEquals(keys.length, 3) + keys.map(key => assertEquals(participantsMap.get(key)?.length, 16)) + }) +}) diff --git a/src/files/parquet.ts b/src/files/parquet.ts new file mode 100644 index 00000000..403990b2 --- /dev/null +++ b/src/files/parquet.ts @@ -0,0 +1,15 @@ +import { asyncBufferFromFile, parquetRead, ParquetReadOptions, ColumnData } from '@hyparquet' +import type { BIDSFile } from '../types/filetree.ts' +import { ColumnsMap } from '../types/columns.ts' +import { createUTF8Stream } from './streams.ts' + +export async function loadParquet(file: BIDSFile, maxRows: number = -1): Promise { + let columnsMap = new ColumnsMap() + const readOpts: ParquetReadOptions = { + file: (await file.readBytes(file.size)).buffer, + // @ts-expect-error + onChunk: (data) => columnsMap[data.columnName] = data.columnData.map(entry => String(entry)), + } + await parquetRead(readOpts) + return columnsMap +} diff --git a/tests/data/participants.parquet b/tests/data/participants.parquet new file mode 100644 index 0000000000000000000000000000000000000000..75a54019fba31414a7a479bc23674ab3400bc0f7 GIT binary patch literal 2923 zcmcIm%WoS+7#};1n?!9DAZyuRAQzzK0c95`^}!XLm1@y)CsaT5-xWtDk+kNLje_n3(v z;yTL&nD70}m%a)UW6-MzedXH@B7}ynyqjd14aC7n!AQf%z{tY53*#P)+$$RaHpy%- zDqbLf`lkKll=BSDB3LKv;Wb9WXbjK%q0b3=5Q^-CgV5-zGz@>f`XsJfs&;;?^bOavhVlEc_nyShfCszyK(e}8d?yyS zBvS*1ZZ? z4`>ApT^>k-2`PX*kp_eQm>3&rgQ4EH2$#*=N#04uArp{b(uBtsgxnpD$WZt?uGb%k z_q}@_{&W2;@@2dG7~>iY^1X|38t1#1$GD3NUCPVnm*iu@JebR?JRk1tWC-T33P?i zepj%{J=s(ZK_7PoGt-mB2GMa<@qAb*U|mVodx}*99v)BWkx~${!u|m_r45hfbFD|j z#?+UQvt_exXvOAZ;zvvkb=l;d&LcQm%&>BCJQNCLQxcoQ4xe+|B5M#m*=i2u0@$(g zfD3%Hxun+0Q<-2jT}eC~b&3t0_y%1^DH$hf$rw|+h4QFv+D1E7^WfDiTHAGcEVi~=!0PIXnxfX&<);3xTjyt8Aic_C&qr%Pt@ zR4q`s(`nUvU?bU{=kYvODUjGM_~)AGf5kSFz5M*ro;iUYfc{$K&E&0VY?o>+EM(K( zJ-k>;TefQ4p%?cRc`;mdzn%2rWDT{*L+{sn^ByX(D(R7Ss%!v%r0CFhCFsGTp(`do z(MbLZ<*_F66UEG>s2!_gDAu7Zb7@+e>QTe37SdzEN|M?vFq+MY!Wko(*enQ>%BcW) za78gL(Mx-ekZ@n&1vrIAfsh9Uxa8!jl9_JF24lFhIN$4V4Gb3-h%Ql~_nV3v zWZqZE(?S&U^#zyOrz9qC4pyMPxVlgL)o{K>>=l9Q2|;=8wAP>?%k=E*sRA$LX(?Uq yb>PBLFiG|4^fn5|;9t`RuIB72w!KRq(b)8njCs#zWDkDfcz Date: Mon, 14 Apr 2025 10:52:21 -0500 Subject: [PATCH 2/2] Add slice and bytelength to make our filetypes compatible with how hyparquet is accessing files --- src/files/browser.ts | 10 ++++++++++ src/files/deno.ts | 7 +++++++ src/files/parquet.ts | 2 +- src/types/filetree.ts | 4 ++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/files/browser.ts b/src/files/browser.ts index a8a7ae7b..f8f70e17 100644 --- a/src/files/browser.ts +++ b/src/files/browser.ts @@ -29,6 +29,10 @@ export class BIDSFileBrowser implements BIDSFile { return this.#file.size } + get size(): number { + return this.#file.size + } + get stream(): ReadableStream { return this.#file.stream() } @@ -44,6 +48,12 @@ export class BIDSFileBrowser implements BIDSFile { async readBytes(size: number, offset = 0): Promise> { return new Uint8Array(await this.#file.slice(offset, size).arrayBuffer()) } + + async slice(start: number, end: number): Promise { + return (await this.readBytes(end - start, start)).buffer + } + + /** } /** diff --git a/src/files/deno.ts b/src/files/deno.ts index 65cc82d3..72871635 100644 --- a/src/files/deno.ts +++ b/src/files/deno.ts @@ -45,6 +45,10 @@ export class BIDSFileDeno implements BIDSFile { return this.#fileInfo ? this.#fileInfo.size : -1 } + get byteLength(): number { + return this.#fileInfo ? this.#fileInfo.size : -1 + } + get stream(): ReadableStream { const handle = this.#openHandle() return handle.readable @@ -87,6 +91,9 @@ export class BIDSFileDeno implements BIDSFile { return buf.subarray(0, nbytes) } + async slice(start: number, end: number): Promise { + return (await this.readBytes(end - start, start)).buffer + } /** * Return a Deno file handle */ diff --git a/src/files/parquet.ts b/src/files/parquet.ts index 403990b2..0c43a6c3 100644 --- a/src/files/parquet.ts +++ b/src/files/parquet.ts @@ -6,7 +6,7 @@ import { createUTF8Stream } from './streams.ts' export async function loadParquet(file: BIDSFile, maxRows: number = -1): Promise { let columnsMap = new ColumnsMap() const readOpts: ParquetReadOptions = { - file: (await file.readBytes(file.size)).buffer, + file: file, // @ts-expect-error onChunk: (data) => columnsMap[data.columnName] = data.columnData.map(entry => String(entry)), } diff --git a/src/types/filetree.ts b/src/types/filetree.ts index d1e94f24..0e84ba9f 100644 --- a/src/types/filetree.ts +++ b/src/types/filetree.ts @@ -15,6 +15,10 @@ export interface BIDSFile { text: () => Promise // Read a range of bytes readBytes: (size: number, offset?: number) => Promise> + // Alternative to readBytes used for ArrayBuffer compatibility + slice: (start: number, end: number) => Promise + // Alternative to size used for ArrayBuffer compatibility + byteLength: number // Access the parent directory parent: FileTree // File has been viewed