107 changes: 105 additions & 2 deletions Orange/data/io.py
@@ -1,5 +1,6 @@
import contextlib
import csv
import json
import locale
import pickle
import re
@@ -18,19 +19,22 @@
from urllib.request import urlopen, Request
from pathlib import Path

import h5py
import numpy as np
import pandas as pd

import xlrd
import xlsxwriter
import openpyxl

from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin
from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, \
TimeVariable, StringVariable
from Orange.data import Compression, open_compressed, detect_encoding, \
isnastr, guess_data_type, sanitize_variable
from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL

from Orange.util import flatten

from Orange.version import short_version as ORANGE_VERSION

# Support values longer than 128K (i.e. text contents features)
csv.field_size_limit(100*1024*1024)
@@ -511,3 +515,102 @@ def _suggest_filename(self, content_disposition):
matches = re.findall(r"filename\*?=(?:\"|.{0,10}?'[^']*')([^\"]+)",
content_disposition or '')
return urlunquote(matches[-1]) if matches else default_name

class HDF5Reader(FileFormat):
"""Reader for Orange HDF5 files"""
EXTENSIONS = ('.hdf5',)
DESCRIPTION = 'Orange on-disk data'
SUPPORT_COMPRESSED = False
SUPPORT_SPARSE_DATA = False

def read(self):
def read_domain(sub):
d = f['domain']
subdomain = d[sub].asstr() if sub in d else []
subdomain_args = d[f'{sub}_args'].asstr() \
if f'{sub}_args' in d else ['{}'] * len(subdomain)
for attr, args in zip(subdomain, subdomain_args):
yield attr[0], attr[1], json.loads(args)

def make_var(name, header, args):
var_cls = [var for var in (ContinuousVariable,
DiscreteVariable,
StringVariable,
TimeVariable) if header in var.TYPE_HEADERS][0]
new_var = var_cls(name, **{key: val for key, val in args.items()
if key != "attributes"})
new_var.attributes = args.get("attributes", {})
return new_var

def read_hdf5(name, as_str=False):
if name in f:
if as_str:
return f[name].asstr()[:]
return f[name]
return None

with h5py.File(self.filename, "r") as f:
try:
assert f.attrs['creator'] == "Orange"
except KeyError:
assert 'domain' in f

domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)]
for subdomain in ['attributes', 'class_vars', 'metas']])

X = read_hdf5("X")
Y = read_hdf5("Y")

if len(domain.metas) > 1:
metas = np.hstack([read_hdf5(f'metas/{i}',
isinstance(attr, StringVariable))
for i, attr in enumerate(domain.metas)])
elif len(domain.metas) == 1:
metas = read_hdf5('metas/0',
isinstance(domain.metas[0], StringVariable)
)
else:
metas = None

table = Table.from_numpy(domain, X, Y, metas)
if isinstance(self.filename, str):
table.name = path.splitext(path.split(self.filename)[-1])[0]
self.set_table_metadata(self.filename, table)
return table

@classmethod
def write_file(cls, filename, data):
def parse(attr):
params = (attr.name, attr.TYPE_HEADERS[1], {"attributes": attr.attributes})
if isinstance(attr, DiscreteVariable):
params[2].update(values=attr.values)
elif isinstance(attr, TimeVariable):
params[2].update(have_date=attr.have_date,
have_time=attr.have_time)
elif isinstance(attr, ContinuousVariable):
params[2].update(number_of_decimals=attr.number_of_decimals)
return params

with h5py.File(filename, 'w') as f:
f.attrs['creator'] = "Orange"
f.attrs['Orange_version'] = ORANGE_VERSION
f.attrs['HDF5_Version'] = h5py.version.hdf5_version
f.attrs['h5py_version'] = h5py.version.version
str_dtype = h5py.string_dtype()
for subdomain in ['attributes', 'class_vars', 'metas']:
parsed = [parse(feature) for feature in getattr(data.domain, subdomain)]
domain = np.array([[name, header] for name, header, _ in parsed], dtype=str_dtype)
domain_args = np.array([json.dumps(args) for *_, args in parsed], dtype=str_dtype)
f.create_dataset(f'domain/{subdomain}', data=domain)
f.create_dataset(f'domain/{subdomain}_args', data=domain_args)
f.create_dataset("X", data=data.X)
if data.Y.size:
f.create_dataset("Y", data=data.Y)
if data.metas.size:
for i, attr in enumerate(data.domain.metas):
col_type = str_dtype if isinstance(attr, StringVariable) else 'f'
col_data = data.metas[:, [i]].astype(col_type)
if col_type != 'f':
col_data[pd.isnull(col_data)] = ""
f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type)
cls.write_table_metadata(filename, data)
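
For orientation, a minimal round-trip sketch using the new reader, mirroring the calls made in the test below (Orange's bundled iris dataset is assumed as sample data):

# Minimal round-trip sketch; assumes Orange's bundled "iris" dataset as sample data.
import numpy as np
from Orange.data import Table
from Orange.data.io import HDF5Reader

data = Table("iris")
HDF5Reader.write("iris.hdf5", data)        # classmethod inherited from FileFormat; delegates to write_file above
restored = HDF5Reader("iris.hdf5").read()
np.testing.assert_array_equal(restored.X, data.X)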
13 changes: 12 additions & 1 deletion Orange/data/tests/test_io.py
@@ -6,9 +6,10 @@

from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
TimeVariable, Domain, Table
from Orange.data.io import TabReader, ExcelReader
from Orange.data.io import TabReader, ExcelReader, HDF5Reader
from Orange.data.io_util import guess_data_type
from Orange.misc.collections import natural_sorted
from Orange.tests import named_file


class TestTableFilters(unittest.TestCase):
@@ -155,6 +156,16 @@ def test_roundtrip_xlsx(self):
finally:
os.remove(fname)

def test_roundtrip_hdf5(self):
with named_file('', suffix='.hdf5') as fn:
HDF5Reader.write(fn, self.data)
data = HDF5Reader(fn).read()
np.testing.assert_equal(data.X, self.data.X)
np.testing.assert_equal(data.Y, self.data.Y)
np.testing.assert_equal(data.metas[:2], self.data.metas[:2])
self.assertEqual(data.metas[2, 0], "")
np.testing.assert_equal(data.domain, self.data.domain)


if __name__ == "__main__":
unittest.main()
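
As a rough aid when reviewing write_file, the on-disk layout it produces can be inspected directly with h5py; a sketch assuming the round-trip example above has been run:

# Sketch of inspecting the HDF5 layout written by HDF5Reader.write_file (dataset names per the diff above).
import h5py

with h5py.File("iris.hdf5", "r") as f:
    print(dict(f.attrs))                 # creator, Orange_version, HDF5_Version, h5py_version
    print(list(f["domain"]))             # attributes, attributes_args, class_vars, class_vars_args, metas, metas_args
    print(f["X"].shape, f["X"].dtype)    # numeric attribute block of the table
    if "Y" in f:                         # Y is written only when data.Y is non-empty
        print(f["Y"].shape)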
2 changes: 2 additions & 0 deletions requirements-core.txt
@@ -26,3 +26,5 @@ xgboost>=1.7.4,<2.1
xlrd>=1.2.0
# Writing Excel Files
xlsxwriter
# HDF5 binary data format
h5py