Skip to content

Commit

Permalink
Version 0.0.7. First fairly well-tested release. Dropping the "dev" s…
Browse files Browse the repository at this point in the history
…uffix from version number.
  • Loading branch information
afiedler committed May 20, 2014
1 parent c06e764 commit bdf9480
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 25 deletions.
14 changes: 9 additions & 5 deletions EXAMPLES.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,23 +54,27 @@ import tstables
import pandas
from datetime import *

# Class to use as the table discription
class bpi_values(tables.IsDescription):
# Class to use as the table description
class BpiValues(tables.IsDescription):
timestamp = tables.Int64Col(pos=0)
bpi = tables.Float64Col(pos=1)

# Use pandas to read in the CSV data
bpi = pandas.read_csv('bpi_2014_01.csv',index_col=0,names=['date','bpi'],parse_dates=True)

f = tables.open_file('bpi.h5','a')

ts = f.create_ts('/','BPI',bpi_values)
# Create a new time series
ts = f.create_ts('/','BPI',BpiValues)

# Append the BPI data
ts.append(bpi)
ts.flush()

# Now, read in some data
# Read in some data
read_start_dt = datetime(2014,1,4,12,00)
read_end_dt = datetime(2014,1,4,14,30)

rows = ts.read_range(read_start_dt,read_end_dt)

# `rows` will be a pandas DataFrame with a DatetimeIndex.
```
39 changes: 35 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,39 @@ Its goals are to support a workflow where tons (gigabytes) of time series data a
appended periodically to an HDF5 file, and need to be read many times (quickly) for analytical models
and research.

## Not ready for use yet
## Example

TsTables is not ready for use yet and is currently under development. The goal is to have something
workable and being testing by end of May, 2014. If you are interested in the project (to contribute
or learn when it is finished), email Andy Fiedler at <[email protected]>.
This example reads in minute-by-minute bitcoin price data and then fetches a range of that data. For the full version
of this example, and other examples, see [EXAMPLES.md](EXAMPLES.md).

```python
# Class to use as the table description
class BpiValues(tables.IsDescription):
timestamp = tables.Int64Col(pos=0)
bpi = tables.Float64Col(pos=1)

# Use pandas to read in the CSV data
bpi = pandas.read_csv('bpi_2014_01.csv',index_col=0,names=['date','bpi'],parse_dates=True)

f = tables.open_file('bpi.h5','a')

# Create a new time series
ts = f.create_ts('/','BPI',BpiValues)

# Append the BPI data
ts.append(bpi)

# Read in some data
read_start_dt = datetime(2014,1,4,12,00)
read_end_dt = datetime(2014,1,4,14,30)

rows = ts.read_range(read_start_dt,read_end_dt)

# `rows` will be a pandas DataFrame with a DatetimeIndex.
```

## Pre-release software

TsTables is currently under development and has yet to be used extensively in production. It is reaching the point where
it is reasonably well-tested, so if you'd like to use it, feel free! If you are interested in the project (to contribute
or to hear about updates), email Andy Fiedler at <[email protected]>.
4 changes: 2 additions & 2 deletions register.py → release.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Converts README.md to README.txt (in restructured text format) and registers on PyPI
# Converts README.md to README.txt (in restructured text format), builds package, and uploads to PyPI

import pypandoc
import os
Expand All @@ -7,6 +7,6 @@
f = open('README.txt','w+')
f.write(rst)
f.close()
os.system("python3 setup.py register")
os.system("python3 setup.py register sdist upload")
os.remove('README.txt')

2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ force = True

[egg_info]
# We are doing development build
tag_build = dev
# tag_build = dev
# Do we want to have date in file name?
tag_date = 0

Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
if os.path.exists('README.txt'):
long_description = open('README.txt').read()

exec(open('src/tstables/_version.py').read())


setup(

Expand Down Expand Up @@ -57,7 +59,7 @@
tests_require = 'docutils >= 0.6',

name = "tstables",
version = "0.0.5",
version = __version__,

# metadata for upload to PyPI
author = "Andy Fiedler",
Expand All @@ -67,5 +69,5 @@
keywords = "time series high frequency HDF5",
url = "http://github.com/afiedler/tstables", # project home page, if any
long_description = long_description
# could also include long_description, download_url, classifiers, etc.
# could also include download_url, classifiers, etc.
)
2 changes: 1 addition & 1 deletion src/tstables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
TsTables is a wrapper for PyTables that allows you to manage very large time series.
"""

from ._version import __version__
from tstables.tstable import TsTable
from tstables.file import create_ts
from tstables.group import timeseries_repr
Expand Down
5 changes: 5 additions & 0 deletions src/tstables/_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Store the version here so:
# 1) we don't load dependencies by storing it in __init__.py
# 2) setup.py can read it (it exec()s this file) for the same reason
# 3) we can import it into your module
__version__ = '0.0.7'
86 changes: 79 additions & 7 deletions src/tstables/tests/test_tstable_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def test_create_ts(self):
# - the group exists
# - it has a _TS_TABLES_CLASS attribute equal to "TIMESERIES"
# - it has a table at yYYYY/mMM/dDD/ts_data, where YYY-MM-DD is today (in UTC)
# - the dtype is correct
self.assertEqual(self.h5_file.root.EURUSD.__class__, tables.Group)
self.assertEqual(self.h5_file.root.EURUSD._v_attrs._TS_TABLES_CLASS,'TIMESERIES')

Expand All @@ -43,6 +44,10 @@ def test_create_ts(self):

self.assertEqual(ts_data.attrs._TS_TABLES_EXPECTEDROWS_PER_PARTITION,10000)

self.assertEqual(ts_data._v_dtype[0],tables.dtype_from_descr(Price)[0])
self.assertEqual(ts_data._v_dtype[1],tables.dtype_from_descr(Price)[1])


def test_create_ts_with_invalid_description_incorrect_order(self):
class InvalidDesc(tables.IsDescription):
# Positions are out of order here!
Expand Down Expand Up @@ -96,6 +101,17 @@ def test_load_same_timestamp(self):
for idx,p in enumerate(rows_read['price']):
self.assertEqual(p,rows['price'][idx])

def __load_csv_data(self, csv):
    """Create the 'EURUSD' time series and append the rows parsed from ``csv``.

    ``csv`` is headerless CSV text whose first column is a timestamp and
    whose second column is an integer price.

    Returns a ``(ts, rows)`` tuple: the time series object returned by
    ``create_ts`` and the pandas DataFrame that was appended to it.
    """
    # 'timestamp' is deliberately absent from ``dtype``: column 0 is parsed as
    # dates and used as the index, so it becomes the DatetimeIndex.
    frame = pandas.read_csv(
        io.StringIO(csv),
        parse_dates=[0],
        index_col=0,
        names=['timestamp', 'price'],
        dtype={'price': 'i4'},
    )

    series = self.h5_file.create_ts('/', 'EURUSD', description=Price)
    series.append(frame)

    return series, frame

def test_load_cross_partition_boundary_timestamps(self):

# This data should just cross the partition boundary between 5/4 and 5/5
Expand All @@ -105,13 +121,7 @@ def test_load_cross_partition_boundary_timestamps(self):
2014-05-05T00:00:00.000Z,4
2014-05-05T00:00:00.001Z,5"""

sfile = io.StringIO(csv)

# Note: don't need the 'timestamp' column in the dtype param here because it will become the DatetimeIndex.
rows = pandas.read_csv(sfile,parse_dates=[0],index_col=0,names=['timestamp', 'price'],dtype={'price': 'i4'})

ts = self.h5_file.create_ts('/','EURUSD',description=Price)
ts.append(rows)
ts,rows = self.__load_csv_data(csv)

# Inspect to ensure that data has been stored correctly
tbl = ts.root_group.y2014.m05.d04.ts_data
Expand Down Expand Up @@ -139,6 +149,68 @@ def test_load_cross_partition_boundary_timestamps(self):
for idx,p in enumerate(rows_read['price']):
self.assertEqual(p,rows['price'][idx])

def test_read_data_end_date_before_start_date(self):
    """read_range must reject a range whose end precedes its start.

    A reversed range (end_dt < start_dt) is expected to raise AttributeError,
    while a degenerate range (start_dt == end_dt) is valid and returns only
    the rows stamped with exactly that instant.
    """
    csv = """2014-05-04T23:59:59.998Z,1
2014-05-04T23:59:59.999Z,2
2014-05-04T23:59:59.999Z,3
2014-05-05T00:00:00.000Z,4
2014-05-05T00:00:00.001Z,5"""

    ts,rows = self.__load_csv_data(csv)

    # Try to fetch with end_dt before start_dt (May 4 comes before May 5)
    start_dt = datetime.datetime(2014,5,5)
    end_dt = datetime.datetime(2014,5,4)
    self.assertRaises(AttributeError, ts.read_range, start_dt, end_dt)

    # This should work, and return just this row: '2014-05-05T00:00:00.000Z,4'
    end_dt = start_dt
    rng = ts.read_range(start_dt,end_dt)

    self.assertEqual(rng['price'].size, 1)
    self.assertEqual(rng['price'][0],4)

def test_no_data_stored_in_missing_day(self):
    """Check per-day row counts when the input skips a whole day.

    The fixture has three rows on May 4 and two on May 6, with nothing on
    May 5. The May 5 partition is expected to exist but hold zero rows.
    """
    # Note that May 5 is missing
    csv = """2014-05-04T23:59:59.998Z,1
2014-05-04T23:59:59.999Z,2
2014-05-04T23:59:59.999Z,3
2014-05-06T00:00:00.000Z,4
2014-05-06T00:00:00.001Z,5"""

    ts, rows = self.__load_csv_data(csv)

    # (day partition, expected row count): empty 5th, two rows on the 6th,
    # three rows on the 4th.
    expected_counts = (
        ('d05', 0),
        ('d06', 2),
        ('d04', 3),
    )
    for day_name, expected in expected_counts:
        day_table = getattr(ts.root_group.y2014.m05, day_name).ts_data
        self.assertEqual(day_table.nrows, expected)

def test_append_no_data(self):
    """Appending an empty data set must complete without raising."""
    # Empty CSV text: nothing to parse and nothing to append.
    empty_csv = ""

    _, frame = self.__load_csv_data(empty_csv)

    self.assertEqual(frame['price'].size, 0)












Expand Down
9 changes: 6 additions & 3 deletions src/tstables/tstable.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,14 +180,16 @@ def __get_min_ts(self):

return min_ts



def read_range(self,start_dt,end_dt,as_pandas_dataframe=True):
    """Read the rows whose timestamps fall between ``start_dt`` and ``end_dt``.

    Parameters:
        start_dt: datetime marking the start of the range. A naive datetime
            (``tzinfo is None``) is interpreted as UTC.
        end_dt: datetime marking the end of the range. A naive datetime is
            interpreted as UTC. Must not be earlier than ``start_dt``;
            ``start_dt == end_dt`` is allowed.
        as_pandas_dataframe: presumably selects a pandas DataFrame as the
            return type -- NOTE(review): its use is not visible here, confirm.

    Raises:
        AttributeError: if ``start_dt`` is later than ``end_dt``.
    """
    # Convert start_dt and end_dt to UTC if they are naive
    if start_dt.tzinfo is None:
        start_dt = pytz.utc.localize(start_dt)
    if end_dt.tzinfo is None:
        end_dt = pytz.utc.localize(end_dt)

    # Only a reversed range is an error. The previous check (start_dt < end_dt)
    # was inverted and rejected every ordinary forward range.
    if start_dt > end_dt:
        raise AttributeError('start_dt must be <= end_dt')


partitions = self.__dtrange_to_partition_ranges(start_dt,end_dt)
Expand All @@ -208,11 +210,12 @@ def read_range(self,start_dt,end_dt,as_pandas_dataframe=True):

return result


def append(self,rows,convert_strings=False):
# This part is specific to pandas support. If rows is a pandas DataFrame, convert it to a
# format suitable to PyTables
if rows.__class__ == pandas.core.frame.DataFrame:
if rows.empty:
return # Do nothing if we are appending nothing
if rows.index.__class__ != pandas.tseries.index.DatetimeIndex:
raise ValueError('when rows is a DataFrame, the index must be a DatetimeIndex.')

Expand Down

0 comments on commit bdf9480

Please sign in to comment.