Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add parse_ixbrl_diskcache_version to accelerate access to parsed iXBRLs. #90

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
setuptools==63.2.0
requests~=2.28.1
urllib3~=1.26.11
urllib3~=1.26.11
diskcache~=5.4.0
8 changes: 8 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from invoke import task
from pathlib import Path

@task(default=True)
def setup(c):
    """Register this project's root directory on the PYTHONPATH environment variable.

    Uses fhopecc.winman.addpath, which persists the change in the Windows
    environment so the package is importable from anywhere.
    """
    from fhopecc.winman import addpath
    project_root = Path(__file__).parent
    addpath(str(project_root), 'PYTHONPATH')
35 changes: 35 additions & 0 deletions tests/test_linkbase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import unittest

class Test(unittest.TestCase):
    """Smoke test: parse a TIFRS presentation linkbase and render its tree view.

    NOTE(review): this depends on a hard-coded local Windows path, so it only
    runs on a machine with the TIFRS taxonomy installed at that location.
    Consider shipping a small fixture under tests/data instead.
    """

    def test(self):
        from xbrl.linkbase import parse_linkbase, LinkbaseType
        # Use separate names for the source path and the parsed linkbase;
        # the original reused one variable for both, which hurt readability.
        pre_path = r'D:\tifrs\tifrs-20200630\BSCI\tifrs-bsci-ci-2020-06-30-presentation.xml'
        linkbase = parse_linkbase(pre_path, linkbase_type=LinkbaseType.PRESENTATION)
        first_link = linkbase.extended_links[0]
        print(first_link.treeview())


if __name__ == '__main__':
unittest.main()
21 changes: 12 additions & 9 deletions tests/test_local_taxonomy.py → tests/test_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,32 @@
from xbrl.taxonomy import parse_taxonomy, TaxonomySchema
import logging


class TaxonomySchemaTest(unittest.TestCase):
"""
Unit test for taxonomy.test_parse_taxonomy()
"""

@unittest.skip('跳過')
def test_parse_taxonomy(self):
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
cache_dir: str = './cache/'
cache: HttpCache = HttpCache(cache_dir)
print(f"Saving to {cache_dir}")

extension_schema_path: str = './tests/data/example.xsd'
# extension_schema_path: str = './data/example.xsd'
tax: TaxonomySchema = parse_taxonomy(extension_schema_path, cache)
print(tax)
from pathlib import Path

extension_schema_path = (Path(__file__).parent / 'data/example.xsd').__str__()
tax = parse_taxonomy(extension_schema_path)
srt_tax: TaxonomySchema = tax.get_taxonomy('http://fasb.org/srt/2020-01-31')
self.assertTrue(srt_tax)
self.assertEqual(len(srt_tax.concepts), 489)

# check if the labels where successfully linked to the concept
self.assertEqual(len(tax.concepts['example_Assets'].labels), 2)

def test_parse_tifrs_taxonomy(self):
    """Smoke test: parse a local TIFRS taxonomy schema and print its
    namespace and presentation-linkbase tree views.

    NOTE(review): hard-coded local Windows path — this only runs on a machine
    with the TIFRS taxonomy installed at this exact location; no assertions
    are made, so the test passes as long as parsing does not raise.
    """
    extension_schema_path = r'D:\tifrs\tifrs-20200630\BSCI\tifrs-bsci-bd-2020-06-30.xsd'
    t = parse_taxonomy(extension_schema_path)
    print(t.namespace)
    for p in t.pre_linkbases:
        print(p.treeview())
    # print(t.imports)

if __name__ == '__main__':
unittest.main()
68 changes: 55 additions & 13 deletions xbrl/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,20 @@
import logging
import re
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from datetime import date, datetime
from io import StringIO
from typing import List
from pathlib import Path
from diskcache import Cache
from xbrl import TaxonomyNotFound, InstanceParseException
from xbrl.cache import HttpCache
from xbrl.helper.uri_helper import resolve_uri
from xbrl.helper.xml_parser import parse_file
from xbrl.taxonomy import Concept, TaxonomySchema, parse_taxonomy, parse_common_taxonomy, parse_taxonomy_url
from xbrl.transformations import normalize, TransformationException, TransformationNotImplemented
cache = Cache(Path.home() / 'cache' / 'xbrlinstance')

logger = logging.getLogger(__name__)
LINK_NS: str = "{http://www.xbrl.org/2003/linkbase}"
Expand Down Expand Up @@ -190,13 +194,7 @@ def __str__(self) -> str:
return "{}: {}".format(self.concept.name, str(self.value))

def json(self, **kwargs) -> dict:
if isinstance(self.context, TimeFrameContext):
period: str = f"{self.context.start_date}/{self.context.end_date}"
elif isinstance(self.context, InstantContext):
period: str = str(self.context.instant_date)
else:
period: str = '' # Forever context not specified in REC-2021-10-13

period = self.get_period()
kwargs['value'] = self.value
if 'dimensions' not in kwargs: kwargs['dimensions'] = {}
kwargs['dimensions']['concept'] = self.concept.name
Expand All @@ -206,6 +204,27 @@ def json(self, **kwargs) -> dict:
kwargs['dimensions'][segment.dimension.name] = segment.member.name
return kwargs

def get_period(self) -> str:
    """Return this fact's period as a string.

    Time-frame contexts render as "start/end", instant contexts as the
    instant date; forever contexts yield an empty string because
    REC-2021-10-13 does not specify a representation for them.
    """
    ctx = self.context
    if isinstance(ctx, TimeFrameContext):
        return f"{ctx.start_date}/{ctx.end_date}"
    if isinstance(ctx, InstantContext):
        return str(ctx.instant_date)
    return ''

def to_record(self) -> dict:
    """Flatten this fact into a plain dict (one DataFrame row).

    The 'labels' field holds the first Traditional-Chinese ('zh-tw') label
    text, or an empty string when the concept has none.
    """
    zh_labels = [label.text for label in self.concept.labels
                 if label.language == 'zh-tw']
    return {
        'entity': self.context.entity,
        'concept': self.concept.xml_id,
        'balance': self.concept.balance,
        'labels': zh_labels[0] if zh_labels else '',
        'period_type': self.concept.period_type,
        'period': self.get_period(),
        'segments': self.context.xml_id,
        'value': self.value,
    }

class NumericFact(AbstractFact):
"""
Expand Down Expand Up @@ -312,6 +331,27 @@ def json(self, file_path: str = None, override_fact_ids: bool = True) -> str or
else:
return json.dumps(json_dict)

def instant_facts(self):
    """Build a DataFrame comparing instant-period facts across three periods.

    Keeps only concepts that appear in exactly three periods, pivots the
    periods into columns, and derives year/quarter differences, their rates,
    and each concept's share of total assets.

    NOTE(review): several assumptions here need confirming —
    - exactly three periods per concept are expected (the ``period==3`` filter);
    - ``unstack`` orders the period columns lexicographically, and the rename
      to (last_year, last_quarter, this_quarter) assumes that order matches
      chronology — TODO confirm for the actual period string format;
    - a fact for concept ``ifrs-full_Assets`` is assumed present, otherwise
      ``.iloc[0]`` raises IndexError.
    """
    # One row per instant-period fact (see AbstractFact.to_record()).
    df = pd.DataFrame([f.to_record() for f in self.facts
                       if f.concept.period_type == 'instant'
                       ])
    # Keep only concepts reported in exactly three distinct periods.
    cs = df.groupby(['concept']).period.count().reset_index()
    cs = cs.query('period==3')
    cs = cs.concept.to_list()
    df = df.query('concept in @cs')
    # Pivot: one row per concept, one column per period.
    df = df.groupby(['concept', 'balance', 'labels', 'period']).value.sum().to_frame()
    df = df.unstack('period')
    df.columns = ('last_year', 'last_quarter', 'this_quarter')
    # Absolute and relative changes; rate is NaN when the base period is 0.
    df['year_diff'] = df.this_quarter - df.last_year
    df['quarter_diff'] = df.this_quarter - df.last_quarter
    df['year_diff_rate'] = df.apply(
        lambda r: r.year_diff / r.last_year if r.last_year else np.nan, axis=1)
    df['quarter_diff_rate'] = df.apply(
        lambda r: r.quarter_diff / r.last_quarter if r.last_quarter else np.nan, axis=1)
    # Share of total assets; 'concept' is an index level here, queried by name.
    assets = df.query('concept=="ifrs-full_Assets"').this_quarter.iloc[0]
    df['portion_of_assets'] = df.this_quarter / assets
    df = df.reset_index()
    return df

def parse_xbrl_url(instance_url: str, cache: HttpCache) -> XbrlInstance:
"""
Expand Down Expand Up @@ -408,7 +448,6 @@ def parse_xbrl(instance_path: str, cache: HttpCache, instance_url: str or None =

return XbrlInstance(instance_url if instance_url else instance_path, taxonomy, facts, context_dir, unit_dir)


def parse_ixbrl_url(instance_url: str, cache: HttpCache) -> XbrlInstance:
"""
Parses a inline XBRL (iXBRL) instance file.
Expand All @@ -420,8 +459,7 @@ def parse_ixbrl_url(instance_url: str, cache: HttpCache) -> XbrlInstance:
instance_path: str = cache.cache_file(instance_url)
return parse_ixbrl(instance_path, cache, instance_url)


def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None = None, encoding=None, schema_root=None) -> XbrlInstance:
def parse_ixbrl(instance_path: str, cache=None , instance_url: str or None = None, encoding=None, schema_root=None) -> XbrlInstance:
"""
Parses a inline XBRL (iXBRL) instance file.

Expand All @@ -437,6 +475,9 @@ def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None
to the .getRoot() is missing. This has the benefit, that we can search the document with absolute xpath expressions
=> in the XBRL-parse function root is ET.Element, here just an instance of ElementTree class!
"""
if not cache:
cache = HttpCache(str(Path.home() / 'cache'))
cache.set_headers({'From': 'hook', 'User-Agent': 'Tool/Version (Website)'})

instance_file = open(instance_path, "r", encoding=encoding)
contents = instance_file.read()
Expand Down Expand Up @@ -502,7 +543,7 @@ def parse_ixbrl(instance_path: str, cache: HttpCache, instance_url: str or None
elif fact_elem.tag == '{' + ns_map['ix'] + '}nonNumeric':
fact_value: str = _extract_non_numeric_value(fact_elem)
facts.append(TextFact(concept, context, str(fact_value), xml_id))

instance_file.close()
return XbrlInstance(instance_url if instance_url else instance_path, taxonomy, facts, context_dir, unit_dir)


Expand Down Expand Up @@ -569,8 +610,9 @@ def _extract_non_fraction_value(fact_elem: ET.Element) -> float or None or str:
except TransformationException:
logging.warning(f'Could not transform value "{fact_value}" with format {fact_format}')
return fact_value

scaled_value = float(fact_value) * pow(10, value_scale)
try:
scaled_value = float(fact_value) * pow(10, value_scale)
except: scaled_value = 0
# Floating-point error mitigation
if abs(scaled_value) > 1e6: scaled_value = float(round(scaled_value))
if value_sign == '-':
Expand Down
54 changes: 47 additions & 7 deletions xbrl/linkbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from abc import ABC
from enum import Enum
from typing import List
from treelib import Tree

from xbrl import XbrlParseException, LinkbaseNotFoundException
from xbrl.cache import HttpCache
Expand Down Expand Up @@ -204,8 +205,9 @@ def to_dict(self):
"preferredLabel": self.preferred_label, "locator": self.to_locator.to_dict()}

def __str__(self) -> str:
return "{} {}".format(self.arcrole.split('/')[-1], self.to_locator.concept_id)
return "order is {} {} {}".format(self.order, self.arcrole.split('/')[-1], self.to_locator.concept_id)

def __repr__(self): return self.__str__()

class Label:
"""
Expand Down Expand Up @@ -250,7 +252,6 @@ def __init__(self, label: str, label_type: str, language: str, text: str) -> Non
def __str__(self) -> str:
return self.text


class LabelArc(AbstractArcElement):
"""
Represents a label arc (link:labelArc)
Expand All @@ -259,7 +260,6 @@ class LabelArc(AbstractArcElement):
attribute of a label arc points to multiple label elements

"""

def __init__(self, from_locator, order: int, labels: List[Label]) -> None:
"""
@type from_locator: Locator
Expand Down Expand Up @@ -311,7 +311,11 @@ def __init__(self, href: str, name: str):
self.children: List[AbstractArcElement] = []

def __str__(self) -> str:
return "{} with {} children".format(self.name, len(self.children))
msg = f"{self.name} with {len(self.children)} children"
msg += self.children.__str__()
return msg

def __repr__(self): return self.__str__()

def to_dict(self) -> dict:
"""
Expand Down Expand Up @@ -374,9 +378,22 @@ def to_simple_dict(self) -> dict:
"""
return {"role": self.role, "children": [loc.to_simple_dict() for loc in self.root_locators]}

def __str__(self) -> str:
return self.elr_id

def to_tree(self):
    """Build a treelib Tree mirroring this extended link's locator hierarchy.

    The root node is the link role; each locator becomes a node keyed by its
    href, with presentation arcs followed recursively to their target
    locators' children.
    """
    tree = Tree()
    tree.create_node(self.role, self.role)

    def add_node(item, parent_id):
        if isinstance(item, PresentationArc):
            target = item.to_locator
            tree.create_node(target.name, target.href, parent_id)
            for child in target.children:
                add_node(child, target.href)
        elif isinstance(item, Locator):
            tree.create_node(item.name, item.href, parent_id)
            for child in item.children:
                add_node(child, item.href)

    for root_loc in self.root_locators:
        add_node(root_loc, self.role)
    return tree

class Linkbase:
"""
Expand Down Expand Up @@ -406,6 +423,29 @@ def to_simple_dict(self) -> dict:
"""
return {"standardExtendedLinkElements": [el.to_simple_dict() for el in self.extended_links]}

def treeview(self) -> str:
from treelib import Tree
from treelib.exceptions import DuplicatedNodeIdError
t = Tree()
r = t.create_node(str(self.type), str(self.type))
def make_tree(tree, parent):
match tree:
case PresentationArc():
try:
t.create_node(tree.to_locator.name, tree.to_locator.href, parent)
for c in tree.to_locator.children:
make_tree(c, tree.to_locator.href)
except:pass
case Locator():
t.create_node(tree.name, tree.href, parent)
for c in tree.children:
make_tree(c, tree.href)

for link in self.extended_links:
t.create_node(link.role, link.role, r.identifier)
for loc in link.root_locators:
make_tree(loc, link.role)
return t.show()

def parse_linkbase_url(linkbase_url: str, linkbase_type: LinkbaseType, cache: HttpCache) -> Linkbase:
"""
Expand Down
16 changes: 8 additions & 8 deletions xbrl/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import logging
import os
import xml.etree.ElementTree as ET
from functools import lru_cache
import functools
from typing import List
from urllib.parse import unquote

from xbrl import XbrlParseException, TaxonomyNotFound
from xbrl.cache import HttpCache
from xbrl.helper.uri_helper import resolve_uri, compare_uri
from xbrl.linkbase import Linkbase, ExtendedLink, LinkbaseType, parse_linkbase, parse_linkbase_url, Label
from pathlib import Path

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -459,9 +460,6 @@ def __init__(self, schema_url: str, namespace: str):
# based on the name
self.name_id_map: dict = {}

def __str__(self) -> str:
return self.namespace

def get_taxonomy(self, url: str):
"""
Returns the taxonomy with the given namespace (if it is the current taxonomy, or if it is imported)
Expand Down Expand Up @@ -505,8 +503,7 @@ def parse_common_taxonomy(cache: HttpCache, namespace: str) -> TaxonomySchema or
return parse_taxonomy_url(ns_schema_map[namespace], cache)
return None


@lru_cache(maxsize=60)
@functools.cache
def parse_taxonomy_url(schema_url: str, cache: HttpCache) -> TaxonomySchema:
"""
Parses a taxonomy schema file from the internet
Expand All @@ -521,8 +518,8 @@ def parse_taxonomy_url(schema_url: str, cache: HttpCache) -> TaxonomySchema:
schema_path: str = cache.cache_file(schema_url)
return parse_taxonomy(schema_path, cache, schema_url)


def parse_taxonomy(schema_path: str, cache: HttpCache, schema_url: str or None = None) -> TaxonomySchema:
@functools.cache
def parse_taxonomy(schema_path: str, cache=None, schema_url: str or None = None) -> TaxonomySchema:
"""
Parses a taxonomy schema file.

Expand All @@ -532,6 +529,9 @@ def parse_taxonomy(schema_path: str, cache: HttpCache, schema_url: str or None =
imported schemas from the remote location. If this url is None, the script will try to find those resources locally.
:return: parsed :class:`xbrl.taxonomy.TaxonomySchema` object
"""
if not cache:
cache_dir = r'd:\\cache'
cache = HttpCache(cache_dir)
schema_path = str(schema_path)
if schema_path.startswith('http'): raise XbrlParseException(
'This function only parses locally saved taxonomies. Please use parse_taxonomy_url to parse remote taxonomy schemas')
Expand Down