Skip to content

Commit

Permalink
improve entity domain creation performance (#119)
Browse files Browse the repository at this point in the history
- add validator for EntityDomain initialization
- and avoid add_entity()
  • Loading branch information
semio committed Nov 11, 2019
1 parent 313fc66 commit 2c40c18
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 5 deletions.
16 changes: 14 additions & 2 deletions ddf_utils/model/ddf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pathlib import Path
from itertools import product
from tqdm import tqdm
from collections import OrderedDict
from collections import OrderedDict, Counter

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -75,9 +75,21 @@ def to_dict(self, pkey=None):
@attr.s(auto_attribs=True)
class EntityDomain:
id: str
entities: List[Entity]
entities: List[Entity] = attr.ib(factory=list)
props: dict = attr.ib(factory=dict)

@entities.validator
def _check_entities_identity(self, attribute, value):
    """Validate that no entity id occurs more than once in ``value``.

    Registered as the validator for the ``entities`` field. Every id that
    appears more than once is reported through ``logger.critical`` before
    the check fails, so all offending ids are logged, not just the first.

    :param attribute: the attribute being validated (not used here).
    :param value: iterable of entities; each item must expose an ``id``.
    :raises ValueError: if at least one id is found more than once.
    """
    occurrences = Counter(entity.id for entity in value)
    # Keep only the ids that were seen more than once, in first-seen order.
    duplicates = {k: v for k, v in occurrences.items() if v > 1}
    for k, v in duplicates.items():
        logger.critical(f"entity {k} exists {v} times in entity table!")
    if duplicates:
        raise ValueError("duplicated entity detected")

@property
def entity_sets(self):
sets = set()
Expand Down
9 changes: 6 additions & 3 deletions ddf_utils/model/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,11 +245,14 @@ def load_ddf(self):
# load entities
entities = list(self._gen_entities(concepts))
domains = dict()
domains_tmp = dict()
for domain, entity in entities:
if domain not in domains.keys():
domains[domain] = EntityDomain(id=domain, entities=[])
if domain not in domains_tmp.keys():
domains_tmp[domain] = list()
domains_tmp[domain].append(entity)

domains[domain].add_entity(entity)
for domain, entities_ in domains_tmp.items():
domains[domain] = EntityDomain(id=domain, entities=entities_)

# load datapoints. Here we will use Dask for all
# 1. create categories for entity domains
Expand Down

0 comments on commit 2c40c18

Please sign in to comment.