From 5f180883434ef6a0305e3df3bb5535e70c955ef2 Mon Sep 17 00:00:00 2001 From: semio Date: Mon, 11 Nov 2019 17:17:04 +0800 Subject: [PATCH] new constructor for entity domain related issue #119 --- ddf_utils/model/ddf.py | 26 +++++++++++++++++++++++--- ddf_utils/model/package.py | 5 ++++- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/ddf_utils/model/ddf.py b/ddf_utils/model/ddf.py index 5a84767..865bb9c 100644 --- a/ddf_utils/model/ddf.py +++ b/ddf_utils/model/ddf.py @@ -80,7 +80,7 @@ class EntityDomain: @entities.validator def _check_entities_identity(self, attribute, value): - entities_id_list = [x.id for x in value] + entities_id_list = self.entity_ids counter = Counter(entities_id_list) error = False for k, v in counter.items(): @@ -90,6 +90,27 @@ def _check_entities_identity(self, attribute, value): if error: raise ValueError("duplicated entity detected") + @classmethod + def from_entity_list(cls, domain_id, entities, allow_duplicated=True, **kwargs): + if not allow_duplicated: + return cls(id=domain_id, entities=entities, props=kwargs) + # if there are duplicates, we need to combine all duplicates + # now construct a new entities list without duplicates + entity_ids = [x.id for x in entities] + entities_new = dict((i, Entity(id=i, domain=domain_id, sets=[], props={})) for i in entity_ids) + for x in entities: + en = entities_new[x.id] + for s in x.sets: + if s not in en.sets: + en.sets.append(s) + en.props.update(x.props) + entities_new[x.id] = en + return cls(id=domain_id, entities=list(entities_new.values()), props=kwargs) + + @property + def entity_ids(self): + return [x.id for x in self.entities] + @property def entity_sets(self): sets = set() @@ -102,8 +123,7 @@ def get_entity_set(self, s): return [e for e in self.entities if s in e.sets] def has_entity(self, sid): - all_ids = [e.id for e in self.entities] - return sid in all_ids + return sid in self.entity_ids def to_dict(self, eset=None): if eset: diff --git a/ddf_utils/model/package.py 
b/ddf_utils/model/package.py index 6220e6f..74a4e58 100644 --- a/ddf_utils/model/package.py +++ b/ddf_utils/model/package.py @@ -252,7 +252,10 @@ def load_ddf(self): domains_tmp[domain].append(entity) for domain, entities_ in domains_tmp.items(): - domains[domain] = EntityDomain(id=domain, entities=entities_) + # TODO: maybe get properties from concepts table + # Allow duplicated entities because they may be defined in multiple resources, + # i.e. multiple entity sets in separate files. + domains[domain] = EntityDomain.from_entity_list(domain_id=domain, entities=entities_, allow_duplicated=True) # load datapoints. Here we will use Dask for all # 1. create categories for entity domains