Skip to content

Commit

Permalink
new constructor for entity domain
Browse files Browse the repository at this point in the history
related issue #119
  • Loading branch information
semio committed Nov 11, 2019
1 parent 2c40c18 commit 5f18088
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 4 deletions.
26 changes: 23 additions & 3 deletions ddf_utils/model/ddf.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class EntityDomain:

@entities.validator
def _check_entities_identity(self, attribute, value):
entities_id_list = [x.id for x in value]
entities_id_list = self.entity_ids
counter = Counter(entities_id_list)
error = False
for k, v in counter.items():
Expand All @@ -90,6 +90,27 @@ def _check_entities_identity(self, attribute, value):
if error:
raise ValueError("duplicated entity detected")

@classmethod
def from_entity_list(cls, domain_id, entities, allow_duplicated=True, **kwargs):
    """Alternate constructor: build a domain from a list of entities.

    :param domain_id: id of the domain to create; also used as the
        ``domain`` of every merged entity.
    :param entities: entity objects exposing ``id``, ``sets`` and ``props``.
    :param allow_duplicated: when false, ``entities`` is handed to the
        regular constructor untouched (the ``entities`` validator is then
        expected to reject duplicated ids -- confirm against the validator).
        When true, entities sharing an id are merged into a single entity.
    :param kwargs: stored as the ``props`` of the new domain.
    :return: a new instance of ``cls``.

    Merge rules for entities sharing an id: ``sets`` become the union of
    all their sets in first-seen order, and ``props`` are applied in input
    order, so a later entity overrides an earlier one on the same key.
    """
    if allow_duplicated:
        # One blank entity per distinct id; the dict keeps ids in
        # first-seen order and collapses repeated ids onto one slot.
        merged = {
            source.id: Entity(id=source.id, domain=domain_id, sets=[], props={})
            for source in entities
        }
        # Fold every input entity into the blank entity for its id.
        for source in entities:
            target = merged[source.id]
            for set_name in source.sets:
                if set_name not in target.sets:
                    target.sets.append(set_name)
            target.props.update(source.props)
        domain_entities = list(merged.values())
    else:
        domain_entities = entities
    return cls(id=domain_id, entities=domain_entities, props=kwargs)

@property
def entity_ids(self):
    """Return the ids of all entities in ``self.entities``, in stored order.

    Duplicated ids are kept; nothing is filtered or de-duplicated here.
    """
    ids = []
    for entity in self.entities:
        ids.append(entity.id)
    return ids

@property
def entity_sets(self):
sets = set()
Expand All @@ -102,8 +123,7 @@ def get_entity_set(self, s):
return [e for e in self.entities if s in e.sets]

def has_entity(self, sid):
all_ids = [e.id for e in self.entities]
return sid in all_ids
return sid in self.entity_ids

def to_dict(self, eset=None):
if eset:
Expand Down
5 changes: 4 additions & 1 deletion ddf_utils/model/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,10 @@ def load_ddf(self):
domains_tmp[domain].append(entity)

for domain, entities_ in domains_tmp.items():
domains[domain] = EntityDomain(id=domain, entities=entities_)
# TODO: maybe get properties from concepts table
# Allow duplicated entities because the same entity may be defined in
# multiple resources, e.g. in several entity sets stored in separate files.
domains[domain] = EntityDomain.from_entity_list(domain_id=domain, entities=entities_, allow_duplicated=True)

# load datapoints. Here we will use Dask for all
# 1. create categories for entity domains
Expand Down

0 comments on commit 5f18088

Please sign in to comment.