misc code updates
cuihaoleo committed Apr 7, 2023
1 parent c27fb6d commit c571559
Showing 13 changed files with 299 additions and 361 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -173,7 +173,7 @@ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / stateme
[BASIC]

# Good variable names which should always be accepted, separated by a comma
-good-names=main,_
+good-names=main,_,G

# Bad variable names which should always be refused, separated by a comma
bad-names=
4 changes: 1 addition & 3 deletions privacy_policy_analyzer/__init__.py
@@ -1,6 +1,4 @@
-from spacy.tokens import Span, Token
-
-import privacy_policy_analyzer.named_entity_recognition
+from spacy.tokens import Token
from privacy_policy_analyzer import utils

Token.set_extension("src", getter=utils.token_to_source)
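For reference, a minimal sketch of what the retained extension registration above provides. The getter below is only a stand-in for utils.token_to_source (its real return value may differ), and the blank pipeline and sample sentence are illustrative.

import spacy
from spacy.tokens import Token

def token_to_source(token):
    # stand-in for utils.token_to_source: identify a token by (doc id, offset)
    return (token.doc.user_data.get("id"), token.i)

# same registration call as in privacy_policy_analyzer/__init__.py
Token.set_extension("src", getter=token_to_source, force=True)

nlp = spacy.blank("en")
doc = nlp("We collect your email address.")
doc.user_data["id"] = (0, 0)
print(doc[3]._.src)  # ((0, 0), 3)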
77 changes: 41 additions & 36 deletions privacy_policy_analyzer/annotators/collection_annotator.py
@@ -35,19 +35,17 @@ def build_dependency_graph(root_token: Token):
"""

def is_interrogative(token: Token):
-if token.sent[-1].lemma_ == "?":
-return True
-
while token.dep_ == "conj":
token = token.head

left_edge = token.left_edge

-return left_edge.head == token and token.left_edge.tag_ in (
-"VBP", # _Do_ we ...
-"MD", # _Will_ we ...
-"WRB", # _When/How_ do we ...
-"WP", # _What_ information do we ...
+return left_edge.head == token and (left_edge.pos_, left_edge.tag_) in (
+('AUX', "VBP"), # _Do_ we ...
+('AUX', "VBZ"), # _Does_ this app ...
+('AUX', "MD"), # _Will_ we ...
+('SCONJ', "WRB"), # _When/How_ do we ...
+('PRON', "WP"), # _What_ do we ...
)

def is_negative(token: Token):
@@ -102,12 +100,23 @@ def handle_xcomp(parent_token: Token, xcomp_root_token: Token):
if data["dep"] == "obj":
modified_dep_tree.add_edge(xcomp_root_token, node, dep="subj")

def handle_ccomp(parent_token: Token, ccomp_root_token: Token):
modified_dep_tree.add_edge(parent_token, ccomp_root_token, dep="ccomp")
dfs(ccomp_root_token)

def handle_appos(parent_token: Token, appos_token: Token):
modified_dep_tree.add_node(appos_token, negation=is_negative(parent_token))

for grand_parent_token, _, data in modified_dep_tree.in_edges(parent_token, data=True):
modified_dep_tree.add_edge(grand_parent_token, appos_token, **data)

def handler_factory(graph_dep: str):
def func(parent_token: Token, child_token: Token):
modified_dep_tree.add_edge(parent_token, child_token, dep=graph_dep)
dfs(child_token)

return func

def find_all_children(current_token: Token):
children = list(current_token.children)
existing_deps = {t.dep_ for t in current_token.children}
@@ -132,6 +141,16 @@ def find_all_children(current_token: Token):
important_deps = IMPORTANT_DEPS_OF_POS.get(current_token.pos_, [])
return sorted(filter(lambda t: t.dep_ in important_deps, children))

dep_handlers = {
"nsubj" : handler_factory("subj"),
"agent" : handle_agent,
"dative": handle_dative,
"appos" : handle_appos,
"xcomp" : handle_xcomp,
"ccomp" : handle_ccomp,
}
dep_handlers["dobj"] = dep_handlers["nsubjpass"] = dep_handlers["pobj"] = handler_factory("obj")

def dfs(current_token: Token):
# Check negation
modified_dep_tree.nodes[current_token]["negation"] = is_negative(current_token)
@@ -141,35 +160,19 @@ def dfs(current_token: Token):
conjuncts = immediate_child.conjuncts
dependency = immediate_child.dep_

+if dependency not in dep_handlers:
+dep_handlers[dependency] = handler_factory(dependency)
+
for child in itertools.chain([immediate_child], conjuncts):
-match dependency:
-case "nsubj":
-# Nominal subject
-modified_dep_tree.add_edge(current_token, child, dep="subj")
-dfs(child)
-case "dobj" | "nsubjpass" | "pobj":
-# Object or passive subject
-modified_dep_tree.add_edge(current_token, child, dep="obj")
-dfs(child)
-case "agent":
-# "by" after a passive verb. Assign grandchildren as "subj" (subject).
-handle_agent(current_token, child)
-case "dative":
-# Dative or indirect object
-# give us something => us: dative
-# is given to us => to: dative, us: pobj
-handle_dative(current_token, child)
-case "xcomp":
-handle_xcomp(current_token, child)
-case "appos":
-handle_appos(current_token, child)
-case _:
-modified_dep_tree.add_edge(current_token, child, dep=dependency)
-dfs(child)
+if not is_interrogative(child):
+dep_handlers[dependency](current_token, child)

modified_dep_tree = nx.DiGraph()
modified_dep_tree.add_node("")

+if root_token.sent[-1].lemma_ == "?":
+return modified_dep_tree
+
for token in itertools.chain([root_token], root_token.conjuncts):
if not is_interrogative(token):
modified_dep_tree.add_edge("", token, dep="root")
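A minimal sketch of the dispatch that replaces the match statement above: handlers are looked up in dep_handlers, and unseen dependency labels get a pass-through handler built on demand by handler_factory. Tokens are plain strings and the edge list is made up for illustration; in the annotator they are spaCy tokens and dependency tags, and the real handlers also recurse via dfs().

import networkx as nx

modified_dep_tree = nx.DiGraph()

def handler_factory(graph_dep):
    def func(parent, child):
        modified_dep_tree.add_edge(parent, child, dep=graph_dep)
    return func

# a few fixed handlers, as in the diff; unknown labels are created on demand
dep_handlers = {"nsubj": handler_factory("subj")}
dep_handlers["dobj"] = dep_handlers["nsubjpass"] = dep_handlers["pobj"] = handler_factory("obj")

edges = [("share", "we", "nsubj"), ("share", "data", "dobj"), ("share", "partners", "prep")]
for parent, child, dep in edges:
    if dep not in dep_handlers:          # mirrors the new branch in dfs()
        dep_handlers[dep] = handler_factory(dep)
    dep_handlers[dep](parent, child)

print(list(modified_dep_tree.edges(data=True)))
# [('share', 'we', {'dep': 'subj'}), ('share', 'data', {'dep': 'obj'}), ('share', 'partners', {'dep': 'prep'})]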
@@ -309,19 +312,21 @@ class CollectionAnnotator(BaseAnnotator):
ACTION_MAP = {
("COLLECT", False): [(0, 1, "COLLECT")],
("COLLECT", True): [(0, 1, "NOT_COLLECT")],
-("SHARE", False): [(2, 1, "SHARE_WITH"),
+("SHARE", False): [(2, 1, "BE_SHARED"),
(0, 1, "COLLECT")],
-("SHARE", True): [(2, 1, "NOT_SHARE_WITH")],
-("SELL", False): [(2, 1, "SELL_TO"),
+("SHARE", True): [(2, 1, "NOT_BE_SHARED")],
+("SELL", False): [(2, 1, "BE_SOLD"),
(0, 1, "COLLECT")],
-("SELL", True): [(2, 1, "NOT_SELL_TO")],
+("SELL", True): [(2, 1, "NOT_BE_SOLD")],
("USE", False): [(0, 1, "USE")],
("USE", True): [(0, 1, "NOT_USE")],
("STORE", False): [(0, 1, "STORE")],
("STORE", True): [(0, 1, "NOT_STORE")],
}

EDGE_TYPES = frozenset(edge_type for li in ACTION_MAP.values() for _, _, edge_type in li)
NEGATIVE_EDGE_TYPES = frozenset(filter(lambda t: t.startswith("NOT_"), EDGE_TYPES))
POSITIVE_EDGE_TYPES = EDGE_TYPES - NEGATIVE_EDGE_TYPES

def __init__(self, nlp):
super().__init__(nlp)
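The renamed edge labels flow into the new class-level sets; a small self-contained sketch (only part of ACTION_MAP is reproduced) of how EDGE_TYPES, NEGATIVE_EDGE_TYPES and POSITIVE_EDGE_TYPES are derived:

ACTION_MAP = {
    ("COLLECT", False): [(0, 1, "COLLECT")],
    ("COLLECT", True): [(0, 1, "NOT_COLLECT")],
    ("SHARE", False): [(2, 1, "BE_SHARED"), (0, 1, "COLLECT")],
    ("SHARE", True): [(2, 1, "NOT_BE_SHARED")],
}

EDGE_TYPES = frozenset(edge_type for li in ACTION_MAP.values() for _, _, edge_type in li)
NEGATIVE_EDGE_TYPES = frozenset(filter(lambda t: t.startswith("NOT_"), EDGE_TYPES))
POSITIVE_EDGE_TYPES = EDGE_TYPES - NEGATIVE_EDGE_TYPES

print(sorted(POSITIVE_EDGE_TYPES))  # ['BE_SHARED', 'COLLECT']
print(sorted(NEGATIVE_EDGE_TYPES))  # ['NOT_BE_SHARED', 'NOT_COLLECT']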
@@ -73,7 +73,7 @@ def link_coref(coref, coref_main, reason):

for noun_phrase in sent.ents:
found = False
-startswith_det = noun_phrase[0].lemma_ in {"this", "that", "these", "those"}
+startswith_det = noun_phrase[0].lemma_ in {"this", "that", "these", "those", "such"}

if startswith_det and noun_phrase[0].head == noun_phrase[-1]:
# Resolve this/that/these/those xxx
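A rough sketch of the determiner check that now also covers "such": take a noun phrase, test whether its first token is one of the resolvable determiners and attaches to the phrase head. The real code walks sent.ents produced by the project's NER; noun_chunks and the small English model are used here only so the snippet runs standalone.

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("We may share such information with our partners.")

RESOLVABLE = {"this", "that", "these", "those", "such"}

for noun_phrase in doc.noun_chunks:
    first = noun_phrase[0]
    startswith_det = first.lemma_ in RESOLVABLE
    if startswith_det and first.head == noun_phrase[-1]:
        # prints "such information", assuming the parser attaches "such" to "information"
        print("candidate for coreference resolution:", noun_phrase.text)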
16 changes: 8 additions & 8 deletions privacy_policy_analyzer/annotators/purpose_annotator.py
@@ -1,15 +1,18 @@
from spacy.matcher import DependencyMatcher

-from ..named_entity_recognition import ACTOR_KEYWORDS, DATATYPE_KEYWORDS, TRIVIAL_WORDS
+from ..utils import TRIVIAL_WORDS
from .base import BaseAnnotator
from .collection_annotator import CollectionAnnotator


class PurposeValidator:
ADDITIONAL_STOP_WORDS = frozenset({*TRIVIAL_WORDS, "purpose", "reason", "use"})

def __init__(self, vocab):
self.deny_matcher = DependencyMatcher(vocab)

patterns = []
# For ... period
patterns.append([
{
"RIGHT_ID": "anchor",
@@ -20,7 +23,7 @@ def __init__(self, vocab):
"REL_OP": ">",
"RIGHT_ID": "r00",
"RIGHT_ATTRS": {"LEMMA": {"IN": [
-"day", "week", "month", "year", "period", "time", "instance"]
+"day", "week", "month", "year", "period", "time", "instance", "duration"]
}}
},
])
@@ -52,11 +55,6 @@ def __init__(self, vocab):
])
self.deny_matcher.add("DENY", patterns)

-self.additional_stop_words = {"purpose", "reason", "use"}
-self.additional_stop_words.update(ACTOR_KEYWORDS)
-self.additional_stop_words.update(DATATYPE_KEYWORDS)
-self.additional_stop_words.update(TRIVIAL_WORDS)
-
def __call__(self, span):
if self.deny_matcher(span):
return False
@@ -65,7 +63,9 @@ def __call__(self, span):
return False

for token in span:
-if not(token.is_stop or token.lemma_ in self.additional_stop_words):
+if (not token.is_stop
+and token.ent_type_ not in ('DATA', 'ACTOR')
+and token.lemma_ not in self.ADDITIONAL_STOP_WORDS):
return True


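A rough sketch of the rewritten span filter in PurposeValidator.__call__: a candidate purpose span is accepted only if it still contains a token that is not a stop word, not part of a DATA/ACTOR entity, and not in the extra stop list. Plain (lemma, is_stop, ent_type) tuples stand in for spaCy tokens, and the tiny stop list omits TRIVIAL_WORDS.

ADDITIONAL_STOP_WORDS = frozenset({"purpose", "reason", "use"})  # plus TRIVIAL_WORDS in the real class

def has_meaningful_token(span):
    for lemma, is_stop, ent_type in span:
        if (not is_stop
                and ent_type not in ("DATA", "ACTOR")
                and lemma not in ADDITIONAL_STOP_WORDS):
            return True
    return False

# "for advertising purposes": "advertising" survives the filter -> accepted
print(has_meaningful_token([("for", True, ""), ("advertising", False, ""), ("purpose", False, "")]))  # True
# "for this purpose": nothing meaningful remains -> rejected
print(has_meaningful_token([("for", True, ""), ("this", True, ""), ("purpose", False, "")]))  # False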
33 changes: 22 additions & 11 deletions privacy_policy_analyzer/annotators/subject_annotator.py
@@ -14,10 +14,11 @@ def __init__(self, nlp):

self.matcher = DependencyMatcher(self.vocab)

# from/about children / minors / kids
pattern = [
{
"RIGHT_ID": "anchor",
-"RIGHT_ATTRS": {"LEMMA": "from", "DEP": "prep"},
+"RIGHT_ATTRS": {"LEMMA": {"IN": ["from", "about"]}, "DEP": "prep"},
},
{
"LEFT_ID": "anchor",
@@ -29,35 +30,45 @@ def __init__(self, nlp):

self.matcher.add("FROM_CHILDREN", [pattern])

# from ... under ... age/years
pattern = [
{
"RIGHT_ID": "anchor",
-"RIGHT_ATTRS": {"LEMMA": "from", "DEP": "prep"},
+"RIGHT_ATTRS": {"LEMMA": {"IN": ["from", "about"]}, "DEP": "prep"},
},
{
"LEFT_ID": "anchor",
"REL_OP": ">>",
"RIGHT_ID": "prep_under",
-"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(under|of|in)$"}, "DEP": "prep"}
+"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(under|of)$"}, "DEP": "prep"}
},
{
"LEFT_ID": "prep_under",
"REL_OP": ">",
"RIGHT_ID": "pobj_age",
-"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(age|year|\d+)$"}, "DEP": "pobj"}
+"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(age|year|old|\d+)$"}, "DEP": "pobj"}
},
]

self.matcher.add("UNDER_AGE", [pattern])

-def annotate(self, document):
-def poss_is_children(root_token):
-for token in root_token.subtree:
-if token.dep_ == "poss" and token.lemma_ in ("child", "minor", "kid"):
-return True
+# children's information
+pattern = [
+{
+"RIGHT_ID": "anchor",
+"RIGHT_ATTRS": {"ENT_TYPE": "DATA"},
+},
+{
+"LEFT_ID": "anchor",
+"REL_OP": ">",
+"RIGHT_ID": "poss",
+"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(child|minor|kid)$"}, "DEP": "poss"}
+},
+]
+
-return False
+self.matcher.add("CHILDREN_POSS", [pattern])

+def annotate(self, document):
visited_data_src = set()

for _, src2, relationship in document.token_relationship.edges(keys=True):
@@ -67,6 +78,6 @@ def poss_is_children(root_token):
data_token = document.get_token_with_src(src2)
sentence = data_token.sent

-if len(self.matcher(sentence)) > 0 or poss_is_children(data_token):
+if len(self.matcher(sentence)) > 0:
self.logger.info("Set children as data subject: %r", sentence.text)
document.token_relationship.nodes[src2]['subject'] = 'children'
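The nested poss_is_children() helper is gone; children's-data possessives are now caught by a third DependencyMatcher pattern. A hedged sketch of such a pattern follows: the real CHILDREN_POSS rule anchors on ENT_TYPE "DATA" from the project's NER, which a stock pipeline does not produce, so this version anchors on the lemma "information" instead (requires the en_core_web_sm model).

import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

pattern = [
    {
        "RIGHT_ID": "anchor",
        "RIGHT_ATTRS": {"LEMMA": "information"},   # the real rule uses {"ENT_TYPE": "DATA"}
    },
    {
        "LEFT_ID": "anchor",
        "REL_OP": ">",
        "RIGHT_ID": "poss",
        "RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(child|minor|kid)$"}, "DEP": "poss"},
    },
]
matcher.add("CHILDREN_POSS", [pattern])

doc = nlp("We never sell children's information to third parties.")
print(len(matcher(doc)) > 0)  # True if the parser attaches "children" as poss of "information"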
15 changes: 6 additions & 9 deletions privacy_policy_analyzer/document.py
@@ -126,11 +126,10 @@ class PolicyDocument:
"""Container of privacy policy document"""

@classmethod
-def initialize(cls, workdir, nlp):
+def initialize(cls, workdir, nlp: Language):
obj = cls(flag=True)
obj.workdir = Path(workdir)
obj.token_relationship = nx.MultiDiGraph()
-obj.nlp = nlp

with open(obj.workdir / "accessibility_tree.json", encoding="utf-8") as fin:
accessibility_tree = json.load(fin)
@@ -166,16 +165,15 @@ def initialize(cls, workdir, nlp):
return obj

@classmethod
-def load(cls, workdir, nlp):
+def load(cls, workdir, nlp: Language):
obj = cls(flag=True)
obj.workdir = Path(workdir)
-obj.nlp = nlp

with open(obj.workdir / "document.pickle", "rb") as fin:
(obj.token_relationship, obj.segments, docbin_bytes) = pickle.load(fin)

serialized_docs = DocBin().from_bytes(docbin_bytes)
-obj.all_docs = dict()
+obj.all_docs = {}

for doc in serialized_docs.get_docs(nlp.vocab):
doc_id = doc.user_data["id"]
@@ -190,9 +188,8 @@ def __init__(self, **kwargs):
# Make linter happy
self.workdir: Path
self.all_docs: dict[tuple[int, int], Doc]
-self.token_relationship: nx.DiGraph
+self.token_relationship: nx.MultiDiGraph
self.segments: list[DocumentSegment]
-self.nlp: Language

def print_tree(self):
with io.StringIO() as fout:
@@ -250,15 +247,15 @@ def get_relations(self, token1, token2):

yield from self.token_relationship.get_edge_data(src1, src2)

-def get_all_links(self, token, direction=None):
+def get_all_links(self, token: Token, direction="out"):
doc = token.doc
source_rmap = doc.user_data["source_rmap"]

if token._.src is None:
return

match direction:
-case None | "out":
+case "out":
edge_view = self.token_relationship.out_edges(token._.src, keys=True)
case "in":
edge_view = self.token_relationship.in_edges(token._.src, keys=True)
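The corrected annotation matters because token_relationship holds parallel edges keyed by relationship name (see the edges(keys=True) loops elsewhere in the codebase). A small sketch of the difference between MultiDiGraph and DiGraph, with made-up source tuples and edge keys:

import networkx as nx

g = nx.MultiDiGraph()
src1, src2 = ("seg0", 3), ("seg0", 7)      # stand-ins for token source tuples
g.add_edge(src1, src2, key="COLLECT")
g.add_edge(src1, src2, key="BE_SHARED")    # parallel edge, kept separately

print(list(g.edges(keys=True)))
# [(('seg0', 3), ('seg0', 7), 'COLLECT'), (('seg0', 3), ('seg0', 7), 'BE_SHARED')]

d = nx.DiGraph()
d.add_edge(src1, src2)
d.add_edge(src1, src2)                     # second call just updates the single edge
print(d.number_of_edges())                 # 1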
14 changes: 7 additions & 7 deletions privacy_policy_analyzer/extra-data/phrase_map.yml
@@ -1,7 +1,7 @@
DATA:
UNSPECIFIC:
-- '^(information|data|datum)(we collect| about (you|user|our user))?$'
- '^(technology|data|datum|information)$'
+- '^(information|data|datum)( we collect| you provide| about (you|user|our user))?$'

personal information:
- '^((your|user|the)\s+)?personal (data|info|information|datum)$'
@@ -107,8 +107,8 @@ DATA:
- '(customer|emergency) contact$'

person name:
-- '(first|last|family|middle|given|real|legal|maiden|person|contact|passenger|personal|full|sur)\s*name'
-- '^(your )?name$'
+- '(your|first|last|family|middle|given|real|legal|maiden|person|contact|passenger|personal|full|sur)\s*name'
+- '^name$'

phone number:
- '(tele)?phone (mobile )?number'
@@ -214,6 +214,9 @@ DATA:
- '(anonymous|pseudonymous)\b.*\bidentifier'

ACTOR:
IGNORE:
- '^(you|user|customer|visitor|friend)$'

we:
- '^(we|us|i)$'
- '^our (service|(web)?site|product|server|app(lication|s)?|mobile application|system|software|company|business|platform)$'
@@ -227,6 +230,7 @@ ACTOR:
- '(other|another|external|outside) (service|provider|partner|vendor|website|site|company|platform|contractor|party|business|organi[sz]ation|entities)$'
- '(our|other|another|external|outside) (partner|vendor)'
- '^(our|other|another|external|outside)?\b.*\bservice provider$'
- '^(affiliate|app|application|business|company|corporation|organization|partner|party|product|provider|service|site|software|subsidiary|vendor|website)$'
- '!(ad(vertis\w+)?|social|(e-?)?mail|analytics?|measurement|market(ing)?|track(ing)?|content|search|payment|government|auth(entication)?)'

advertiser:
@@ -255,7 +259,3 @@ ACTOR:
- '(e-?mail|electronic mail)(ing)?\b.*\b(provider|service|vendor)'
- 'provider of\b.*\bemail'
- 'service provider\b.*\bsend e-?mail'
-
-# Not in use
-search engine:
-- 'search (engine|information|result)\b.*\b(provider)?'
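A quick check of how the updated "person name" patterns behave; applying them with re.search over normalized phrases is an assumption about how the phrase map is consumed.

import re

person_name_patterns = [
    r'(your|first|last|family|middle|given|real|legal|maiden|person|contact|passenger|personal|full|sur)\s*name',
    r'^name$',
]

for phrase in ["your name", "full name", "name", "domain name system"]:
    hit = any(re.search(p, phrase) for p in person_name_patterns)
    print(phrase, "->", hit)
# your name -> True, full name -> True, name -> True, domain name system -> False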