misc code updates
cuihaoleo committed Apr 7, 2023
1 parent c27fb6d commit c571559
Showing 13 changed files with 299 additions and 361 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -173,7 +173,7 @@ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / stateme
[BASIC]

# Good variable names which should always be accepted, separated by a comma
-good-names=main,_
+good-names=main,_,G

# Bad variable names which should always be refused, separated by a comma
bad-names=
4 changes: 1 addition & 3 deletions privacy_policy_analyzer/__init__.py
@@ -1,6 +1,4 @@
-from spacy.tokens import Span, Token
-
-import privacy_policy_analyzer.named_entity_recognition
+from spacy.tokens import Token
from privacy_policy_analyzer import utils

Token.set_extension("src", getter=utils.token_to_source)
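For reference, a minimal sketch of what the retained extension registration above provides. The getter below is only a stand-in for utils.token_to_source (its real return value may differ), and the blank pipeline and sample sentence are illustrative.

import spacy
from spacy.tokens import Token

def token_to_source(token):
    # stand-in for utils.token_to_source: identify a token by (doc id, offset)
    return (token.doc.user_data.get("id"), token.i)

# same registration call as in privacy_policy_analyzer/__init__.py
Token.set_extension("src", getter=token_to_source, force=True)

nlp = spacy.blank("en")
doc = nlp("We collect your email address.")
doc.user_data["id"] = (0, 0)
print(doc[3]._.src)  # ((0, 0), 3)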
77 changes: 41 additions & 36 deletions privacy_policy_analyzer/annotators/collection_annotator.py
@@ -35,19 +35,17 @@ def build_dependency_graph(root_token: Token):
"""

def is_interrogative(token: Token):
-if token.sent[-1].lemma_ == "?":
-return True
-
while token.dep_ == "conj":
token = token.head

left_edge = token.left_edge

-return left_edge.head == token and token.left_edge.tag_ in (
-"VBP", # _Do_ we ...
-"MD", # _Will_ we ...
-"WRB", # _When/How_ do we ...
-"WP", # _What_ information do we ...
+return left_edge.head == token and (left_edge.pos_, left_edge.tag_) in (
+('AUX', "VBP"), # _Do_ we ...
+('AUX', "VBZ"), # _Does_ this app ...
+('AUX', "MD"), # _Will_ we ...
+('SCONJ', "WRB"), # _When/How_ do we ...
+('PRON', "WP"), # _What_ do we ...
)

def is_negative(token: Token):
@@ -102,12 +100,23 @@ def handle_xcomp(parent_token: Token, xcomp_root_token: Token):
if data["dep"] == "obj":
modified_dep_tree.add_edge(xcomp_root_token, node, dep="subj")

def handle_ccomp(parent_token: Token, ccomp_root_token: Token):
modified_dep_tree.add_edge(parent_token, ccomp_root_token, dep="ccomp")
dfs(ccomp_root_token)

def handle_appos(parent_token: Token, appos_token: Token):
modified_dep_tree.add_node(appos_token, negation=is_negative(parent_token))

for grand_parent_token, _, data in modified_dep_tree.in_edges(parent_token, data=True):
modified_dep_tree.add_edge(grand_parent_token, appos_token, **data)

def handler_factory(graph_dep: str):
def func(parent_token: Token, child_token: Token):
modified_dep_tree.add_edge(parent_token, child_token, dep=graph_dep)
dfs(child_token)

return func

def find_all_children(current_token: Token):
children = list(current_token.children)
existing_deps = {t.dep_ for t in current_token.children}
@@ -132,6 +141,16 @@ def find_all_children(current_token: Token):
important_deps = IMPORTANT_DEPS_OF_POS.get(current_token.pos_, [])
return sorted(filter(lambda t: t.dep_ in important_deps, children))

dep_handlers = {
"nsubj" : handler_factory("subj"),
"agent" : handle_agent,
"dative": handle_dative,
"appos" : handle_appos,
"xcomp" : handle_xcomp,
"ccomp" : handle_ccomp,
}
dep_handlers["dobj"] = dep_handlers["nsubjpass"] = dep_handlers["pobj"] = handler_factory("obj")

def dfs(current_token: Token):
# Check negation
modified_dep_tree.nodes[current_token]["negation"] = is_negative(current_token)
@@ -141,35 +160,19 @@ def dfs(current_token: Token):
conjuncts = immediate_child.conjuncts
dependency = immediate_child.dep_

+if dependency not in dep_handlers:
+dep_handlers[dependency] = handler_factory(dependency)
+
for child in itertools.chain([immediate_child], conjuncts):
-match dependency:
-case "nsubj":
-# Nominal subject
-modified_dep_tree.add_edge(current_token, child, dep="subj")
-dfs(child)
-case "dobj" | "nsubjpass" | "pobj":
-# Object or passive subject
-modified_dep_tree.add_edge(current_token, child, dep="obj")
-dfs(child)
-case "agent":
-# "by" after a passive verb. Assign grandchildren as "subj" (subject).
-handle_agent(current_token, child)
-case "dative":
-# Dative or indirect object
-# give us something => us: dative
-# is given to us => to: dative, us: pobj
-handle_dative(current_token, child)
-case "xcomp":
-handle_xcomp(current_token, child)
-case "appos":
-handle_appos(current_token, child)
-case _:
-modified_dep_tree.add_edge(current_token, child, dep=dependency)
-dfs(child)
+if not is_interrogative(child):
+dep_handlers[dependency](current_token, child)

modified_dep_tree = nx.DiGraph()
modified_dep_tree.add_node("")

+if root_token.sent[-1].lemma_ == "?":
+return modified_dep_tree
+
for token in itertools.chain([root_token], root_token.conjuncts):
if not is_interrogative(token):
modified_dep_tree.add_edge("", token, dep="root")
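A minimal sketch of the dispatch that replaces the match statement above: handlers are looked up in dep_handlers, and unseen dependency labels get a pass-through handler built on demand by handler_factory. Tokens are plain strings and the edge list is made up for illustration; in the annotator they are spaCy tokens and dependency tags, and the real handlers also recurse via dfs().

import networkx as nx

modified_dep_tree = nx.DiGraph()

def handler_factory(graph_dep):
    def func(parent, child):
        modified_dep_tree.add_edge(parent, child, dep=graph_dep)
    return func

# a few fixed handlers, as in the diff; unknown labels are created on demand
dep_handlers = {"nsubj": handler_factory("subj")}
dep_handlers["dobj"] = dep_handlers["nsubjpass"] = dep_handlers["pobj"] = handler_factory("obj")

edges = [("share", "we", "nsubj"), ("share", "data", "dobj"), ("share", "partners", "prep")]
for parent, child, dep in edges:
    if dep not in dep_handlers:          # mirrors the new branch in dfs()
        dep_handlers[dep] = handler_factory(dep)
    dep_handlers[dep](parent, child)

print(list(modified_dep_tree.edges(data=True)))
# [('share', 'we', {'dep': 'subj'}), ('share', 'data', {'dep': 'obj'}), ('share', 'partners', {'dep': 'prep'})]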
@@ -309,19 +312,21 @@ class CollectionAnnotator(BaseAnnotator):
ACTION_MAP = {
("COLLECT", False): [(0, 1, "COLLECT")],
("COLLECT", True): [(0, 1, "NOT_COLLECT")],
-("SHARE", False): [(2, 1, "SHARE_WITH"),
+("SHARE", False): [(2, 1, "BE_SHARED"),
(0, 1, "COLLECT")],
-("SHARE", True): [(2, 1, "NOT_SHARE_WITH")],
-("SELL", False): [(2, 1, "SELL_TO"),
+("SHARE", True): [(2, 1, "NOT_BE_SHARED")],
+("SELL", False): [(2, 1, "BE_SOLD"),
(0, 1, "COLLECT")],
-("SELL", True): [(2, 1, "NOT_SELL_TO")],
+("SELL", True): [(2, 1, "NOT_BE_SOLD")],
("USE", False): [(0, 1, "USE")],
("USE", True): [(0, 1, "NOT_USE")],
("STORE", False): [(0, 1, "STORE")],
("STORE", True): [(0, 1, "NOT_STORE")],
}

EDGE_TYPES = frozenset(edge_type for li in ACTION_MAP.values() for _, _, edge_type in li)
NEGATIVE_EDGE_TYPES = frozenset(filter(lambda t: t.startswith("NOT_"), EDGE_TYPES))
POSITIVE_EDGE_TYPES = EDGE_TYPES - NEGATIVE_EDGE_TYPES

def __init__(self, nlp):
super().__init__(nlp)
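The renamed edge labels flow into the new class-level sets; a small self-contained sketch (only part of ACTION_MAP is reproduced) of how EDGE_TYPES, NEGATIVE_EDGE_TYPES and POSITIVE_EDGE_TYPES are derived:

ACTION_MAP = {
    ("COLLECT", False): [(0, 1, "COLLECT")],
    ("COLLECT", True): [(0, 1, "NOT_COLLECT")],
    ("SHARE", False): [(2, 1, "BE_SHARED"), (0, 1, "COLLECT")],
    ("SHARE", True): [(2, 1, "NOT_BE_SHARED")],
}

EDGE_TYPES = frozenset(edge_type for li in ACTION_MAP.values() for _, _, edge_type in li)
NEGATIVE_EDGE_TYPES = frozenset(filter(lambda t: t.startswith("NOT_"), EDGE_TYPES))
POSITIVE_EDGE_TYPES = EDGE_TYPES - NEGATIVE_EDGE_TYPES

print(sorted(POSITIVE_EDGE_TYPES))  # ['BE_SHARED', 'COLLECT']
print(sorted(NEGATIVE_EDGE_TYPES))  # ['NOT_BE_SHARED', 'NOT_COLLECT']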
@@ -73,7 +73,7 @@ def link_coref(coref, coref_main, reason):

for noun_phrase in sent.ents:
found = False
-startswith_det = noun_phrase[0].lemma_ in {"this", "that", "these", "those"}
+startswith_det = noun_phrase[0].lemma_ in {"this", "that", "these", "those", "such"}

if startswith_det and noun_phrase[0].head == noun_phrase[-1]:
# Resolve this/that/these/those xxx
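A rough sketch of the determiner check that now also covers "such": take a noun phrase, test whether its first token is one of the resolvable determiners and attaches to the phrase head. The real code walks sent.ents produced by the project's NER; noun_chunks and the small English model are used here only so the snippet runs standalone.

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("We may share such information with our partners.")

RESOLVABLE = {"this", "that", "these", "those", "such"}

for noun_phrase in doc.noun_chunks:
    first = noun_phrase[0]
    startswith_det = first.lemma_ in RESOLVABLE
    if startswith_det and first.head == noun_phrase[-1]:
        # prints "such information", assuming the parser attaches "such" to "information"
        print("candidate for coreference resolution:", noun_phrase.text)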
16 changes: 8 additions & 8 deletions privacy_policy_analyzer/annotators/purpose_annotator.py
@@ -1,15 +1,18 @@
from spacy.matcher import DependencyMatcher

-from ..named_entity_recognition import ACTOR_KEYWORDS, DATATYPE_KEYWORDS, TRIVIAL_WORDS
+from ..utils import TRIVIAL_WORDS
from .base import BaseAnnotator
from .collection_annotator import CollectionAnnotator


class PurposeValidator:
ADDITIONAL_STOP_WORDS = frozenset({*TRIVIAL_WORDS, "purpose", "reason", "use"})

def __init__(self, vocab):
self.deny_matcher = DependencyMatcher(vocab)

patterns = []
# For ... period
patterns.append([
{
"RIGHT_ID": "anchor",
@@ -20,7 +23,7 @@ def __init__(self, vocab):
"REL_OP": ">",
"RIGHT_ID": "r00",
"RIGHT_ATTRS": {"LEMMA": {"IN": [
-"day", "week", "month", "year", "period", "time", "instance"]
+"day", "week", "month", "year", "period", "time", "instance", "duration"]
}}
},
])
@@ -52,11 +55,6 @@ def __init__(self, vocab):
])
self.deny_matcher.add("DENY", patterns)

-self.additional_stop_words = {"purpose", "reason", "use"}
-self.additional_stop_words.update(ACTOR_KEYWORDS)
-self.additional_stop_words.update(DATATYPE_KEYWORDS)
-self.additional_stop_words.update(TRIVIAL_WORDS)
-
def __call__(self, span):
if self.deny_matcher(span):
return False
@@ -65,7 +63,9 @@ def __call__(self, span):
return False

for token in span:
-if not(token.is_stop or token.lemma_ in self.additional_stop_words):
+if (not token.is_stop
+and token.ent_type_ not in ('DATA', 'ACTOR')
+and token.lemma_ not in self.ADDITIONAL_STOP_WORDS):
return True


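A rough sketch of the rewritten span filter in PurposeValidator.__call__: a candidate purpose span is accepted only if it still contains a token that is not a stop word, not part of a DATA/ACTOR entity, and not in the extra stop list. Plain (lemma, is_stop, ent_type) tuples stand in for spaCy tokens, and the tiny stop list omits TRIVIAL_WORDS.

ADDITIONAL_STOP_WORDS = frozenset({"purpose", "reason", "use"})  # plus TRIVIAL_WORDS in the real class

def has_meaningful_token(span):
    for lemma, is_stop, ent_type in span:
        if (not is_stop
                and ent_type not in ("DATA", "ACTOR")
                and lemma not in ADDITIONAL_STOP_WORDS):
            return True
    return False

# "for advertising purposes": "advertising" survives the filter -> accepted
print(has_meaningful_token([("for", True, ""), ("advertising", False, ""), ("purpose", False, "")]))  # True
# "for this purpose": nothing meaningful remains -> rejected
print(has_meaningful_token([("for", True, ""), ("this", True, ""), ("purpose", False, "")]))  # False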
33 changes: 22 additions & 11 deletions privacy_policy_analyzer/annotators/subject_annotator.py
@@ -14,10 +14,11 @@ def __init__(self, nlp):

self.matcher = DependencyMatcher(self.vocab)

# from/about children / minors / kids
pattern = [
{
"RIGHT_ID": "anchor",
-"RIGHT_ATTRS": {"LEMMA": "from", "DEP": "prep"},
+"RIGHT_ATTRS": {"LEMMA": {"IN": ["from", "about"]}, "DEP": "prep"},
},
{
"LEFT_ID": "anchor",
@@ -29,35 +30,45 @@ def __init__(self, nlp):

self.matcher.add("FROM_CHILDREN", [pattern])

# from ... under ... age/years
pattern = [
{
"RIGHT_ID": "anchor",
-"RIGHT_ATTRS": {"LEMMA": "from", "DEP": "prep"},
+"RIGHT_ATTRS": {"LEMMA": {"IN": ["from", "about"]}, "DEP": "prep"},
},
{
"LEFT_ID": "anchor",
"REL_OP": ">>",
"RIGHT_ID": "prep_under",
-"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(under|of|in)$"}, "DEP": "prep"}
+"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(under|of)$"}, "DEP": "prep"}
},
{
"LEFT_ID": "prep_under",
"REL_OP": ">",
"RIGHT_ID": "pobj_age",
-"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(age|year|\d+)$"}, "DEP": "pobj"}
+"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(age|year|old|\d+)$"}, "DEP": "pobj"}
},
]

self.matcher.add("UNDER_AGE", [pattern])

-def annotate(self, document):
-def poss_is_children(root_token):
-for token in root_token.subtree:
-if token.dep_ == "poss" and token.lemma_ in ("child", "minor", "kid"):
-return True
+# children's information
+pattern = [
+{
+"RIGHT_ID": "anchor",
+"RIGHT_ATTRS": {"ENT_TYPE": "DATA"},
+},
+{
+"LEFT_ID": "anchor",
+"REL_OP": ">",
+"RIGHT_ID": "poss",
+"RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(child|minor|kid)$"}, "DEP": "poss"}
+},
+]
+
-return False
+self.matcher.add("CHILDREN_POSS", [pattern])

+def annotate(self, document):
visited_data_src = set()

for _, src2, relationship in document.token_relationship.edges(keys=True):
@@ -67,6 +78,6 @@ def poss_is_children(root_token):
data_token = document.get_token_with_src(src2)
sentence = data_token.sent

-if len(self.matcher(sentence)) > 0 or poss_is_children(data_token):
+if len(self.matcher(sentence)) > 0:
self.logger.info("Set children as data subject: %r", sentence.text)
document.token_relationship.nodes[src2]['subject'] = 'children'
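The nested poss_is_children() helper is gone; children's-data possessives are now caught by a third DependencyMatcher pattern. A hedged sketch of such a pattern follows: the real CHILDREN_POSS rule anchors on ENT_TYPE "DATA" from the project's NER, which a stock pipeline does not produce, so this version anchors on the lemma "information" instead (requires the en_core_web_sm model).

import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

pattern = [
    {
        "RIGHT_ID": "anchor",
        "RIGHT_ATTRS": {"LEMMA": "information"},   # the real rule uses {"ENT_TYPE": "DATA"}
    },
    {
        "LEFT_ID": "anchor",
        "REL_OP": ">",
        "RIGHT_ID": "poss",
        "RIGHT_ATTRS": {"LEMMA": {"REGEX": r"^(child|minor|kid)$"}, "DEP": "poss"},
    },
]
matcher.add("CHILDREN_POSS", [pattern])

doc = nlp("We never sell children's information to third parties.")
print(len(matcher(doc)) > 0)  # True if the parser attaches "children" as poss of "information"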
15 changes: 6 additions & 9 deletions privacy_policy_analyzer/document.py
@@ -126,11 +126,10 @@ class PolicyDocument:
"""Container of privacy policy document"""

@classmethod
-def initialize(cls, workdir, nlp):
+def initialize(cls, workdir, nlp: Language):
obj = cls(flag=True)
obj.workdir = Path(workdir)
obj.token_relationship = nx.MultiDiGraph()
-obj.nlp = nlp

with open(obj.workdir / "accessibility_tree.json", encoding="utf-8") as fin:
accessibility_tree = json.load(fin)
@@ -166,16 +165,15 @@ def initialize(cls, workdir, nlp):
return obj

@classmethod
-def load(cls, workdir, nlp):
+def load(cls, workdir, nlp: Language):
obj = cls(flag=True)
obj.workdir = Path(workdir)
-obj.nlp = nlp

with open(obj.workdir / "document.pickle", "rb") as fin:
(obj.token_relationship, obj.segments, docbin_bytes) = pickle.load(fin)

serialized_docs = DocBin().from_bytes(docbin_bytes)
-obj.all_docs = dict()
+obj.all_docs = {}

for doc in serialized_docs.get_docs(nlp.vocab):
doc_id = doc.user_data["id"]
@@ -190,9 +188,8 @@ def __init__(self, **kwargs):
# Make linter happy
self.workdir: Path
self.all_docs: dict[tuple[int, int], Doc]
-self.token_relationship: nx.DiGraph
+self.token_relationship: nx.MultiDiGraph
self.segments: list[DocumentSegment]
-self.nlp: Language

def print_tree(self):
with io.StringIO() as fout:
@@ -250,15 +247,15 @@ def get_relations(self, token1, token2):

yield from self.token_relationship.get_edge_data(src1, src2)

-def get_all_links(self, token, direction=None):
+def get_all_links(self, token: Token, direction="out"):
doc = token.doc
source_rmap = doc.user_data["source_rmap"]

if token._.src is None:
return

match direction:
-case None | "out":
+case "out":
edge_view = self.token_relationship.out_edges(token._.src, keys=True)
case "in":
edge_view = self.token_relationship.in_edges(token._.src, keys=True)
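The corrected annotation matters because token_relationship holds parallel edges keyed by relationship name (see the edges(keys=True) loops elsewhere in the codebase). A small sketch of the difference between MultiDiGraph and DiGraph, with made-up source tuples and edge keys:

import networkx as nx

g = nx.MultiDiGraph()
src1, src2 = ("seg0", 3), ("seg0", 7)      # stand-ins for token source tuples
g.add_edge(src1, src2, key="COLLECT")
g.add_edge(src1, src2, key="BE_SHARED")    # parallel edge, kept separately

print(list(g.edges(keys=True)))
# [(('seg0', 3), ('seg0', 7), 'COLLECT'), (('seg0', 3), ('seg0', 7), 'BE_SHARED')]

d = nx.DiGraph()
d.add_edge(src1, src2)
d.add_edge(src1, src2)                     # second call just updates the single edge
print(d.number_of_edges())                 # 1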
14 changes: 7 additions & 7 deletions privacy_policy_analyzer/extra-data/phrase_map.yml
@@ -1,7 +1,7 @@
DATA:
UNSPECIFIC:
-- '^(information|data|datum)(we collect| about (you|user|our user))?$'
- '^(technology|data|datum|information)$'
+- '^(information|data|datum)( we collect| you provide| about (you|user|our user))?$'

personal information:
- '^((your|user|the)\s+)?personal (data|info|information|datum)$'
@@ -107,8 +107,8 @@ DATA:
- '(customer|emergency) contact$'

person name:
-- '(first|last|family|middle|given|real|legal|maiden|person|contact|passenger|personal|full|sur)\s*name'
-- '^(your )?name$'
+- '(your|first|last|family|middle|given|real|legal|maiden|person|contact|passenger|personal|full|sur)\s*name'
+- '^name$'

phone number:
- '(tele)?phone (mobile )?number'
@@ -214,6 +214,9 @@ DATA:
- '(anonymous|pseudonymous)\b.*\bidentifier'

ACTOR:
IGNORE:
- '^(you|user|customer|visitor|friend)$'

we:
- '^(we|us|i)$'
- '^our (service|(web)?site|product|server|app(lication|s)?|mobile application|system|software|company|business|platform)$'
@@ -227,6 +230,7 @@ ACTOR:
- '(other|another|external|outside) (service|provider|partner|vendor|website|site|company|platform|contractor|party|business|organi[sz]ation|entities)$'
- '(our|other|another|external|outside) (partner|vendor)'
- '^(our|other|another|external|outside)?\b.*\bservice provider$'
- '^(affiliate|app|application|business|company|corporation|organization|partner|party|product|provider|service|site|software|subsidiary|vendor|website)$'
- '!(ad(vertis\w+)?|social|(e-?)?mail|analytics?|measurement|market(ing)?|track(ing)?|content|search|payment|government|auth(entication)?)'

advertiser:
@@ -255,7 +259,3 @@ ACTOR:
- '(e-?mail|electronic mail)(ing)?\b.*\b(provider|service|vendor)'
- 'provider of\b.*\bemail'
- 'service provider\b.*\bsend e-?mail'
-
-# Not in use
-search engine:
-- 'search (engine|information|result)\b.*\b(provider)?'
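A quick check of how the updated "person name" patterns behave; applying them with re.search over normalized phrases is an assumption about how the phrase map is consumed.

import re

person_name_patterns = [
    r'(your|first|last|family|middle|given|real|legal|maiden|person|contact|passenger|personal|full|sur)\s*name',
    r'^name$',
]

for phrase in ["your name", "full name", "name", "domain name system"]:
    hit = any(re.search(p, phrase) for p in person_name_patterns)
    print(phrase, "->", hit)
# your name -> True, full name -> True, name -> True, domain name system -> False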