
Commit 87e738a

feat: 🎸 incorporate ngrams analysis
1 parent: 5830953

File tree

12 files changed: +536 -17 lines


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -3,3 +3,4 @@ __pycache__
 __private__
 /build
 /dist
+/analysis_outputs

analyzer_interface/__init__.py

Lines changed: 3 additions & 0 deletions

from .interface import AnalyzerInterface, InputColumn, OutputColumn, AnalyzerInput, AnalyzerOutput, DataType
from .column_automap import column_automap, UserInputColumn
from .data_type_compatibility import get_data_type_compatibility_score
analyzer_interface/column_automap.py

Lines changed: 54 additions & 0 deletions

from pydantic import BaseModel

from .interface import DataType, InputColumn
from .data_type_compatibility import get_data_type_compatibility_score


class UserInputColumn(BaseModel):
    name: str
    data_type: DataType


def column_automap(user_columns: list[UserInputColumn], input_schema_columns: list[InputColumn]):
    """
    Matches user-provided columns to the expected input columns based on data
    type compatibility and column name hints.

    The resulting dictionary is keyed by the expected input column name.
    """
    matches: dict[str, str] = {}
    for user_column in user_columns:
        max_score = None
        best_match_input_column = None
        for input_column in input_schema_columns:
            current_score = get_data_type_compatibility_score(
                input_column.data_type, user_column.data_type
            )

            # Don't consider type-incompatible columns
            if current_score is None:
                continue

            # Boost the score if we have a name hint match, so that:
            # - among equally type-compatible matches, those with name hints
            #   are preferred
            # - among name hint matches, those with the best data type
            #   compatibility are preferred
            if any(
                check_name_hint(user_column.name, hint)
                for hint in input_column.name_hints
            ):
                current_score += 10

            if max_score is None or current_score > max_score:
                max_score = current_score
                best_match_input_column = input_column

        if best_match_input_column is not None:
            matches[best_match_input_column.name] = user_column.name

    return matches


def check_name_hint(name: str, hint: str):
    """
    Returns true if every word in the hint (split by spaces) is present in the
    name, in a case-insensitive manner.
    """
    return all(word.lower().strip() in name.lower() for word in hint.split(" "))
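For a concrete sense of how the mapper behaves, here is a small usage sketch; the user column names and hints below are invented for illustration and are not part of the commit:

from analyzer_interface import InputColumn, UserInputColumn, column_automap

# Hypothetical columns as they might appear in a user's CSV.
user_columns = [
    UserInputColumn(name="Tweet Text", data_type="text"),
    UserInputColumn(name="User Name", data_type="text"),
]

# Expected schema columns, each with name hints.
schema_columns = [
    InputColumn(name="message_text", data_type="text",
                name_hints=["text", "tweet"]),
    InputColumn(name="user_id", data_type="identifier",
                name_hints=["user name"]),
]

print(column_automap(user_columns, schema_columns))
# {'message_text': 'Tweet Text', 'user_id': 'User Name'}

Note that "User Name" still maps to user_id even though its data type is only weakly compatible (text for an expected identifier), because the +10 name-hint boost dominates the small type penalties.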
analyzer_interface/data_type_compatibility.py

Lines changed: 32 additions & 0 deletions

from .interface import DataType

data_type_mapping_preference: dict[DataType, list[list[DataType]]] = {
    "text": [["text"], ["identifier", "url"]],
    "integer": [["integer"]],
    "float": [["float", "integer"]],
    "boolean": [["boolean"]],
    "datetime": [["datetime"]],
    "identifier": [["identifier"], ["integer"], ["url"], ["text"]],
    "url": [["url"]]
}
"""
For each data type, a list of lists of data types that are considered compatible
with it. The first list is the most preferred, the last list is the least. The
items in each list are considered equally compatible.
"""


def get_data_type_compatibility_score(expected_data_type: DataType, actual_data_type: DataType):
    """
    Returns a score for the compatibility of the actual data type with the
    expected data type. Higher (less negative) scores are better.
    `None` means the data types are not compatible.
    """
    if expected_data_type == actual_data_type:
        return 0

    for i, preference_list in enumerate(data_type_mapping_preference[expected_data_type]):
        if actual_data_type in preference_list:
            return -(i + 1)

    return None
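To make the scoring rule concrete, a few illustrative calls (the values follow directly from the preference table above):

from analyzer_interface import get_data_type_compatibility_score

# An exact type match scores 0, the best possible score.
assert get_data_type_compatibility_score("text", "text") == 0

# An expected "identifier" accepts fallbacks with decreasing preference;
# the score is the negated 1-based index of the matching preference list.
assert get_data_type_compatibility_score("identifier", "integer") == -2
assert get_data_type_compatibility_score("identifier", "url") == -3
assert get_data_type_compatibility_score("identifier", "text") == -4

# Types absent from the preference lists are incompatible.
assert get_data_type_compatibility_score("boolean", "text") is None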

analyzer_interface/interface.py

Lines changed: 106 additions & 0 deletions

from typing import Callable, Literal, Optional

from pydantic import BaseModel


class AnalyzerInterface(BaseModel):
    id: str
    """
    The static ID for the analyzer that, with the version, uniquely identifies the
    analyzer and will be stored as metadata as part of the output data.
    """

    version: str
    """
    The version ID for the analyzer. In the future, we may choose to support output
    migration between versions of the same analyzer.
    """

    name: str
    """
    The short human-readable name of the analyzer.
    """

    short_description: str
    """
    A short, one-line description of what the analyzer does.
    """

    long_description: Optional[str] = None
    """
    A longer description of what the analyzer does that will be shown separately.
    """

    input: "AnalyzerInput"
    """
    Specifies the input data schema for the analyzer.
    """

    outputs: list["AnalyzerOutput"]
    """
    Specifies the output data schemas for the analyzer.
    """

    entry_point: Callable
    """
    The entry point should be a function that accepts the input dataframe and
    returns a dictionary of output dataframes.
    """


class AnalyzerInput(BaseModel):
    columns: list["InputColumn"]


class AnalyzerOutput(BaseModel):
    id: str
    """
    Uniquely identifies the output data schema for the analyzer. The analyzer
    must include this key in the output dictionary.
    """

    name: str
    """The human-friendly name for the output."""

    description: Optional[str] = None

    columns: list["OutputColumn"]


DataType = Literal[
    "text", "integer", "float", "boolean", "datetime", "identifier", "url"
]
"""
The semantic data type for a data column. This is not quite the same as
structural data types like polars or pandas or even arrow types; rather, it
represents how the data is intended to be interpreted.

- `text` is expected to be free-form, human-readable text content.
- `integer` and `float` are meant to be manipulated arithmetically.
- `boolean` is a binary value.
- `datetime` represents a point in time and is meant to be manipulated as a time value.
- `identifier` is a unique identifier for a record. It is not expected to be manipulated in any way.
- `url` is a string that represents a URL.
"""


class Column(BaseModel):
    name: str
    description: Optional[str] = None
    data_type: DataType


class InputColumn(Column):
    name_hints: list[str] = []
    """
    Specifies a list of space-separated words that are likely to be found in the
    column name of the user-provided data. This is used to help the user map the
    input columns to the expected columns.

    Any individual hint matching is sufficient for a match to be called. The hint
    in turn is matched if every word matches some part of the column name.
    """


class OutputColumn(Column):
    pass
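To make the hint-matching rule concrete, a short sketch using the check_name_hint helper from column_automap (the column names are invented):

from analyzer_interface.column_automap import check_name_hint

# A hint matches when every one of its space-separated words appears
# somewhere in the column name, case-insensitively.
assert check_name_hint("Screen_Name", "screen name")   # "screen" and "name" both appear
assert check_name_hint("AuthorUserName", "user name")  # substring matches count
assert not check_name_hint("user_id", "screen name")   # "screen" is missing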

analyzers/__init__.py

Lines changed: 5 additions & 0 deletions

from .ngrams import interface as ngrams

all_analyzers = [
    ngrams
]

analyzers/ngrams/__init__.py

Lines changed: 2 additions & 0 deletions

from .main import analyze_ngrams
from .interface import interface

analyzers/ngrams/interface.py

Lines changed: 75 additions & 0 deletions

from analyzer_interface import (AnalyzerInput, AnalyzerInterface,
                                AnalyzerOutput, InputColumn, OutputColumn)

from .main import (MESSAGE__ID, MESSAGE__TEXT,
                   NGRAM__ID, NGRAM__LENGTH,
                   NGRAM__WORDS, MESSAGE__NGRAM_COUNT, AUTHOR__ID, analyze_ngrams)

interface = AnalyzerInterface(
    id="ngrams",
    version="0.1.0",
    name="ngrams",
    short_description="Extracts n-grams from text data",
    long_description="""
The n-gram analysis extracts n-grams (sequences of n words) from the text data
in the input and counts the occurrences of each n-gram in each message, linking
the message author to the n-gram frequency.

The result can be used to see whether certain word sequences are more common in
the corpus of text, and whether certain authors use these sequences more often.
""",
    input=AnalyzerInput(columns=[
        InputColumn(
            name=AUTHOR__ID,
            data_type="identifier",
            description="The unique identifier of the author of the message",
            name_hints=["author", "user", "poster", "username",
                        "screen name", "user name", "name", "email"]
        ),
        InputColumn(
            name=MESSAGE__ID,
            data_type="identifier",
            description="The unique identifier of the message",
            name_hints=["post", "message", "comment",
                        "text", "retweet id", "tweet"]
        ),
        InputColumn(
            name=MESSAGE__TEXT,
            data_type="text",
            description="The text content of the message",
            name_hints=["message", "text", "comment",
                        "post", "body", "content", "tweet"]
        )
    ]),
    outputs=[
        AnalyzerOutput(
            id="message_ngrams",
            name="N-gram count per message",
            columns=[
                OutputColumn(name=MESSAGE__ID, data_type="identifier"),
                OutputColumn(name=NGRAM__ID, data_type="identifier"),
                OutputColumn(name=MESSAGE__NGRAM_COUNT, data_type="integer")
            ]
        ),
        AnalyzerOutput(
            id="ngrams",
            name="N-gram definitions",
            description="The word compositions of each unique n-gram",
            columns=[
                OutputColumn(name=NGRAM__ID, data_type="identifier"),
                OutputColumn(name=NGRAM__WORDS, data_type="text"),
                OutputColumn(name=NGRAM__LENGTH, data_type="integer")
            ]
        ),
        AnalyzerOutput(
            id="message_authors",
            name="Message authorship",
            description="Links each message to its author",
            columns=[
                OutputColumn(name=AUTHOR__ID, data_type="identifier"),
                OutputColumn(name=MESSAGE__ID, data_type="identifier")
            ]
        )
    ],
    entry_point=analyze_ngrams
)
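Putting it together, a hypothetical end-to-end run of the analyzer; the sample data is invented, the input column names follow the constants defined in main.py, and polars is assumed to be installed:

import polars as pl

from analyzers.ngrams import interface

df = pl.DataFrame({
    "user_id": ["u1", "u2"],
    "message_id": ["m1", "m2"],
    "message_text": [
        "the quick brown fox jumps over the lazy dog",
        "the quick brown fox naps",
    ],
})

outputs = interface.entry_point(df)
print(outputs["ngrams"])           # unique n-grams: ngram_id, words, n
print(outputs["message_ngrams"])   # n-gram counts per message
print(outputs["message_authors"])  # user_id / message_id pairs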

analyzers/ngrams/main.py

Lines changed: 80 additions & 0 deletions

import polars as pl
import re


AUTHOR__ID = "user_id"
MESSAGE__ID = "message_id"
MESSAGE__TEXT = "message_text"
MESSAGE__NGRAM_COUNT = "count"
NGRAM__ID = "ngram_id"
NGRAM__WORDS = "words"
NGRAM__LENGTH = "n"


def analyze_ngrams(df_input: pl.DataFrame):
    df_input = df_input.filter(pl.col(MESSAGE__TEXT).is_not_null())

    def get_ngram_rows(ngrams_by_id: dict[str, int]):
        num_rows = df_input.height
        current_row = 0
        for row in df_input.iter_rows(named=True):
            tokens = tokenize(row[MESSAGE__TEXT])
            for ngram in ngrams(tokens, 3, 5):
                serialized_ngram = serialize_ngram(ngram)
                if serialized_ngram not in ngrams_by_id:
                    ngrams_by_id[serialized_ngram] = len(ngrams_by_id)
                ngram_id = ngrams_by_id[serialized_ngram]
                yield {
                    MESSAGE__ID: row[MESSAGE__ID],
                    NGRAM__ID: ngram_id
                }
            current_row = current_row + 1
            if current_row % 100 == 0:
                print(
                    current_row, "/", num_rows, "rows processed; found",
                    len(ngrams_by_id), "ngrams", end="\r"
                )

    ngrams_by_id: dict[str, int] = {}

    df_message_ngrams = (
        pl.DataFrame(get_ngram_rows(ngrams_by_id))
        .group_by(MESSAGE__ID, NGRAM__ID)
        .agg(pl.count().alias(MESSAGE__NGRAM_COUNT))
    )
    df_ngrams = pl.DataFrame({
        NGRAM__ID: list(ngrams_by_id.values()),
        NGRAM__WORDS: list(ngrams_by_id.keys())
    }).with_columns([
        pl.col(NGRAM__WORDS)
        .str.split(" ")
        .list.len()
        .alias(NGRAM__LENGTH)
    ])
    df_message_authors = df_input.select([AUTHOR__ID, MESSAGE__ID])

    return {
        "message_ngrams": df_message_ngrams,
        "ngrams": df_ngrams,
        "message_authors": df_message_authors
    }


def tokenize(input: str) -> list[str]:
    """Splits the input string into lowercase word tokens."""
    # Filter out the empty tokens that re.split yields when the text
    # starts or ends with a non-word character.
    return [token for token in re.split(r'\W+', input.lower()) if token]


def ngrams(tokens: list[str], min: int, max: int):
    """Generates all n-grams with min <= n <= max from a list of tokens."""
    for i in range(len(tokens) - min + 1):
        for n in range(min, max + 1):
            if i + n > len(tokens):
                break
            yield tokens[i:i + n]


def serialize_ngram(ngram: list[str]) -> str:
    """Generates a string that uniquely represents an n-gram."""
    return " ".join(ngram)
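To illustrate the sliding-window behavior of tokenize and ngrams, a small sketch (the sample sentence is invented):

from analyzers.ngrams.main import ngrams, serialize_ngram, tokenize

tokens = tokenize("The quick brown fox jumps!")  # 5 lowercase tokens
for ngram in ngrams(tokens, 3, 5):
    print(serialize_ngram(ngram))

# the quick brown
# the quick brown fox
# the quick brown fox jumps
# quick brown fox
# quick brown fox jumps
# brown fox jumps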
