
Commit 87e738a

feat: 🎸 incorporate ngrams analysis
1 parent: 5830953

File tree

12 files changed: +536 -17 lines


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -3,3 +3,4 @@ __pycache__
 __private__
 /build
 /dist
+/analysis_outputs

analyzer_interface/__init__.py

Lines changed: 3 additions & 0 deletions

from .interface import AnalyzerInterface, InputColumn, OutputColumn, AnalyzerInput, AnalyzerOutput, DataType
from .column_automap import column_automap, UserInputColumn
from .data_type_compatibility import get_data_type_compatibility_score
analyzer_interface/column_automap.py

Lines changed: 54 additions & 0 deletions

from pydantic import BaseModel

from .interface import DataType, InputColumn
from .data_type_compatibility import get_data_type_compatibility_score


class UserInputColumn(BaseModel):
    name: str
    data_type: DataType


def column_automap(user_columns: list[UserInputColumn], input_schema_columns: list[InputColumn]):
    """
    Matches user-provided columns to the expected input columns based on data
    type compatibility and column name hints.

    The resulting dictionary is keyed by the expected input column name.
    """
    matches: dict[str, str] = {}
    for user_column in user_columns:
        max_score = None
        best_match_input_column = None
        for input_column in input_schema_columns:
            current_score = get_data_type_compatibility_score(
                input_column.data_type, user_column.data_type
            )

            # Don't consider type-incompatible columns
            if current_score is None:
                continue

            # Boost the score if we have a name hint match, so that:
            # - among equally type-compatible matches, those with name hints
            #   are preferred
            # - among name hint matches, those with the best data type
            #   compatibility are preferred
            if any(
                check_name_hint(user_column.name, hint)
                for hint in input_column.name_hints
            ):
                current_score += 10

            if max_score is None or current_score > max_score:
                max_score = current_score
                best_match_input_column = input_column

        if best_match_input_column is not None:
            matches[best_match_input_column.name] = user_column.name

    return matches


def check_name_hint(name: str, hint: str):
    """
    Returns true if every word in the hint (split by spaces) is present in the
    name, in a case-insensitive manner.
    """
    return all(word.lower().strip() in name.lower() for word in hint.split(" "))
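For a concrete sense of how the mapper behaves, here is a small usage sketch; the user column names and hints below are invented for illustration and are not part of the commit:

from analyzer_interface import InputColumn, UserInputColumn, column_automap

# Hypothetical columns as they might appear in a user's CSV.
user_columns = [
    UserInputColumn(name="Tweet Text", data_type="text"),
    UserInputColumn(name="User Name", data_type="text"),
]

# Expected schema columns, each with name hints.
schema_columns = [
    InputColumn(name="message_text", data_type="text",
                name_hints=["text", "tweet"]),
    InputColumn(name="user_id", data_type="identifier",
                name_hints=["user name"]),
]

print(column_automap(user_columns, schema_columns))
# {'message_text': 'Tweet Text', 'user_id': 'User Name'}

Note that "User Name" still maps to user_id even though its data type is only weakly compatible (text for an expected identifier), because the +10 name-hint boost dominates the small type penalties.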
analyzer_interface/data_type_compatibility.py

Lines changed: 32 additions & 0 deletions

from .interface import DataType

data_type_mapping_preference: dict[DataType, list[list[DataType]]] = {
    "text": [["text"], ["identifier", "url"]],
    "integer": [["integer"]],
    "float": [["float", "integer"]],
    "boolean": [["boolean"]],
    "datetime": [["datetime"]],
    "identifier": [["identifier"], ["integer"], ["url"], ["text"]],
    "url": [["url"]]
}
"""
For each data type, a list of lists of data types that are considered compatible
with it. The first list is the most preferred, the last list is the least. The
items in each list are considered equally compatible.
"""


def get_data_type_compatibility_score(expected_data_type: DataType, actual_data_type: DataType):
    """
    Returns a score for the compatibility of the actual data type with the
    expected data type. Higher (less negative) scores are better.
    `None` means the data types are not compatible.
    """
    if expected_data_type == actual_data_type:
        return 0

    for i, preference_list in enumerate(data_type_mapping_preference[expected_data_type]):
        if actual_data_type in preference_list:
            return -(i + 1)

    return None
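To make the scoring rule concrete, a few illustrative calls (the values follow directly from the preference table above):

from analyzer_interface import get_data_type_compatibility_score

# An exact type match scores 0, the best possible score.
assert get_data_type_compatibility_score("text", "text") == 0

# An expected "identifier" accepts fallbacks with decreasing preference;
# the score is the negated 1-based index of the matching preference list.
assert get_data_type_compatibility_score("identifier", "integer") == -2
assert get_data_type_compatibility_score("identifier", "url") == -3
assert get_data_type_compatibility_score("identifier", "text") == -4

# Types absent from the preference lists are incompatible.
assert get_data_type_compatibility_score("boolean", "text") is None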

analyzer_interface/interface.py

Lines changed: 106 additions & 0 deletions

from typing import Callable, Literal, Optional

from pydantic import BaseModel


class AnalyzerInterface(BaseModel):
    id: str
    """
    The static ID for the analyzer that, with the version, uniquely identifies the
    analyzer and will be stored as metadata as part of the output data.
    """

    version: str
    """
    The version ID for the analyzer. In the future, we may choose to support output
    migration between versions of the same analyzer.
    """

    name: str
    """
    The short human-readable name of the analyzer.
    """

    short_description: str
    """
    A short, one-line description of what the analyzer does.
    """

    long_description: Optional[str] = None
    """
    A longer description of what the analyzer does that will be shown separately.
    """

    input: "AnalyzerInput"
    """
    Specifies the input data schema for the analyzer.
    """

    outputs: list["AnalyzerOutput"]
    """
    Specifies the output data schemas for the analyzer.
    """

    entry_point: Callable
    """
    The entry point should be a function that accepts the input dataframe and
    returns a dictionary of output dataframes.
    """


class AnalyzerInput(BaseModel):
    columns: list["InputColumn"]


class AnalyzerOutput(BaseModel):
    id: str
    """
    Uniquely identifies the output data schema for the analyzer. The analyzer
    must include this key in the output dictionary.
    """

    name: str
    """The human-friendly name for the output."""

    description: Optional[str] = None

    columns: list["OutputColumn"]


DataType = Literal[
    "text", "integer", "float", "boolean", "datetime", "identifier", "url"
]
"""
The semantic data type for a data column. This is not quite the same as
structural data types like polars or pandas or even arrow types; rather, it
represents how the data is intended to be interpreted.

- `text` is expected to be free-form, human-readable text content.
- `integer` and `float` are meant to be manipulated arithmetically.
- `boolean` is a binary value.
- `datetime` represents a point in time and is meant to be manipulated as a time value.
- `identifier` is a unique identifier for a record. It is not expected to be manipulated in any way.
- `url` is a string that represents a URL.
"""


class Column(BaseModel):
    name: str
    description: Optional[str] = None
    data_type: DataType


class InputColumn(Column):
    name_hints: list[str] = []
    """
    Specifies a list of space-separated words that are likely to be found in the
    column name of the user-provided data. This is used to help the user map the
    input columns to the expected columns.

    Any individual hint matching is sufficient for a match to be called. The hint
    in turn is matched if every word matches some part of the column name.
    """


class OutputColumn(Column):
    pass
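To make the hint-matching rule concrete, a short sketch using the check_name_hint helper from column_automap (the column names are invented):

from analyzer_interface.column_automap import check_name_hint

# A hint matches when every one of its space-separated words appears
# somewhere in the column name, case-insensitively.
assert check_name_hint("Screen_Name", "screen name")   # "screen" and "name" both appear
assert check_name_hint("AuthorUserName", "user name")  # substring matches count
assert not check_name_hint("user_id", "screen name")   # "screen" is missing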

analyzers/__init__.py

Lines changed: 5 additions & 0 deletions

from .ngrams import interface as ngrams

all_analyzers = [
    ngrams
]

analyzers/ngrams/__init__.py

Lines changed: 2 additions & 0 deletions

from .main import analyze_ngrams
from .interface import interface

analyzers/ngrams/interface.py

Lines changed: 75 additions & 0 deletions

from analyzer_interface import (AnalyzerInput, AnalyzerInterface,
                                AnalyzerOutput, InputColumn, OutputColumn)

from .main import (MESSAGE__ID, MESSAGE__TEXT,
                   NGRAM__ID, NGRAM__LENGTH,
                   NGRAM__WORDS, MESSAGE__NGRAM_COUNT, AUTHOR__ID, analyze_ngrams)

interface = AnalyzerInterface(
    id="ngrams",
    version="0.1.0",
    name="ngrams",
    short_description="Extracts n-grams from text data",
    long_description="""
The n-gram analysis extracts n-grams (sequences of n words) from the text data
in the input and counts the occurrences of each n-gram in each message, linking
the message author to the n-gram frequency.

The result can be used to see whether certain word sequences are more common in
the corpus of text, and whether certain authors use these sequences more often.
""",
    input=AnalyzerInput(columns=[
        InputColumn(
            name=AUTHOR__ID,
            data_type="identifier",
            description="The unique identifier of the author of the message",
            name_hints=["author", "user", "poster", "username",
                        "screen name", "user name", "name", "email"]
        ),
        InputColumn(
            name=MESSAGE__ID,
            data_type="identifier",
            description="The unique identifier of the message",
            name_hints=["post", "message", "comment",
                        "text", "retweet id", "tweet"]
        ),
        InputColumn(
            name=MESSAGE__TEXT,
            data_type="text",
            description="The text content of the message",
            name_hints=["message", "text", "comment",
                        "post", "body", "content", "tweet"]
        )
    ]),
    outputs=[
        AnalyzerOutput(
            id="message_ngrams",
            name="N-gram count per message",
            columns=[
                OutputColumn(name=MESSAGE__ID, data_type="identifier"),
                OutputColumn(name=NGRAM__ID, data_type="identifier"),
                OutputColumn(name=MESSAGE__NGRAM_COUNT, data_type="integer")
            ]
        ),
        AnalyzerOutput(
            id="ngrams",
            name="N-gram definitions",
            description="The word compositions of each unique n-gram",
            columns=[
                OutputColumn(name=NGRAM__ID, data_type="identifier"),
                OutputColumn(name=NGRAM__WORDS, data_type="text"),
                OutputColumn(name=NGRAM__LENGTH, data_type="integer")
            ]
        ),
        AnalyzerOutput(
            id="message_authors",
            name="Message authorship",
            description="Links each message to its author",
            columns=[
                OutputColumn(name=AUTHOR__ID, data_type="identifier"),
                OutputColumn(name=MESSAGE__ID, data_type="identifier")
            ]
        )
    ],
    entry_point=analyze_ngrams
)
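Putting it together, a hypothetical end-to-end run of the analyzer; the sample data is invented, the input column names follow the constants defined in main.py, and polars is assumed to be installed:

import polars as pl

from analyzers.ngrams import interface

df = pl.DataFrame({
    "user_id": ["u1", "u2"],
    "message_id": ["m1", "m2"],
    "message_text": [
        "the quick brown fox jumps over the lazy dog",
        "the quick brown fox naps",
    ],
})

outputs = interface.entry_point(df)
print(outputs["ngrams"])           # unique n-grams: ngram_id, words, n
print(outputs["message_ngrams"])   # n-gram counts per message
print(outputs["message_authors"])  # user_id / message_id pairs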

analyzers/ngrams/main.py

Lines changed: 80 additions & 0 deletions

import polars as pl
import re


AUTHOR__ID = "user_id"
MESSAGE__ID = "message_id"
MESSAGE__TEXT = "message_text"
MESSAGE__NGRAM_COUNT = "count"
NGRAM__ID = "ngram_id"
NGRAM__WORDS = "words"
NGRAM__LENGTH = "n"


def analyze_ngrams(df_input: pl.DataFrame):
    df_input = df_input.filter(pl.col(MESSAGE__TEXT).is_not_null())

    def get_ngram_rows(ngrams_by_id: dict[str, int]):
        num_rows = df_input.height
        current_row = 0
        for row in df_input.iter_rows(named=True):
            tokens = tokenize(row[MESSAGE__TEXT])
            for ngram in ngrams(tokens, 3, 5):
                serialized_ngram = serialize_ngram(ngram)
                if serialized_ngram not in ngrams_by_id:
                    ngrams_by_id[serialized_ngram] = len(ngrams_by_id)
                ngram_id = ngrams_by_id[serialized_ngram]
                yield {
                    MESSAGE__ID: row[MESSAGE__ID],
                    NGRAM__ID: ngram_id
                }
            current_row = current_row + 1
            if current_row % 100 == 0:
                print(
                    current_row, "/", num_rows, "rows processed; found",
                    len(ngrams_by_id), "ngrams", end="\r"
                )

    ngrams_by_id: dict[str, int] = {}

    df_message_ngrams = (
        pl.DataFrame(get_ngram_rows(ngrams_by_id))
        .group_by(MESSAGE__ID, NGRAM__ID)
        .agg(pl.count().alias(MESSAGE__NGRAM_COUNT))
    )
    df_ngrams = pl.DataFrame({
        NGRAM__ID: list(ngrams_by_id.values()),
        NGRAM__WORDS: list(ngrams_by_id.keys())
    }).with_columns([
        pl.col(NGRAM__WORDS)
        .str.split(" ")
        .list.len()
        .alias(NGRAM__LENGTH)
    ])
    df_message_authors = df_input.select([AUTHOR__ID, MESSAGE__ID])

    return {
        "message_ngrams": df_message_ngrams,
        "ngrams": df_ngrams,
        "message_authors": df_message_authors
    }


def tokenize(input: str) -> list[str]:
    """Splits the input string into lowercase word tokens."""
    # Filter out the empty tokens that re.split yields when the text
    # starts or ends with a non-word character.
    return [token for token in re.split(r'\W+', input.lower()) if token]


def ngrams(tokens: list[str], min: int, max: int):
    """Generates all n-grams with min <= n <= max from a list of tokens."""
    for i in range(len(tokens) - min + 1):
        for n in range(min, max + 1):
            if i + n > len(tokens):
                break
            yield tokens[i:i + n]


def serialize_ngram(ngram: list[str]) -> str:
    """Generates a string that uniquely represents an n-gram."""
    return " ".join(ngram)
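To illustrate the sliding-window behavior of tokenize and ngrams, a small sketch (the sample sentence is invented):

from analyzers.ngrams.main import ngrams, serialize_ngram, tokenize

tokens = tokenize("The quick brown fox jumps!")  # 5 lowercase tokens
for ngram in ngrams(tokens, 3, 5):
    print(serialize_ngram(ngram))

# the quick brown
# the quick brown fox
# the quick brown fox jumps
# quick brown fox
# quick brown fox jumps
# brown fox jumps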
