Skip to content

Commit e5da070

Browse files
docs: ✏️ development guide with real example (#66)
* docs: ✏️ development guide with real example * tiny analyzer typo the tiniest typo, thought I'd just fix it now * style: 💄 isort && black --------- Co-authored-by: Andi Halim <[email protected]>
1 parent 6e6b412 commit e5da070

File tree

13 files changed

+448
-0
lines changed

13 files changed

+448
-0
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ python -m venv venv
2828
python -m mangotango
2929
```
3030

31+
## Development Guide
32+
33+
[Development Guide](./docs/dev-guide.md)
34+
3135
## License
3236

3337
This project is licensed under the [PolyForm Noncommercial License 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/).

analyzers/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from analyzer_interface import AnalyzerSuite
22

3+
from .example.example_base import example_base
4+
from .example.example_report import example_report
5+
from .example.example_web import example_web
36
from .hashtags import hashtags
47
from .ngram_stats import ngram_stats
58
from .ngram_web import ngrams_web
@@ -10,6 +13,9 @@
1013

1114
suite = AnalyzerSuite(
1215
all_analyzers=[
16+
example_base,
17+
example_report,
18+
example_web,
1319
ngrams,
1420
ngram_stats,
1521
ngrams_web,

analyzers/example/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Example Analyzer Implementation
2+
3+
This is an example of how to implement an analyzer for the `analyzer` module. This analyzer is a simple example that counts the number of characters in each message, with an export format that includes a "long"
4+
flag that indicates whether a message is long or not.
5+
6+
A web presenter module is included that plots a histogram of
7+
message lengths.
8+
9+
- [Primary Analyzer](./example_base/__init__.py)
10+
- [Secondary Analyzer](./example_report/__init__.py)
11+
- [Web Presenter](./example_web/__init__.py)
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from analyzer_interface import AnalyzerDeclaration
2+
3+
from .interface import interface
4+
from .main import main
5+
6+
# This is an example primary analyzer. It simply counts the number of characters
7+
# in a text column and writes the result to a parquet file.
8+
example_base = AnalyzerDeclaration(
9+
interface=interface,
10+
main=main,
11+
# This marks the analyzer as distributed or not. A distributed
12+
# analyzer is visible only when the application is packaged. A non-distributed
13+
# analyzer is also visible when the application is run in development mode.
14+
is_distributed=False,
15+
)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
from analyzer_interface import (
2+
AnalyzerInput,
3+
AnalyzerInterface,
4+
AnalyzerOutput,
5+
InputColumn,
6+
OutputColumn,
7+
)
8+
9+
interface = AnalyzerInterface(
10+
# Should be globally unique.
11+
id="__example__",
12+
# We don't really use this yet, but specify something for now.
13+
version="0.1.0",
14+
# The name of the analyzer as shown on the UI.
15+
name="Example Analyzer",
16+
# These descriptions are shown to the user in the UI at some point during the
17+
# analysis selection process.
18+
short_description="Example Analyzer (Character Count)",
19+
long_description="""
20+
This is an example analyzer that counts the number of characters in each message.
21+
""",
22+
input=AnalyzerInput(
23+
columns=[
24+
InputColumn(
25+
# This is the column name that you will use in your data analysis
26+
# code.
27+
name="message_id",
28+
# This is the human readable name that will be displayed in the
29+
# user interface.
30+
human_readable_name="Unique Message ID",
31+
# Refer to the complete set of data types by following the
32+
# type definition.
33+
data_type="identifier",
34+
# This is a description of the column that will be displayed in
35+
# the user interface during column matching.
36+
description="The unique identifier of the message",
37+
# These name hints give the application a kind of soft heuristic
38+
# to match the column to the right data. The user will be able to
39+
# override the suggestion if it is incorrect.
40+
#
41+
# You don't need to provide all possible hints, but the more you
42+
# provide, the better the suggestions will be.
43+
name_hints=[
44+
"post",
45+
"message",
46+
"comment",
47+
"text",
48+
"retweet id",
49+
"tweet",
50+
],
51+
),
52+
InputColumn(
53+
name="message_text",
54+
human_readable_name="Message Text",
55+
data_type="text",
56+
description="The text content of the message",
57+
name_hints=[
58+
"message",
59+
"text",
60+
"comment",
61+
"post",
62+
"body",
63+
"content",
64+
"tweet",
65+
],
66+
),
67+
]
68+
),
69+
outputs=[
70+
AnalyzerOutput(
71+
# This should be locally unique to the analyzer.
72+
# Remember this -- you will need it to refer to this output in your
73+
# implementation. It will also form part of the exported output's
74+
# file name, so choose something that's intuitive.
75+
id="character_count",
76+
# This is the human readable name that will be displayed in the
77+
# user interface. Only used if this is exportable. You can leave
78+
# it out and it will fall back to the id.
79+
name="Character Count Per Message",
80+
# Mark this as internal, so that it is not shown in the list of
81+
# exported outputs.
82+
internal=True,
83+
columns=[
84+
OutputColumn(
85+
# This is the column name that you will use in your data analysis
86+
# code when saving the output.
87+
name="message_id",
88+
# This is the human readable name that will be used in the
89+
# exported output.
90+
human_readable_name="Unique Message ID",
91+
data_type="integer",
92+
),
93+
OutputColumn(name="character_count", data_type="integer"),
94+
],
95+
)
96+
],
97+
)
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import polars as pl
2+
3+
from analyzer_interface.context import PrimaryAnalyzerContext
4+
from terminal_tools import ProgressReporter
5+
6+
7+
def main(context: PrimaryAnalyzerContext):
8+
# To read the user's input data the way the user intended, you have to do
9+
# two things:
10+
# - Read the input file, which is a parquet file. The InputReader interface
11+
# gives you the path to the file. You can use whichever library you like
12+
# to do this. Here we use polars.
13+
#
14+
# - Preprocess the input data. This transforms the user's imported data
15+
# to the format that your analyzer expects by performing the column
16+
# mapping and data type conversion (like converting a string column
17+
# that represents a datetime into a timestamp column).
18+
# YOU MUST DO THIS before you can start your analysis, otherwise you won't
19+
# get the columns or the types that you need.
20+
input_reader = context.input()
21+
df_input = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path))
22+
23+
# Now you can start your analysis. The following code is just a minimal example.
24+
#
25+
# The use of the ProgressReporter is optional. It helps break a
26+
# longer analysis down into sections.
27+
with ProgressReporter("Counting characters") as progress:
28+
df_count = df_input.select(
29+
pl.col("message_id"),
30+
# The input and output columns are as you define in the interface.
31+
pl.col("message_text").str.len_chars().alias("character_count"),
32+
)
33+
34+
# If you decide to process the data in small batches
35+
# you can update the progress bar with the fraction of the
36+
# current batch. Again, this is optional. Here we just use
37+
# 1.0 to indicate 100% completion.
38+
#
39+
# You can still use the ProgressReporter without updating the progress
40+
# value, in which case the progress bar will just show a spinner and
41+
# the message.
42+
progress.update(1.0)
43+
44+
# The analyzer is expected to write the output to a parquet file for
45+
# every output that is defined. Make sure that the output ID and the
46+
# columns match the interface.
47+
df_count.write_parquet(context.output("character_count").parquet_path)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from analyzer_interface import SecondaryAnalyzerDeclaration
2+
3+
from .interface import interface
4+
from .main import main
5+
6+
# This is an example secondary analyzer. It adds a column to the output of the
7+
# primary analyzer that indicates whether the message is "long" or not.
8+
example_report = SecondaryAnalyzerDeclaration(interface=interface, main=main)
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface
2+
3+
from ..example_base.interface import interface as example_base
4+
5+
interface = SecondaryAnalyzerInterface(
6+
# This ID should be unique among the analyzers in the application.
7+
id="example_report",
8+
# We don't really use this yet, but specify something for now.
9+
version="0.1.0",
10+
# The name of the analyzer as shown on the UI.
11+
name="Example Report",
12+
short_description="",
13+
# Specify the primary analyzer here. You MUST do this otherwise the
14+
# secondary analyzer will not be detected as deriving from the primary.
15+
base_analyzer=example_base,
16+
outputs=[
17+
AnalyzerOutput(
18+
id="example_report",
19+
name="Example Report",
20+
columns=[
21+
OutputColumn(name="message_id", data_type="integer"),
22+
OutputColumn(name="character_count", data_type="integer"),
23+
# This is our pretend "presented column" that isn't part of the
24+
# actual analysis, but is just for the report. We avoid storing
25+
# things like this in the primary analyzer output (unless it's
26+
# laborious to compute), since it's only specific to this export.
27+
OutputColumn(name="is_long", data_type="boolean"),
28+
],
29+
)
30+
],
31+
)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import polars as pl
2+
3+
from analyzer_interface.context import SecondaryAnalyzerContext
4+
5+
6+
def main(context: SecondaryAnalyzerContext):
7+
df_character_count = pl.read_parquet(
8+
# This `character_count` is the output ID from the primary analyzer.
9+
context.base.table("character_count").parquet_path
10+
)
11+
12+
df_export = df_character_count.with_columns(
13+
# `is_long` is a new column that we are adding to the output.
14+
pl.col("character_count")
15+
.gt(100)
16+
.alias("is_long")
17+
)
18+
19+
# Save the output to a parquet file. The output ID comes from the secondary
20+
# analyzer's interface.
21+
df_export.write_parquet(context.output("example_report").parquet_path)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from analyzer_interface import WebPresenterDeclaration
2+
3+
from .factory import factory
4+
from .interface import interface
5+
6+
example_web = WebPresenterDeclaration(
7+
interface=interface,
8+
factory=factory,
9+
# You must pass __name__ here. It's to make Dash happy.
10+
# See: http://dash.plotly.com/urls
11+
name=__name__,
12+
)

0 commit comments

Comments
 (0)