docs: ✏️ development guide with real example (#66)

soul-codes · andi-halim · web-flow · commit e5da070134c1 · 2025-01-30T18:57:53.000+07:00
* docs: ✏️ development guide with real example

* tiny analyzer typo

the tiniest typo, thought I'd just fix it now

* style: 💄 isort && black

---------

Co-authored-by: Andi Halim <andihalim00@gmail.com>
diff --git a/README.md b/README.md
@@ -28,6 +28,10 @@ python -m venv venv
 python -m mangotango
 ```
 
+## Development Guide
+
+[Development Guide](./docs/dev-guide.md)
+
 ## License
 
 This project is licensed under the [PolyForm Noncommercial License 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/).
diff --git a/analyzers/__init__.py b/analyzers/__init__.py
@@ -1,5 +1,8 @@
 from analyzer_interface import AnalyzerSuite
 
+from .example.example_base import example_base
+from .example.example_report import example_report
+from .example.example_web import example_web
 from .hashtags import hashtags
 from .ngram_stats import ngram_stats
 from .ngram_web import ngrams_web
@@ -10,6 +13,9 @@
 
 suite = AnalyzerSuite(
     all_analyzers=[
+        example_base,
+        example_report,
+        example_web,
         ngrams,
         ngram_stats,
         ngrams_web,
diff --git a/analyzers/example/README.md b/analyzers/example/README.md
@@ -0,0 +1,11 @@
+# Example Analyzer Implementation
+
+This is an example of how to implement an analyzer for the `analyzers` package. It consists of a simple primary analyzer that counts the number of characters in each message, and an export format that includes a "long"
+flag that indicates whether a message is long or not.
+
+A web presenter module is included that plots a histogram of
+message lengths.
+
+- [Primary Analyzer](./example_base/__init__.py)
+- [Secondary Analyzer](./example_report/__init__.py)
+- [Web Presenter](./example_web/__init__.py)
diff --git a/analyzers/example/example_base/__init__.py b/analyzers/example/example_base/__init__.py
@@ -0,0 +1,15 @@
+from analyzer_interface import AnalyzerDeclaration
+
+from .interface import interface
+from .main import main
+
+# This is an example primary analyzer. It simply counts the number of characters
+# in a text column and writes the result to a parquet file.
+example_base = AnalyzerDeclaration(
+    interface=interface,
+    main=main,
+    # This marks the analyzer as distributed or not. Only distributed analyzers
+    # are visible when the application is packaged. Non-distributed analyzers,
+    # like this one, show up only when the application runs in development mode.
+    is_distributed=False,
+)
diff --git a/analyzers/example/example_base/interface.py b/analyzers/example/example_base/interface.py
@@ -0,0 +1,97 @@
+from analyzer_interface import (
+    AnalyzerInput,
+    AnalyzerInterface,
+    AnalyzerOutput,
+    InputColumn,
+    OutputColumn,
+)
+
+interface = AnalyzerInterface(
+    # Should be globally unique.
+    id="__example__",
+    # We don't really use this yet, but specify something for now.
+    version="0.1.0",
+    # The name of the analyzer as shown on the UI.
+    name="Example Analyzer",
+    # These descriptions are shown to the user in the UI at some point during the
+    # analysis selection process.
+    short_description="Example Analyzer (Character Count)",
+    long_description="""
+This is an example analyzer that counts the number of characters in each message.
+  """,
+    input=AnalyzerInput(
+        columns=[
+            InputColumn(
+                # This is the column name that you will use in your data analysis
+                # code.
+                name="message_id",
+                # This is the human readable name that will be displayed in the
+                # user interface.
+                human_readable_name="Unique Message ID",
+                # Refer to the complete set of data types by following the
+                # type definition.
+                data_type="identifier",
+                # This is a description of the column that will be displayed in
+                # the user interface during column matching.
+                description="The unique identifier of the message",
+                # These name hints give the application a soft heuristic for
+                # matching the column to the right data. The user will be able to
+                # override the suggestion if it is incorrect.
+                #
+                # You don't need to provide all possible hints, but the more you
+                # provide, the better the suggestions will be.
+                name_hints=[
+                    "post",
+                    "message",
+                    "comment",
+                    "text",
+                    "retweet id",
+                    "tweet",
+                ],
+            ),
+            InputColumn(
+                name="message_text",
+                human_readable_name="Message Text",
+                data_type="text",
+                description="The text content of the message",
+                name_hints=[
+                    "message",
+                    "text",
+                    "comment",
+                    "post",
+                    "body",
+                    "content",
+                    "tweet",
+                ],
+            ),
+        ]
+    ),
+    outputs=[
+        AnalyzerOutput(
+            # This should be locally unique to the analyzer.
+            # Remember this -- you will need it to refer to this output in your
+            # implementation. It will also form part of the exported output's
+            # file name, so choose something that's intuitive.
+            id="character_count",
+            # This is the human readable name that will be displayed in the
+            # user interface. Only used if this is exportable. You can leave
+            # it out and it will fall back to the id.
+            name="Character Count Per Message",
+            # Mark this as internal, so that it is not shown in the list of
+            # exported outputs.
+            internal=True,
+            columns=[
+                OutputColumn(
+                    # This is the column name that you will use in your data analysis
+                    # code when saving the output.
+                    name="message_id",
+                    # This is the human readable name that will be used in the
+                    # exported output.
+                    human_readable_name="Unique Message ID",
+                    data_type="integer",
+                ),
+                OutputColumn(name="character_count", data_type="integer"),
+            ],
+        )
+    ],
+)
diff --git a/analyzers/example/example_base/main.py b/analyzers/example/example_base/main.py
@@ -0,0 +1,47 @@
+import polars as pl
+
+from analyzer_interface.context import PrimaryAnalyzerContext
+from terminal_tools import ProgressReporter
+
+
+def main(context: PrimaryAnalyzerContext):
+    # To read the user's input data the way the user intended, you have to do
+    # two things:
+    # - Read the input file, which is a parquet file. The InputReader interface
+    #   gives you the path to the file. You can use whichever library
+    #   you prefer to do this. Here we use polars.
+    #
+    # - Preprocess the input data. This transforms the user's imported data
+    #   to the format that your analyzer expects by performing the column
+    #   mapping and data type conversion (like converting a string column
+    #   that represents a datetime into a timestamp column).
+    #   YOU MUST DO THIS before you can start your analysis, otherwise you won't
+    #   get the columns or the types that you need.
+    input_reader = context.input()
+    df_input = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path))
+
+    # Now you can start your analysis. The following code is just a minimal example.
+    #
+    # The use of the ProgressReporter is optional. It helps break a
+    # longer analysis down into sections.
+    with ProgressReporter("Counting characters") as progress:
+        df_count = df_input.select(
+            pl.col("message_id"),
+            # The input and output columns are as you define in the interface.
+            pl.col("message_text").str.len_chars().alias("character_count"),
+        )
+
+        # If you decide to process the data in small batches
+        # you can update the progress bar with the fraction of the
+        # current batch. Again, this is optional. Here we just use
+        # 1.0 to indicate 100% completion.
+        #
+        # You can still use the ProgressReporter without updating the progress
+        # value, in which case the progress bar will just show a spinner and
+        # the message.
+        progress.update(1.0)
+
+    # The analyzer is expected to write the output to a parquet file for
+    # every output that is defined. Make sure that the output ID and the
+    # columns match the interface.
+    df_count.write_parquet(context.output("character_count").parquet_path)
diff --git a/analyzers/example/example_report/__init__.py b/analyzers/example/example_report/__init__.py
@@ -0,0 +1,8 @@
+from analyzer_interface import SecondaryAnalyzerDeclaration
+
+from .interface import interface
+from .main import main
+
+# This is an example secondary analyzer. It adds a column to the output of the
+# primary analyzer that indicates whether the message is "long" or not.
+example_report = SecondaryAnalyzerDeclaration(interface=interface, main=main)
diff --git a/analyzers/example/example_report/interface.py b/analyzers/example/example_report/interface.py
@@ -0,0 +1,31 @@
+from analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface
+
+from ..example_base.interface import interface as example_base
+
+interface = SecondaryAnalyzerInterface(
+    # This ID should be unique among the analyzers in the application.
+    id="example_report",
+    # We don't really use this yet, but specify something for now.
+    version="0.1.0",
+    # The name of the analyzer as shown on the UI.
+    name="Example Report",
+    short_description="",
+    # Specify the primary analyzer here. You MUST do this otherwise the
+    # secondary analyzer will not be detected as deriving from the primary.
+    base_analyzer=example_base,
+    outputs=[
+        AnalyzerOutput(
+            id="example_report",
+            name="Example Report",
+            columns=[
+                OutputColumn(name="message_id", data_type="integer"),
+                OutputColumn(name="character_count", data_type="integer"),
+                # This is our pretend "presented column" that isn't part of the
+                # actual analysis, but is just for the report. We avoid storing
+                # things like this in the primary analyzer output (unless it's
+                # laborious to compute), since it's only specific to this export.
+                OutputColumn(name="is_long", data_type="boolean"),
+            ],
+        )
+    ],
+)
diff --git a/analyzers/example/example_report/main.py b/analyzers/example/example_report/main.py
@@ -0,0 +1,21 @@
+import polars as pl
+
+from analyzer_interface.context import SecondaryAnalyzerContext
+
+
+def main(context: SecondaryAnalyzerContext):
+    df_character_count = pl.read_parquet(
+        # This `character_count` is the output ID from the primary analyzer.
+        context.base.table("character_count").parquet_path
+    )
+
+    df_export = df_character_count.with_columns(
+        # `is_long` is a new column that we are adding to the output.
+        pl.col("character_count")
+        .gt(100)
+        .alias("is_long")
+    )
+
+    # Save the output to a parquet file. The output ID comes from the secondary
+    # analyzer's interface.
+    df_export.write_parquet(context.output("example_report").parquet_path)
diff --git a/analyzers/example/example_web/__init__.py b/analyzers/example/example_web/__init__.py
@@ -0,0 +1,12 @@
+from analyzer_interface import WebPresenterDeclaration
+
+from .factory import factory
+from .interface import interface
+
+example_web = WebPresenterDeclaration(
+    interface=interface,
+    factory=factory,
+    # You must pass __name__ here. It's to make Dash happy.
+    # See: https://dash.plotly.com/urls
+    name=__name__,
+)
diff --git a/analyzers/example/example_web/factory.py b/analyzers/example/example_web/factory.py
@@ -0,0 +1,50 @@
+import plotly.express as px
+import polars as pl
+from dash.dcc import Graph
+from dash.html import Div
+
+from analyzer_interface.context import WebPresenterContext
+
+
+def factory(context: WebPresenterContext):
+    df = pl.read_parquet(
+        # This gives you the path to the primary analyzer's output.
+        # The ID is the same as the one you used in the primary analyzer interface.
+        context.base.table("character_count").parquet_path
+    )
+
+    # For secondary analyzer output, import the secondary analyzer's interface
+    # and use an ID from there.
+    #
+    # Example:
+    #
+    # from ..example_report import interface as example_report
+    #
+    # pl.read_parquet(
+    #   context.dependency(example_report).table("example_report").parquet_path
+    # )
+
+    # This is the Dash app. You can add components to it to build your UI.
+    # For a Dash primer, consult the Dash documentation at https://dash.plotly.com/.
+    app = context.dash_app
+
+    fig = px.histogram(x=df["character_count"], nbins=50)
+    fig.update_layout(
+        {
+            "xaxis": {
+                "title": {"text": "Message Character Count"},
+            },
+            "yaxis": {
+                "title": {"text": "Number of Messages"},
+            },
+        }
+    )
+
+    app.layout = Div(
+        [
+            Graph(
+                figure=fig,
+                style={"height": "100%", "flex-grow": "1"},
+            )
+        ]
+    )
diff --git a/analyzers/example/example_web/interface.py b/analyzers/example/example_web/interface.py
@@ -0,0 +1,27 @@
+from analyzer_interface import WebPresenterInterface
+
+from ..example_base import interface as example_base
+from ..example_report import interface as example_report
+
+interface = WebPresenterInterface(
+    # This ID must be unique among all web presenters.
+    id="example_web",
+    # We don't really use this yet, but specify something for now.
+    version="0.1.0",
+    # The name of the web presenter as shown on the UI.
+    name="Message Length Histogram",
+    # This is the description that will be shown to the user in the UI.
+    short_description="Shows the distribution of message lengths",
+    # Specify the primary analyzer here.
+    base_analyzer=example_base,
+    # You must specify all of the secondary analyzers that this web presenter
+    # depends on, and they must obviously be secondaries to the same primary
+    # analyzer.
+    #
+    # In this example, we don't depend on any secondary analyzer, so we leave
+    # this blank. However, the commented out line below shows how you would
+    # specify a dependency.
+    depends_on=[
+        # example_report
+    ],
+)
diff --git a/docs/dev-guide.md b/docs/dev-guide.md