h2oai · shaunyogeshwaran · Dec 27, 2023 · Dec 28, 2023 · Dec 28, 2023 · Jan 3, 2024
diff --git a/h2o-bindings/bin/custom/python/gen_infogram.py b/h2o-bindings/bin/custom/python/gen_infogram.py
@@ -21,7 +21,7 @@ def _extract_x_from_model(self):
 
     def plot(self, train=True, valid=False, xval=False, figsize=(10, 10), title="Infogram", legend_on=False, server=False):
         """
-        Plot the infogram.  By default, it will plot the infogram calculated from training dataset.  
+        Plot the infogram.  By default, it will plot the infogram calculated from training dataset.
         Note that the frame rel_cmi_frame contains the following columns:
         - 0: predictor names
         - 1: admissible 
@@ -435,3 +435,119 @@ def train_subset_models(self, model_class, y, training_frame, test_frame, protec
 feature set.  Admissible models are also less susceptible to overfitting and train faster, while providing similar accuracy as models built using all available features.
 """
 )
+examples = dict(
+    algorithm_params="""
+>>> import h2o
+>>> from h2o.estimators.infogram import H2OInfogram
+>>> h2o.init()
+>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+>>> df = h2o.import_file(path=f, col_types=col_types)
+>>> train, test = df.split_frame(seed=1)
+>>> y = "default_payment_next_month"
+>>> x = train.columns
+>>> x.remove(y)
+>>> pcols = ["SEX", "MARRIAGE", "AGE"]
+>>> ig = H2OInfogram(protected_columns=pcols, algorithm="gbm", algorithm_params={"max_depth": 10})
+>>> ig.train(y=y, x=x, training_frame=train)
+>>> ig.plot()
+""",
+    data_fraction="""
+>>> import h2o
+>>> from h2o.estimators.infogram import H2OInfogram
+>>> h2o.init()
+>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+>>> df = h2o.import_file(path=f, col_types=col_types)
+>>> train, test = df.split_frame(seed=1)
+>>> y = "default_payment_next_month"
+>>> x = train.columns
+>>> x.remove(y)
+>>> pcols = ["SEX", "MARRIAGE", "AGE"]
+>>> ig = H2OInfogram(protected_columns=pcols, data_fraction=0.7)
+>>> ig.train(y=y, x=x, training_frame=train)
+>>> ig.plot()
+""",
+    net_information_threshold="""
+>>> import h2o
+>>> from h2o.estimators.infogram import H2OInfogram
+>>> h2o.init()
+>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+>>> df = h2o.import_file(path=f, col_types=col_types)
+>>> train, test = df.split_frame(seed=1)
+>>> y = "default_payment_next_month"
+>>> x = train.columns
+>>> x.remove(y)
+>>> pcols = ["SEX", "MARRIAGE", "AGE"]
+>>> ig = H2OInfogram(protected_columns=pcols, net_information_threshold=-1.0)
+>>> ig.train(y=y, x=x, training_frame=train)
+>>> ig.plot()
+""",
+    relevance_index_threshold="""
+>>> import h2o
+>>> from h2o.estimators.infogram import H2OInfogram
+>>> h2o.init()
+>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+>>> df = h2o.import_file(path=f, col_types=col_types)
+>>> train, test = df.split_frame(seed=1)
+>>> y = "default_payment_next_month"
+>>> x = train.columns
+>>> x.remove(y)
+>>> pcols = ["SEX", "MARRIAGE", "AGE"]
+>>> ig = H2OInfogram(protected_columns=pcols, relevance_index_threshold=-1.0)
+>>> ig.train(y=y, x=x, training_frame=train)
+>>> ig.plot()
+""",
+    safety_index_threshold="""
+>>> import h2o
+>>> from h2o.estimators.infogram import H2OInfogram
+>>> h2o.init()
+>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+>>> df = h2o.import_file(path=f, col_types=col_types)
+>>> train, test = df.split_frame(seed=1)
+>>> y = "default_payment_next_month"
+>>> x = train.columns
+>>> x.remove(y)
+>>> pcols = ["SEX", "MARRIAGE", "AGE"]
+>>> ig = H2OInfogram(protected_columns=pcols, safety_index_threshold=-1.0)
+>>> ig.train(y=y, x=x, training_frame=train)
+>>> ig.plot()
+""",
+    top_n_features="""
+>>> import h2o
+>>> from h2o.estimators.infogram import H2OInfogram
+>>> h2o.init()
+>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+>>> df = h2o.import_file(path=f, col_types=col_types)
+>>> train, test = df.split_frame(seed=1)
+>>> y = "default_payment_next_month"
+>>> x = train.columns
+>>> x.remove(y)
+>>> pcols = ["SEX", "MARRIAGE", "AGE"]
+>>> ig = H2OInfogram(protected_columns=pcols, top_n_features=30)
+>>> ig.train(y=y, x=x, training_frame=train)
+>>> ig.plot()
+""",
+    total_information_threshold="""
+>>> import h2o
+>>> from h2o.estimators.infogram import H2OInfogram
+>>> h2o.init()
+>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+>>> df = h2o.import_file(path=f, col_types=col_types)
+>>> train, test = df.split_frame(seed=1)
+>>> y = "default_payment_next_month"
+>>> x = train.columns
+>>> x.remove(y)
+>>> pcols = ["SEX", "MARRIAGE", "AGE"]
+>>> ig = H2OInfogram(protected_columns=pcols, total_information_threshold=-1.0)
+>>> ig.train(y=y, x=x, training_frame=train)
+>>> ig.plot()
+"""
+)
+
+
diff --git a/h2o-py/h2o/estimators/infogram.py b/h2o-py/h2o/estimators/infogram.py
@@ -700,6 +700,23 @@ def algorithm_params(self):
         Customized parameters for the machine learning algorithm specified in the algorithm parameter.
 
         Type: ``dict``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators.infogram import H2OInfogram
+        >>> h2o.init()
+        >>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        >>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+        >>> df = h2o.import_file(path=f, col_types=col_types)
+        >>> train, test = df.split_frame(seed=1)
+        >>> y = "default_payment_next_month"
+        >>> x = train.columns
+        >>> x.remove(y)
+        >>> pcols = ["SEX", "MARRIAGE", "AGE"]
+        >>> ig = H2OInfogram(protected_columns=pcols, algorithm="gbm", algorithm_params={"max_depth": 10})
+        >>> ig.train(y=y, x=x, training_frame=train)
+        >>> ig.plot()
         """
         if self._parms.get("algorithm_params") != None:
             algorithm_params_dict =  ast.literal_eval(self._parms.get("algorithm_params"))
@@ -745,6 +762,23 @@ def total_information_threshold(self):
         information is the x-axis of the Core Infogram. Default is -1 which gets set to 0.1.
 
         Type: ``float``, defaults to ``-1.0``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators.infogram import H2OInfogram
+        >>> h2o.init()
+        >>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        >>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+        >>> df = h2o.import_file(path=f, col_types=col_types)
+        >>> train, test = df.split_frame(seed=1)
+        >>> y = "default_payment_next_month"
+        >>> x = train.columns
+        >>> x.remove(y)
+        >>> pcols = ["SEX", "MARRIAGE", "AGE"]
+        >>> ig = H2OInfogram(protected_columns=pcols, total_information_threshold=-1.0)
+        >>> ig.train(y=y, x=x, training_frame=train)
+        >>> ig.plot()
         """
         return self._parms.get("total_information_threshold")
 
@@ -768,6 +802,23 @@ def net_information_threshold(self):
         the y-axis of the Core Infogram. Default is -1 which gets set to 0.1.
 
         Type: ``float``, defaults to ``-1.0``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators.infogram import H2OInfogram
+        >>> h2o.init()
+        >>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        >>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+        >>> df = h2o.import_file(path=f, col_types=col_types)
+        >>> train, test = df.split_frame(seed=1)
+        >>> y = "default_payment_next_month"
+        >>> x = train.columns
+        >>> x.remove(y)
+        >>> pcols = ["SEX", "MARRIAGE", "AGE"]
+        >>> ig = H2OInfogram(protected_columns=pcols, net_information_threshold=-1.0)
+        >>> ig.train(y=y, x=x, training_frame=train)
+        >>> ig.plot()
         """
         return self._parms.get("net_information_threshold")
 
@@ -792,6 +843,23 @@ def relevance_index_threshold(self):
         which gets set to 0.1.
 
         Type: ``float``, defaults to ``-1.0``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators.infogram import H2OInfogram
+        >>> h2o.init()
+        >>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        >>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+        >>> df = h2o.import_file(path=f, col_types=col_types)
+        >>> train, test = df.split_frame(seed=1)
+        >>> y = "default_payment_next_month"
+        >>> x = train.columns
+        >>> x.remove(y)
+        >>> pcols = ["SEX", "MARRIAGE", "AGE"]
+        >>> ig = H2OInfogram(protected_columns=pcols, relevance_index_threshold=-1.0)
+        >>> ig.train(y=y, x=x, training_frame=train)
+        >>> ig.plot()
         """
         return self._parms.get("relevance_index_threshold")
 
@@ -816,6 +884,23 @@ def safety_index_threshold(self):
         gets set to 0.1.
 
         Type: ``float``, defaults to ``-1.0``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators.infogram import H2OInfogram
+        >>> h2o.init()
+        >>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        >>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+        >>> df = h2o.import_file(path=f, col_types=col_types)
+        >>> train, test = df.split_frame(seed=1)
+        >>> y = "default_payment_next_month"
+        >>> x = train.columns
+        >>> x.remove(y)
+        >>> pcols = ["SEX", "MARRIAGE", "AGE"]
+        >>> ig = H2OInfogram(protected_columns=pcols, safety_index_threshold=-1.0)
+        >>> ig.train(y=y, x=x, training_frame=train)
+        >>> ig.plot()
         """
         return self._parms.get("safety_index_threshold")
 
@@ -837,6 +922,23 @@ def data_fraction(self):
         and less than or equal to 1.0 is acceptable.
 
         Type: ``float``, defaults to ``1.0``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators.infogram import H2OInfogram
+        >>> h2o.init()
+        >>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        >>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+        >>> df = h2o.import_file(path=f, col_types=col_types)
+        >>> train, test = df.split_frame(seed=1)
+        >>> y = "default_payment_next_month"
+        >>> x = train.columns
+        >>> x.remove(y)
+        >>> pcols = ["SEX", "MARRIAGE", "AGE"]
+        >>> ig = H2OInfogram(protected_columns=pcols, data_fraction=0.7)
+        >>> ig.train(y=y, x=x, training_frame=train)
+        >>> ig.plot()
         """
         return self._parms.get("data_fraction")
 
@@ -852,6 +954,23 @@ def top_n_features(self):
         importance, and the top N are evaluated.  Defaults to 50.
 
         Type: ``int``, defaults to ``50``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators.infogram import H2OInfogram
+        >>> h2o.init()
+        >>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        >>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
+        >>> df = h2o.import_file(path=f, col_types=col_types)
+        >>> train, test = df.split_frame(seed=1)
+        >>> y = "default_payment_next_month"
+        >>> x = train.columns
+        >>> x.remove(y)
+        >>> pcols = ["SEX", "MARRIAGE", "AGE"]
+        >>> ig = H2OInfogram(protected_columns=pcols, top_n_features=30)
+        >>> ig.train(y=y, x=x, training_frame=train)
+        >>> ig.plot()
         """
         return self._parms.get("top_n_features")
 
@@ -874,7 +993,7 @@ def _extract_x_from_model(self):
 
     def plot(self, train=True, valid=False, xval=False, figsize=(10, 10), title="Infogram", legend_on=False, server=False):
         """
-        Plot the infogram.  By default, it will plot the infogram calculated from training dataset.  
+        Plot the infogram.  By default, it will plot the infogram calculated from training dataset.
         Note that the frame rel_cmi_frame contains the following columns:
         - 0: predictor names
         - 1: admissible