Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-15991: Infogram pydocs updates #15992

Open
wants to merge 8 commits into
base: rel-3.46.0
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 117 additions & 1 deletion h2o-bindings/bin/custom/python/gen_infogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def _extract_x_from_model(self):

def plot(self, train=True, valid=False, xval=False, figsize=(10, 10), title="Infogram", legend_on=False, server=False):
"""
Plot the infogram. By default, it will plot the infogram calculated from training dataset.
Plot the infogram. By default, it will plot the infogram calculated from training dataset.
Note that the frame rel_cmi_frame contains the following columns:
- 0: predictor names
- 1: admissible
Expand Down Expand Up @@ -435,3 +435,119 @@ def train_subset_models(self, model_class, y, training_frame, test_frame, protec
feature set. Admissible models are also less susceptible to overfitting and train faster, while providing similar accuracy as models built using all available features.
"""
)
# Doctest-style usage snippets keyed by H2OInfogram parameter name. The same
# text appears under ``:examples:`` in the generated h2o-py infogram.py
# property docstrings, so keep each snippet self-contained and runnable.
#
# The ``*_threshold`` parameters default to -1.0, a sentinel meaning "not set
# by the user" that the backend replaces with 0.1. Passing -1.0 in an example
# therefore demonstrates nothing, so each threshold example passes an explicit
# 0.5 to show a real override.
#
# The ``algorithm_params`` example passes the parameter it documents (a dict of
# settings for the chosen algorithm) instead of building a default infogram.
examples = dict(
    algorithm_params="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> gbm_params = {"ntrees": 10}
>>> ig = H2OInfogram(protected_columns=pcols, algorithm="gbm", algorithm_params=gbm_params)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    data_fraction="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, data_fraction=0.7)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    net_information_threshold="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, net_information_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    relevance_index_threshold="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, relevance_index_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    safety_index_threshold="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, safety_index_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    top_n_features="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, top_n_features=30)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    total_information_threshold="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, total_information_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
)


121 changes: 120 additions & 1 deletion h2o-py/h2o/estimators/infogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,23 @@ def algorithm_params(self):
Customized parameters for the machine learning algorithm specified in the algorithm parameter.

Type: ``dict``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is an example for algorithm_params, I would expect a call to ig.algorithm_params.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I was wrong here. It's an example for data_fraction, so I would expect it to call data_fraction.

So just call: ig.data_fraction

"""
if self._parms.get("algorithm_params") != None:
algorithm_params_dict = ast.literal_eval(self._parms.get("algorithm_params"))
Expand Down Expand Up @@ -745,6 +762,23 @@ def total_information_threshold(self):
information is the x-axis of the Core Infogram. Default is -1 which gets set to 0.1.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of scope, but I don't understand "Default is -1 which gets set to 0.1."

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also the first line is not correct since the default is -0.1.

A number between 0 and 1 representing a threshold for total information, defaulting to 0.1.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wendycwong any ideas about what's going on with the default values here?

(I can update the schema and fix the first line issue)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hannah-tillman I understand it now.

It comes from here:

https://github.com/h2oai/h2o-3/blob/master/h2o-admissibleml/src/main/java/hex/Infogram/Infogram.java#L185-L187

@wendycwong any reason why we do not set it directly here?

Nevertheless, it's out of scope for this PR. @shaunyogeshwaran.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@valenad1 @hannah-tillman

The value of -1 or -0.1 is used to denote that the user has not set any value. If the user has not set any value, we will set it to a default value of 0.1. There is a reason that the code needs to know if the user set that value. I cannot remember what it is now.


Type: ``float``, defaults to ``-1.0``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, total_information_threshold=-1.0)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please set

total_information_threshold = 0.5

>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("total_information_threshold")

Expand All @@ -768,6 +802,23 @@ def net_information_threshold(self):
the y-axis of the Core Infogram. Default is -1 which gets set to 0.1.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. The first line says a number between 0 and 1, yet the default of -1 gets set to 0.1.


Type: ``float``, defaults to ``-1.0``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, net_information_threshold=-1.0)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please set

net_information_threshold = 0.5

>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("net_information_threshold")

Expand All @@ -792,6 +843,23 @@ def relevance_index_threshold(self):
which gets set to 0.1.

Type: ``float``, defaults to ``-1.0``.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. The first line says a number between 0 and 1, yet the default of -1 gets set to 0.1.


:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, relevance_index_threshold=-1.0)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please set

relevance_index_threshold = 0.5

>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("relevance_index_threshold")

Expand All @@ -816,6 +884,23 @@ def safety_index_threshold(self):
gets set to 0.1.

Type: ``float``, defaults to ``-1.0``.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. The first line says a number between 0 and 1, yet the default of -1 gets set to 0.1.


:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, safety_index_threshold=-1.0)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please set

safety_index_threshold = 0.5

>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("safety_index_threshold")

Expand All @@ -837,6 +922,23 @@ def data_fraction(self):
and less than or equal to 1.0 is acceptable.

Type: ``float``, defaults to ``1.0``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, data_fraction=0.7)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work the same as split_frame?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("data_fraction")

Expand All @@ -852,6 +954,23 @@ def top_n_features(self):
importance, and the top N are evaluated. Defaults to 50.

Type: ``int``, defaults to ``50``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train, test = df.split_frame(seed=1)
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, top_n_features=30)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("top_n_features")

Expand All @@ -874,7 +993,7 @@ def _extract_x_from_model(self):

def plot(self, train=True, valid=False, xval=False, figsize=(10, 10), title="Infogram", legend_on=False, server=False):
"""
Plot the infogram. By default, it will plot the infogram calculated from training dataset.
Plot the infogram. By default, it will plot the infogram calculated from training dataset.
Note that the frame rel_cmi_frame contains the following columns:
- 0: predictor names
- 1: admissible
Expand Down