Add option locale to CountVectorizer, TfIdfVectorizer converters (#1020)

* Add option locale to CountVectorizer, TfIdfVectorizer converter Signed-off-by: Xavier Dupre <[email protected]> * use locale en Signed-off-by: Xavier Dupre <[email protected]> * install language-pack-en Signed-off-by: Xavier Dupre <[email protected]> * add language-pack-en Signed-off-by: Xavier Dupre <[email protected]> * package Signed-off-by: Xavier Dupre <[email protected]> * sudo Signed-off-by: Xavier Dupre <[email protected]> * fix local Signed-off-by: Xavier Dupre <[email protected]> * disable two tests on Linux Signed-off-by: Xavier Dupre <[email protected]> * fix missing import Signed-off-by: Xavier Dupre <[email protected]> * disable one test for old version of scikit-learn Signed-off-by: Xavier Dupre <[email protected]> * disable failing test in Windows and old version of scikit-learn Signed-off-by: Xavier Dupre <[email protected]> * misspelling Signed-off-by: Xavier Dupre <[email protected]> * fix ort version for disabled tests Signed-off-by: Xavier Dupre <[email protected]> * black Signed-off-by: Xavier Dupre <[email protected]> * fix documentation Signed-off-by: Xavier Dupre <[email protected]> * disable tests for older version Signed-off-by: Xavier Dupre <[email protected]> --------- Signed-off-by: Xavier Dupre <[email protected]>
onnx · Aug 30, 2023 · 0371c0f · 0371c0f
1 parent 895c3a7
commit 0371c0f
Show file tree

Hide file tree

Showing 62 changed files with 451 additions and 115 deletions.
diff --git a/.azure-pipelines/linux-conda-CI.yml b/.azure-pipelines/linux-conda-CI.yml
@@ -200,6 +200,12 @@ jobs:
       environmentName: 'py$(python.version)'
       packageSpecs: 'python=$(python.version)'
 
+  - script: |
+      sudo apt-get install -y language-pack-en
+      sudo locale-gen en_US.UTF-8
+      sudo update-locale LANG=en_US.UTF-8
+    displayName: 'Install packages'
+
   - script: |
       test '$(python.version)' == '3.7' && apt-get install protobuf-compiler libprotoc-dev
       conda config --set always_yes yes --set changeps1 no

diff --git a/.gitignore b/.gitignore
@@ -68,3 +68,4 @@ docs/tutorial/*.onnx
 docs/tutorial/*.jpg
 docs/tutorial/*.png
 docs/tutorial/*.dot
+docs/tutorial/catboost_info
diff --git a/CHANGELOGS.md b/CHANGELOGS.md
@@ -0,0 +1,8 @@
+Change Logs
+===========
+
+1.16.0
+------
+
+* add option 'locale' to converters of CountVectorizer, TfIdfVectorizer
+  [#1020](https://github.com/onnx/sklearn-onnx/pull/1020)
diff --git a/README.md b/README.md
@@ -2,9 +2,9 @@
 
 <p align="center"><img width="50%" src="docs/logo_main.png" /></p>
 
-[![Build Status Linux](https://dev.azure.com/onnxmltools/sklearn-onnx/_apis/build/status%2Fonnx.sklearn-onnx.linux.CI?branchName=refs%2Fpull%2F1009%2Fmerge)](https://dev.azure.com/onnxmltools/sklearn-onnx/_build/latest?definitionId=21&branchName=refs%2Fpull%2F1009%2Fmerge)
+[![Build Status Linux](https://dev.azure.com/onnxmltools/sklearn-onnx/_apis/build/status%2Fonnx.sklearn-onnx.linux.CI?branchName=refs%2Fpull%2F1020%2Fmerge)](https://dev.azure.com/onnxmltools/sklearn-onnx/_build/latest?definitionId=21&branchName=refs%2Fpull%2F1020%2Fmerge)
 
-[![Build Status Windows](https://dev.azure.com/onnxmltools/sklearn-onnx/_apis/build/status%2Fonnx.sklearn-onnx.win.CI?branchName=refs%2Fpull%2F1009%2Fmerge)](https://dev.azure.com/onnxmltools/sklearn-onnx/_build/latest?definitionId=22&branchName=refs%2Fpull%2F1009%2Fmerge)
+[![Build Status Windows](https://dev.azure.com/onnxmltools/sklearn-onnx/_apis/build/status%2Fonnx.sklearn-onnx.win.CI?branchName=refs%2Fpull%2F1020%2Fmerge)](https://dev.azure.com/onnxmltools/sklearn-onnx/_build/latest?definitionId=22&branchName=refs%2Fpull%2F1020%2Fmerge)
 
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
@@ -33,6 +33,38 @@ Or you can install from the source with the latest changes.
 pip install git+https://github.com/onnx/sklearn-onnx.git
 ```
 
+## Getting started
+
+```python
+# Train a model.
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+
+iris = load_iris()
+X, y = iris.data, iris.target
+X = X.astype(np.float32)
+X_train, X_test, y_train, y_test = train_test_split(X, y)
+clr = RandomForestClassifier()
+clr.fit(X_train, y_train)
+
+# Convert into ONNX format.
+from skl2onnx import to_onnx
+
+onx = to_onnx(clr, X[:1])
+with open("rf_iris.onnx", "wb") as f:
+    f.write(onx.SerializeToString())
+
+# Compute the prediction with onnxruntime.
+import onnxruntime as rt
+
+sess = rt.InferenceSession("rf_iris.onnx", providers=["CPUExecutionProvider"])
+input_name = sess.get_inputs()[0].name
+label_name = sess.get_outputs()[0].name
+pred_onx = sess.run([label_name], {input_name: X_test.astype(np.float32)})[0]
+```
+
 ## Contribute
 We welcome contributions in the form of feedback, ideas, or code.
 

diff --git a/docs/api_summary.rst b/docs/api_summary.rst
@@ -45,7 +45,6 @@ it is possible to enable logging:
     import logging
     logger = logging.getLogger('skl2onnx')
     logger.setLevel(logging.DEBUG)
-    logging.basicConfig(level=logging.DEBUG)
 
 Example :ref:`l-example-logging` illustrates what it looks like.
 

diff --git a/docs/conf.py b/docs/conf.py
@@ -5,6 +5,7 @@
 
 import os
 import sys
+import logging
 import warnings
 import skl2onnx
 
@@ -72,16 +73,14 @@
 
 linkcode_resolve = make_linkcode_resolve(
     "skl2onnx",
-    "https://github.com/onnx/skl2onnx/blob/{revision}/" "{package}/{path}#L{lineno}",
+    "https://github.com/onnx/skl2onnx/blob/{revision}/{package}/{path}#L{lineno}",
 )
 
 intersphinx_mapping = {
     "joblib": ("https://joblib.readthedocs.io/en/latest/", None),
     "python": ("https://docs.python.org/{.major}".format(sys.version_info), None),
     "matplotlib": ("https://matplotlib.org/", None),
-    "mlinsights": ("http://www.xavierdupre.fr/app/mlinsights/helpsphinx/", None),
     "numpy": ("https://docs.scipy.org/doc/numpy/", None),
-    "pyquickhelper": ("http://www.xavierdupre.fr/app/pyquickhelper/helpsphinx/", None),
     "onnxruntime": ("https://onnxruntime.ai/docs/api/python/", None),
     "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
@@ -144,4 +143,14 @@
 def setup(app):
     # Placeholder to initialize the folder before
     # generating the documentation.
+    logger = logging.getLogger("skl2onnx")
+    logger.setLevel(logging.WARNING)
+    logger = logging.getLogger("matplotlib.font_manager")
+    logger.setLevel(logging.WARNING)
+    logger = logging.getLogger("matplotlib.ticker")
+    logger.setLevel(logging.WARNING)
+    logger = logging.getLogger("PIL.PngImagePlugin")
+    logger.setLevel(logging.WARNING)
+    logger = logging.getLogger("graphviz._tools")
+    logger.setLevel(logging.WARNING)
     return app
diff --git a/docs/examples/plot_convert_model.py b/docs/examples/plot_convert_model.py
@@ -69,7 +69,7 @@
 with open("logreg_iris.onnx", "wb") as f:
     f.write(onx.SerializeToString())
 
-sess = rt.InferenceSession("logreg_iris.onnx")
+sess = rt.InferenceSession("logreg_iris.onnx", providers=["CPUExecutionProvider"])
 input_name = sess.get_inputs()[0].name
 label_name = sess.get_outputs()[0].name
 pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]

diff --git a/docs/examples/plot_convert_syntax.py b/docs/examples/plot_convert_syntax.py
@@ -31,7 +31,7 @@
 
 
 def predict_with_onnxruntime(onx, X):
-    sess = InferenceSession(onx.SerializeToString())
+    sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
     input_name = sess.get_inputs()[0].name
     res = sess.run(None, {input_name: X.astype(np.float32)})
     return res[0]

diff --git a/docs/examples/plot_convert_zipmap.py b/docs/examples/plot_convert_zipmap.py
@@ -48,7 +48,7 @@
 # Let's confirm the output type of the probabilities
 # is a list of dictionaries with onnxruntime.
 
-sess = rt.InferenceSession(onx.SerializeToString())
+sess = rt.InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
 res = sess.run(None, {"float_input": X_test.astype(numpy.float32)})
 print(res[1][:2])
 print("probabilities type:", type(res[1]))
@@ -66,7 +66,9 @@
     clr, initial_types=initial_type, options=options, target_opset=12
 )
 
-sess2 = rt.InferenceSession(onx2.SerializeToString())
+sess2 = rt.InferenceSession(
+    onx2.SerializeToString(), providers=["CPUExecutionProvider"]
+)
 res2 = sess2.run(None, {"float_input": X_test.astype(numpy.float32)})
 print(res2[1][:2])
 print("probabilities type:", type(res2[1]))
@@ -85,7 +87,9 @@
     clr, initial_types=initial_type, options=options, target_opset=12
 )
 
-sess3 = rt.InferenceSession(onx3.SerializeToString())
+sess3 = rt.InferenceSession(
+    onx3.SerializeToString(), providers=["CPUExecutionProvider"]
+)
 res3 = sess3.run(None, {"float_input": X_test.astype(numpy.float32)})
 for i, out in enumerate(sess3.get_outputs()):
     print(

diff --git a/docs/examples/plot_custom_model.py b/docs/examples/plot_custom_model.py
@@ -410,7 +410,7 @@ def predictable_tsne_converter(scope, operator, container):
 ##########################
 # Predictions with onnxruntime.
 
-sess = rt.InferenceSession("predictable_tsne.onnx")
+sess = rt.InferenceSession("predictable_tsne.onnx", providers=["CPUExecutionProvider"])
 
 pred_onx = sess.run(None, {"input": X_test[:1].astype(numpy.float32)})
 print("transform", pred_onx[0])

diff --git a/docs/examples/plot_custom_parser.py b/docs/examples/plot_custom_parser.py
@@ -259,7 +259,9 @@ def validator_classifier_parser(scope, model, inputs, custom_parsers=None):
 
 X32 = X_test[:5].astype(np.float32)
 
-sess = rt.InferenceSession(model_onnx.SerializeToString())
+sess = rt.InferenceSession(
+    model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+)
 results = sess.run(None, {"X": X32})
 
 print("--labels--")

diff --git a/docs/examples/plot_custom_parser_alternative.py b/docs/examples/plot_custom_parser_alternative.py
@@ -236,7 +236,9 @@ def validator_classifier_parser(scope, model, inputs, custom_parsers=None):
 
 X32 = X_test[:5].astype(np.float32)
 
-sess = rt.InferenceSession(model_onnx.SerializeToString())
+sess = rt.InferenceSession(
+    model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+)
 results = sess.run(None, {"X": X32})
 
 print("--labels--")

diff --git a/docs/examples/plot_errors_onnxruntime.py b/docs/examples/plot_errors_onnxruntime.py
@@ -41,7 +41,7 @@
     )
 
 example2 = "logreg_iris.onnx"
-sess = rt.InferenceSession(example2)
+sess = rt.InferenceSession(example2, providers=["CPUExecutionProvider"])
 
 input_name = sess.get_inputs()[0].name
 output_name = sess.get_outputs()[0].name

diff --git a/docs/examples/plot_gpr.py b/docs/examples/plot_gpr.py
@@ -51,7 +51,7 @@
 initial_type = [("X", FloatTensorType([None, X_train.shape[1]]))]
 onx = convert_sklearn(gpr, initial_types=initial_type, target_opset=12)
 
-sess = rt.InferenceSession(onx.SerializeToString())
+sess = rt.InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
 try:
     pred_onx = sess.run(None, {"X": X_test.astype(numpy.float32)})[0]
 except RuntimeError as e:
@@ -74,7 +74,7 @@
 initial_type = [("X", FloatTensorType([None, None]))]
 onx = convert_sklearn(gpr, initial_types=initial_type, target_opset=12)
 
-sess = rt.InferenceSession(onx.SerializeToString())
+sess = rt.InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
 pred_onx = sess.run(None, {"X": X_test.astype(numpy.float32)})[0]
 
 pred_skl = gpr.predict(X_test)
@@ -111,7 +111,9 @@
 initial_type = [("X", DoubleTensorType([None, None]))]
 onx64 = convert_sklearn(gpr, initial_types=initial_type, target_opset=12)
 
-sess64 = rt.InferenceSession(onx64.SerializeToString())
+sess64 = rt.InferenceSession(
+    onx64.SerializeToString(), providers=["CPUExecutionProvider"]
+)
 pred_onx64 = sess64.run(None, {"X": X_test})[0]
 
 print(pred_onx64[0, :10])
@@ -169,7 +171,9 @@
     gpr, initial_types=initial_type, options=options, target_opset=12
 )
 
-sess64_std = rt.InferenceSession(onx64_std.SerializeToString())
+sess64_std = rt.InferenceSession(
+    onx64_std.SerializeToString(), providers=["CPUExecutionProvider"]
+)
 pred_onx64_std = sess64_std.run(None, {"X": X_test[:5]})
 
 pprint.pprint(pred_onx64_std)

diff --git a/docs/examples/plot_intermediate_outputs.py b/docs/examples/plot_intermediate_outputs.py
@@ -195,7 +195,7 @@ def convert_dataframe_schema(df, drop=None):
 ################################
 # We are ready to run *onnxruntime*.
 
-sess = rt.InferenceSession("pipeline_titanic.onnx")
+sess = rt.InferenceSession("pipeline_titanic.onnx", providers=["CPUExecutionProvider"])
 pred_onx = sess.run(None, inputs)
 print("predict", pred_onx[0][:5])
 print("predict_proba", pred_onx[1][:1])
@@ -228,7 +228,9 @@ def convert_dataframe_schema(df, drop=None):
 ################################
 # Let's compute the numerical features.
 
-sess = rt.InferenceSession("pipeline_titanic_numerical.onnx")
+sess = rt.InferenceSession(
+    "pipeline_titanic_numerical.onnx", providers=["CPUExecutionProvider"]
+)
 numX = sess.run(None, inputs)
 print("numerical features", numX[0][:1])
 
@@ -238,7 +240,9 @@ def convert_dataframe_schema(df, drop=None):
 print(model_onnx)
 text_onnx = select_model_inputs_outputs(model_onnx, "variable2")
 save_onnx_model(text_onnx, "pipeline_titanic_textual.onnx")
-sess = rt.InferenceSession("pipeline_titanic_textual.onnx")
+sess = rt.InferenceSession(
+    "pipeline_titanic_textual.onnx", providers=["CPUExecutionProvider"]
+)
 numT = sess.run(None, inputs)
 print("textual features", numT[0][:1])
 

diff --git a/docs/examples/plot_investigate_pipeline.py b/docs/examples/plot_investigate_pipeline.py
@@ -55,7 +55,9 @@
 initial_types = [("input", FloatTensorType((None, X_digits.shape[1])))]
 model_onnx = convert_sklearn(pipe, initial_types=initial_types, target_opset=12)
 
-sess = rt.InferenceSession(model_onnx.SerializeToString())
+sess = rt.InferenceSession(
+    model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+)
 print("skl predict_proba")
 print(pipe.predict_proba(X_digits[:2]))
 onx_pred = sess.run(None, {"input": X_digits[:2].astype(np.float32)})[1]
@@ -82,7 +84,9 @@
 
 for i, step in enumerate(steps):
     onnx_step = step["onnx_step"]
-    sess = rt.InferenceSession(onnx_step.SerializeToString())
+    sess = rt.InferenceSession(
+        onnx_step.SerializeToString(), providers=["CPUExecutionProvider"]
+    )
     onnx_outputs = sess.run(None, {"input": X_digits[:2].astype(np.float32)})
     skl_outputs = step["model"]._debug.outputs
     print("step 1", type(step["model"]))

diff --git a/docs/examples/plot_logging.py b/docs/examples/plot_logging.py
@@ -46,7 +46,7 @@
 onx = convert_sklearn(clr, initial_types=initial_type, target_opset=12)
 
 
-sess = rt.InferenceSession(onx.SerializeToString())
+sess = rt.InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
 input_name = sess.get_inputs()[0].name
 label_name = sess.get_outputs()[0].name
 pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]
@@ -74,18 +74,17 @@
 
 logger = logging.getLogger("skl2onnx")
 logger.setLevel(logging.DEBUG)
-logging.basicConfig(level=logging.DEBUG)
 
 convert_sklearn(clr, initial_types=initial_type, target_opset=12)
 
 ###########################
 # And to disable it.
 
 logger.setLevel(logging.INFO)
-logging.basicConfig(level=logging.INFO)
 
 convert_sklearn(clr, initial_types=initial_type, target_opset=12)
 
+logger.setLevel(logging.WARNING)
 
 #################################
 # **Versions used for this example**

diff --git a/docs/examples/plot_metadata.py b/docs/examples/plot_metadata.py
@@ -39,7 +39,7 @@
 #############################
 # With *ONNX Runtime*:
 
-sess = InferenceSession(example)
+sess = InferenceSession(example, providers=["CPUExecutionProvider"])
 meta = sess.get_modelmeta()
 
 print("custom_metadata_map={}".format(meta.custom_metadata_map))

diff --git a/docs/examples/plot_nmf.py b/docs/examples/plot_nmf.py
@@ -113,7 +113,9 @@ def nmf_to_onnx(W, H, op_version=12):
 ########################################
 # Let's compute prediction with it.
 
-sess = InferenceSession(model_onnx.SerializeToString())
+sess = InferenceSession(
+    model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+)
 
 
 def predict_onnx(sess, row_indices, col_indices):

diff --git a/docs/examples/plot_onnx_operators.py b/docs/examples/plot_onnx_operators.py
@@ -153,7 +153,9 @@
 def predict_with_onnxruntime(model_def, *inputs):
     import onnxruntime as ort
 
-    sess = ort.InferenceSession(model_def.SerializeToString())
+    sess = ort.InferenceSession(
+        model_def.SerializeToString(), providers=["CPUExecutionProvider"]
+    )
     names = [i.name for i in sess.get_inputs()]
     dinputs = {name: input for name, input in zip(names, inputs)}
     res = sess.run(None, dinputs)

diff --git a/docs/examples/plot_pipeline_lightgbm.py b/docs/examples/plot_pipeline_lightgbm.py
@@ -112,7 +112,9 @@
 # Predictions with onnxruntime.
 
 try:
-    sess = rt.InferenceSession("pipeline_lightgbm.onnx")
+    sess = rt.InferenceSession(
+        "pipeline_lightgbm.onnx", providers=["CPUExecutionProvider"]
+    )
 except OrtFail as e:
     print(e)
     print("The converter requires onnxmltools>=1.7.0")

diff --git a/docs/examples/plot_pipeline_xgboost.py b/docs/examples/plot_pipeline_xgboost.py
@@ -128,7 +128,7 @@
 ##########################
 # Predictions with onnxruntime.
 
-sess = rt.InferenceSession("pipeline_xgboost.onnx")
+sess = rt.InferenceSession("pipeline_xgboost.onnx", providers=["CPUExecutionProvider"])
 pred_onx = sess.run(None, {"input": X[:5].astype(numpy.float32)})
 print("predict", pred_onx[0])
 print("predict_proba", pred_onx[1][:1])

diff --git a/docs/examples/plot_tfidfvectorizer.py b/docs/examples/plot_tfidfvectorizer.py
@@ -197,7 +197,7 @@ def transform(self, posts):
 ##########################
 # Predictions with onnxruntime.
 
-sess = rt.InferenceSession("pipeline_tfidf.onnx")
+sess = rt.InferenceSession("pipeline_tfidf.onnx", providers=["CPUExecutionProvider"])
 print("---", train_data[0])
 inputs = {"input": train_data[:1]}
 pred_onx = sess.run(None, inputs)