From bcd9499ba78bdcdb321c4c2195f82ce3fc15944c Mon Sep 17 00:00:00 2001 From: bskrlj Date: Tue, 12 Sep 2023 09:22:21 +0200 Subject: [PATCH 1/2] test --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 309af33..13df678 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import os import setuptools -# test +# test2 def _parse_requirements(file): required_packages = [] From f7b3c9a13e16317f4ee14527d17c63c2a50f19a0 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Tue, 12 Sep 2023 11:26:58 +0200 Subject: [PATCH 2/2] docs --- DOCS.md | 12 + docs/index.html | 7 + docs/outrank.html | 276 ++ docs/outrank/algorithms.html | 240 ++ docs/outrank/algorithms/feature_ranking.html | 237 ++ .../feature_ranking/ranking_mi_numba.html | 633 +++++ .../algorithms/importance_estimator.html | 735 ++++++ docs/outrank/algorithms/sketches.html | 237 ++ .../sketches/counting_ultiloglog.html | 636 +++++ .../algorithms/synthetic_data_generators.html | 237 ++ .../generator_naive.html | 342 +++ docs/outrank/core_ranking.html | 2078 ++++++++++++++++ docs/outrank/core_selftest.html | 239 ++ docs/outrank/core_utils.html | 2209 +++++++++++++++++ docs/outrank/feature_transformations.html | 238 ++ .../feature_transformer_vault.html | 259 ++ .../default_transformers.html | 312 +++ .../fw_transformers.html | 324 +++ .../ranking_transformers.html | 914 +++++++ docs/outrank/task_generators.html | 349 +++ docs/outrank/task_ranking.html | 797 ++++++ docs/outrank/task_selftest.html | 342 +++ docs/outrank/task_summary.html | 401 +++ docs/outrank/task_visualization.html | 301 +++ docs/outrank/visualizations.html | 237 ++ .../visualizations/ranking_visualization.html | 980 ++++++++ docs/search.js | 46 + outrank/__init__.py | 3 + run_build_docs.sh | 1 + setup.py | 2 +- 30 files changed, 13623 insertions(+), 1 deletion(-) create mode 100644 DOCS.md create mode 100644 docs/index.html create mode 100644 docs/outrank.html create mode 100644 docs/outrank/algorithms.html 
create mode 100644 docs/outrank/algorithms/feature_ranking.html create mode 100644 docs/outrank/algorithms/feature_ranking/ranking_mi_numba.html create mode 100644 docs/outrank/algorithms/importance_estimator.html create mode 100644 docs/outrank/algorithms/sketches.html create mode 100644 docs/outrank/algorithms/sketches/counting_ultiloglog.html create mode 100644 docs/outrank/algorithms/synthetic_data_generators.html create mode 100644 docs/outrank/algorithms/synthetic_data_generators/generator_naive.html create mode 100644 docs/outrank/core_ranking.html create mode 100644 docs/outrank/core_selftest.html create mode 100644 docs/outrank/core_utils.html create mode 100644 docs/outrank/feature_transformations.html create mode 100644 docs/outrank/feature_transformations/feature_transformer_vault.html create mode 100644 docs/outrank/feature_transformations/feature_transformer_vault/default_transformers.html create mode 100644 docs/outrank/feature_transformations/feature_transformer_vault/fw_transformers.html create mode 100644 docs/outrank/feature_transformations/ranking_transformers.html create mode 100644 docs/outrank/task_generators.html create mode 100644 docs/outrank/task_ranking.html create mode 100644 docs/outrank/task_selftest.html create mode 100644 docs/outrank/task_summary.html create mode 100644 docs/outrank/task_visualization.html create mode 100644 docs/outrank/visualizations.html create mode 100644 docs/outrank/visualizations/ranking_visualization.html create mode 100644 docs/search.js create mode 100644 run_build_docs.sh diff --git a/DOCS.md b/DOCS.md new file mode 100644 index 0000000..e7cc33f --- /dev/null +++ b/DOCS.md @@ -0,0 +1,12 @@ + + ░█████╗░██╗░░░██╗████████╗██████╗░░█████╗░███╗░░██╗██╗░░██╗ + ██╔══██╗██║░░░██║╚══██╔══╝██╔══██╗██╔══██╗████╗░██║██║░██╔╝ + ██║░░██║██║░░░██║░░░██║░░░██████╔╝███████║██╔██╗██║█████═╝░ + ██║░░██║██║░░░██║░░░██║░░░██╔══██╗██╔══██║██║╚████║██╔═██╗░ + ╚█████╔╝╚██████╔╝░░░██║░░░██║░░██║██║░░██║██║░╚███║██║░╚██╗ + 
░╚════╝░░╚═════╝░░░░╚═╝░░░╚═╝░░╚═╝╚═╝░░╚═╝╚═╝░░╚══╝╚═╝░░╚═╝ + + +# Welcome to OutRank's documentation! + +All functions/methods can be searched-for (search bar on the left). diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..045e692 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/docs/outrank.html b/docs/outrank.html new file mode 100644 index 0000000..6b0cccf --- /dev/null +++ b/docs/outrank.html @@ -0,0 +1,276 @@ + + + + + + + outrank API documentation + + + + + + + + + +
+
+

+outrank

+ +
░█████╗░██╗░░░██╗████████╗██████╗░░█████╗░███╗░░██╗██╗░░██╗
+██╔══██╗██║░░░██║╚══██╔══╝██╔══██╗██╔══██╗████╗░██║██║░██╔╝
+██║░░██║██║░░░██║░░░██║░░░██████╔╝███████║██╔██╗██║█████═╝░
+██║░░██║██║░░░██║░░░██║░░░██╔══██╗██╔══██║██║╚████║██╔═██╗░
+╚█████╔╝╚██████╔╝░░░██║░░░██║░░██║██║░░██║██║░╚███║██║░╚██╗
+░╚════╝░░╚═════╝░░░░╚═╝░░░╚═╝░░╚═╝╚═╝░░╚═╝╚═╝░░╚══╝╚═╝░░╚═╝
+
+ +

Welcome to OutRank's documentation!

+ +

All functions/methods can be searched for (search bar on the left).

+
+ + + + + +
 1"""
+ 2.. include:: ../DOCS.md
+ 3"""
+ 4
+ 5from __future__ import annotations
+ 6
+ 7import logging
+ 8
+ 9logging.basicConfig(
+10    format='%(asctime)s - %(message)s',
+11    datefmt='%d-%b-%y %H:%M:%S',
+12)
+13logging.getLogger(__name__).setLevel(logging.INFO)
+
+ + +
+
+ + diff --git a/docs/outrank/algorithms.html b/docs/outrank/algorithms.html new file mode 100644 index 0000000..94d44f0 --- /dev/null +++ b/docs/outrank/algorithms.html @@ -0,0 +1,240 @@ + + + + + + + outrank.algorithms API documentation + + + + + + + + + +
+
+

+outrank.algorithms

+ + + + + +
+
+ + diff --git a/docs/outrank/algorithms/feature_ranking.html b/docs/outrank/algorithms/feature_ranking.html new file mode 100644 index 0000000..9f0d614 --- /dev/null +++ b/docs/outrank/algorithms/feature_ranking.html @@ -0,0 +1,237 @@ + + + + + + + outrank.algorithms.feature_ranking API documentation + + + + + + + + + +
+
+

+outrank.algorithms.feature_ranking

+ + + + + +
+
+ + diff --git a/docs/outrank/algorithms/feature_ranking/ranking_mi_numba.html b/docs/outrank/algorithms/feature_ranking/ranking_mi_numba.html new file mode 100644 index 0000000..89c8bc5 --- /dev/null +++ b/docs/outrank/algorithms/feature_ranking/ranking_mi_numba.html @@ -0,0 +1,633 @@ + + + + + + + outrank.algorithms.feature_ranking.ranking_mi_numba API documentation + + + + + + + + + +
+
+

+outrank.algorithms.feature_ranking.ranking_mi_numba

+ + + + + + +
  1from __future__ import annotations
+  2
+  3import numpy as np
+  4from numba import njit
+  5from numba import prange
+  6
+  7np.random.seed(123)
+  8# Fast Numba-based approximative mutual information
+  9
+ 10
+ 11@njit(
+ 12    'Tuple((int32[:], int32[:]))(int32[:])',
+ 13    cache=True,
+ 14    fastmath=True,
+ 15    error_model='numpy',
+ 16    boundscheck=True,
+ 17)
+ 18def numba_unique(a):
+ 19    """Identify unique elements in an array, fast"""
+ 20
+ 21    len_a = a.shape[0]
+ 22    container = np.zeros(np.max(a) + 1, dtype=np.int32)
+ 23    for el in range(len_a):
+ 24        container[a[el]] += 1
+ 25
+ 26    unique_values = np.where(container != 0)[0]
+ 27    unique_counts = container[unique_values]
+ 28    return unique_values.astype(np.int32), unique_counts.astype(np.int32)
+ 29
+ 30
+ 31@njit(
+ 32    'float32(int32[:], int32[:], int32, float32)',
+ 33    cache=True,
+ 34    fastmath=True,
+ 35    error_model='numpy',
+ 36    boundscheck=True,
+ 37)
+ 38def compute_conditional_entropy(Y_classes, class_values, class_var_shape, initial_prob):
+ 39    conditional_entropy = 0.0
+ 40
+ 41    for c in class_values:
+ 42        conditional_prob = np.count_nonzero(Y_classes == c) / class_var_shape
+ 43        if conditional_prob != 0:
+ 44            conditional_entropy -= (
+ 45                initial_prob * conditional_prob * np.log(conditional_prob)
+ 46            )
+ 47
+ 48    return conditional_entropy
+ 49
+ 50
+ 51@njit(
+ 52    'float32(int32[:], int32[:], int32, int32[:], int32[:], b1)',
+ 53    cache=True,
+ 54    parallel=False,
+ 55    fastmath=True,
+ 56    error_model='numpy',
+ 57    boundscheck=True,
+ 58)
+ 59def compute_entropies(
+ 60    X, Y, all_events, f_values, f_value_counts, cardinality_correction,
+ 61):
+ 62    """Core entropy computation function"""
+ 63
+ 64    conditional_entropy = 0.0
+ 65    background_cond_entropy = 0.0
+ 66    full_entropy = 0.0
+ 67
+ 68    class_values, class_counts = numba_unique(Y)
+ 69
+ 70    if not cardinality_correction:
+ 71        for k in prange(len(class_counts)):
+ 72            class_probability = class_counts[k] / all_events
+ 73            full_entropy += -class_probability * np.log(class_probability)
+ 74
+ 75    for f_index in prange(len(f_values)):
+ 76        _f_value_counts = f_value_counts[f_index]
+ 77
+ 78        if _f_value_counts == 1:
+ 79            continue
+ 80
+ 81        initial_prob = _f_value_counts / all_events
+ 82        x_value_subspace = np.where(X == f_values[f_index])
+ 83        Y_classes = Y[x_value_subspace]
+ 84        conditional_entropy += compute_conditional_entropy(
+ 85            Y_classes, class_values, _f_value_counts, initial_prob,
+ 86        )
+ 87
+ 88        if cardinality_correction:
+ 89            # A neat hack that seems to work fine (permutations are expensive)
+ 90            Y_classes = np.roll(Y, _f_value_counts)[x_value_subspace]
+ 91
+ 92            background_cond_entropy += compute_conditional_entropy(
+ 93                Y_classes, class_values, _f_value_counts, initial_prob,
+ 94            )
+ 95
+ 96    if not cardinality_correction:
+ 97        return full_entropy - conditional_entropy
+ 98
+ 99    else:
+100        # note: full entropy falls out during derivation of final term
+101        core_joint_entropy = -conditional_entropy + background_cond_entropy
+102        return core_joint_entropy
+103
+104
+105@njit(
+106    'float32(int32[:], int32[:], float32, b1)',
+107    cache=True,
+108    fastmath=True,
+109    error_model='numpy',
+110    boundscheck=True,
+111)
+112def mutual_info_estimator_numba(
+113    Y, X, approximation_factor=1, cardinality_correction=False,
+114):
+115    """Core estimator logic. Compute unique elements, subset if required"""
+116
+117    all_events = len(X)
+118    f_values, f_value_counts = numba_unique(X)
+119
+120    # Diagonal entries
+121    if np.sum(X - Y) == 0:
+122        cardinality_correction = False
+123
+124    if approximation_factor < 1:
+125        subspace_size = int(approximation_factor * all_events)
+126        if subspace_size != 0:
+127            subspace = np.random.randint(0, all_events, size=subspace_size)
+128            X = X[subspace]
+129            Y = Y[subspace]
+130
+131    joint_entropy_core = compute_entropies(
+132        X, Y, all_events, f_values, f_value_counts, cardinality_correction,
+133    )
+134
+135    return approximation_factor * joint_entropy_core
+136
+137
+138if __name__ == '__main__':
+139    import pandas as pd
+140    from sklearn.feature_selection import mutual_info_classif
+141
+142    np.random.seed(123)
+143    import time
+144
+145    final_times = []
+146    for algo in ['MI-numba-randomized']:
+147        for order in range(20, 21):
+148            for j in range(1):
+149                start = time.time()
+150                a = np.random.randint(1000, size=2**order).astype(np.int32)
+151                b = np.random.randint(1000, size=2**order).astype(np.int32)
+152                if algo == 'MI':
+153                    final_score = mutual_info_classif(
+154                        a.reshape(-1, 1), b.reshape(-1), discrete_features=True,
+155                    )
+156                elif algo == 'MI-numba-randomized':
+157                    final_score = mutual_info_estimator_numba(
+158                        a, b, np.float32(1.0), True,
+159                    )
+160                elif algo == 'MI-numba':
+161                    final_score = mutual_info_estimator_numba(
+162                        a, b, np.float32(1.0), False,
+163                    )
+164                elif algo == 'MI-numba-randomized-ap':
+165                    final_score = mutual_info_estimator_numba(
+166                        a, b, np.float32(0.3), True,
+167                    )
+168                elif algo == 'MI-numba-ap':
+169                    final_score = mutual_info_estimator_numba(
+170                        a, b, np.float32(0.3), False,
+171                    )
+172
+173                end = time.time()
+174                tdiff = end - start
+175                instance = {
+176                    'time': tdiff,
+177                    'samples 2e': order, 'algorithm': algo,
+178                }
+179                final_times.append(instance)
+180                print(instance)
+181    dfx = pd.DataFrame(final_times)
+182    dfx = dfx.sort_values(by=['samples 2e'])
+183    print(dfx)
+
+ + +
+
+ +
+
@njit('Tuple((int32[:], int32[:]))(int32[:])', cache=True, fastmath=True, error_model='numpy', boundscheck=True)
+ + def + numba_unique(a): + + + +
+ +
12@njit(
+13    'Tuple((int32[:], int32[:]))(int32[:])',
+14    cache=True,
+15    fastmath=True,
+16    error_model='numpy',
+17    boundscheck=True,
+18)
+19def numba_unique(a):
+20    """Identify unique elements in an array, fast"""
+21
+22    len_a = a.shape[0]
+23    container = np.zeros(np.max(a) + 1, dtype=np.int32)
+24    for el in range(len_a):
+25        container[a[el]] += 1
+26
+27    unique_values = np.where(container != 0)[0]
+28    unique_counts = container[unique_values]
+29    return unique_values.astype(np.int32), unique_counts.astype(np.int32)
+
+ + +

Identify unique elements in an array, fast

+
+ + +
+
+ +
+
@njit('float32(int32[:], int32[:], int32, float32)', cache=True, fastmath=True, error_model='numpy', boundscheck=True)
+ + def + compute_conditional_entropy(Y_classes, class_values, class_var_shape, initial_prob): + + + +
+ +
32@njit(
+33    'float32(int32[:], int32[:], int32, float32)',
+34    cache=True,
+35    fastmath=True,
+36    error_model='numpy',
+37    boundscheck=True,
+38)
+39def compute_conditional_entropy(Y_classes, class_values, class_var_shape, initial_prob):
+40    conditional_entropy = 0.0
+41
+42    for c in class_values:
+43        conditional_prob = np.count_nonzero(Y_classes == c) / class_var_shape
+44        if conditional_prob != 0:
+45            conditional_entropy -= (
+46                initial_prob * conditional_prob * np.log(conditional_prob)
+47            )
+48
+49    return conditional_entropy
+
+ + + + +
+
+ +
+
@njit('float32(int32[:], int32[:], int32, int32[:], int32[:], b1)', cache=True, parallel=False, fastmath=True, error_model='numpy', boundscheck=True)
+ + def + compute_entropies(X, Y, all_events, f_values, f_value_counts, cardinality_correction): + + + +
+ +
 52@njit(
+ 53    'float32(int32[:], int32[:], int32, int32[:], int32[:], b1)',
+ 54    cache=True,
+ 55    parallel=False,
+ 56    fastmath=True,
+ 57    error_model='numpy',
+ 58    boundscheck=True,
+ 59)
+ 60def compute_entropies(
+ 61    X, Y, all_events, f_values, f_value_counts, cardinality_correction,
+ 62):
+ 63    """Core entropy computation function"""
+ 64
+ 65    conditional_entropy = 0.0
+ 66    background_cond_entropy = 0.0
+ 67    full_entropy = 0.0
+ 68
+ 69    class_values, class_counts = numba_unique(Y)
+ 70
+ 71    if not cardinality_correction:
+ 72        for k in prange(len(class_counts)):
+ 73            class_probability = class_counts[k] / all_events
+ 74            full_entropy += -class_probability * np.log(class_probability)
+ 75
+ 76    for f_index in prange(len(f_values)):
+ 77        _f_value_counts = f_value_counts[f_index]
+ 78
+ 79        if _f_value_counts == 1:
+ 80            continue
+ 81
+ 82        initial_prob = _f_value_counts / all_events
+ 83        x_value_subspace = np.where(X == f_values[f_index])
+ 84        Y_classes = Y[x_value_subspace]
+ 85        conditional_entropy += compute_conditional_entropy(
+ 86            Y_classes, class_values, _f_value_counts, initial_prob,
+ 87        )
+ 88
+ 89        if cardinality_correction:
+ 90            # A neat hack that seems to work fine (permutations are expensive)
+ 91            Y_classes = np.roll(Y, _f_value_counts)[x_value_subspace]
+ 92
+ 93            background_cond_entropy += compute_conditional_entropy(
+ 94                Y_classes, class_values, _f_value_counts, initial_prob,
+ 95            )
+ 96
+ 97    if not cardinality_correction:
+ 98        return full_entropy - conditional_entropy
+ 99
+100    else:
+101        # note: full entropy falls out during derivation of final term
+102        core_joint_entropy = -conditional_entropy + background_cond_entropy
+103        return core_joint_entropy
+
+ + +

Core entropy computation function

+
+ + +
+
+ +
+
@njit('float32(int32[:], int32[:], float32, b1)', cache=True, fastmath=True, error_model='numpy', boundscheck=True)
+ + def + mutual_info_estimator_numba(Y, X, approximation_factor=1, cardinality_correction=False): + + + +
+ +
106@njit(
+107    'float32(int32[:], int32[:], float32, b1)',
+108    cache=True,
+109    fastmath=True,
+110    error_model='numpy',
+111    boundscheck=True,
+112)
+113def mutual_info_estimator_numba(
+114    Y, X, approximation_factor=1, cardinality_correction=False,
+115):
+116    """Core estimator logic. Compute unique elements, subset if required"""
+117
+118    all_events = len(X)
+119    f_values, f_value_counts = numba_unique(X)
+120
+121    # Diagonal entries
+122    if np.sum(X - Y) == 0:
+123        cardinality_correction = False
+124
+125    if approximation_factor < 1:
+126        subspace_size = int(approximation_factor * all_events)
+127        if subspace_size != 0:
+128            subspace = np.random.randint(0, all_events, size=subspace_size)
+129            X = X[subspace]
+130            Y = Y[subspace]
+131
+132    joint_entropy_core = compute_entropies(
+133        X, Y, all_events, f_values, f_value_counts, cardinality_correction,
+134    )
+135
+136    return approximation_factor * joint_entropy_core
+
+ + +

Core estimator logic. Compute unique elements, subset if required

+
+ + +
+
+ + diff --git a/docs/outrank/algorithms/importance_estimator.html b/docs/outrank/algorithms/importance_estimator.html new file mode 100644 index 0000000..bb90b6d --- /dev/null +++ b/docs/outrank/algorithms/importance_estimator.html @@ -0,0 +1,735 @@ + + + + + + + outrank.algorithms.importance_estimator API documentation + + + + + + + + + +
+
+

+outrank.algorithms.importance_estimator

+ + + + + + +
  1# A module for pairwise computation of importances -- entrypoint for the core ranking algorithm(s)
+  2from __future__ import annotations
+  3
+  4import operator
+  5import traceback
+  6from typing import Any
+  7from typing import Dict
+  8
+  9import numpy as np
+ 10import pandas as pd
+ 11from scipy.stats import pearsonr
+ 12from sklearn.feature_selection import mutual_info_classif
+ 13from sklearn.linear_model import LogisticRegression
+ 14from sklearn.metrics import adjusted_mutual_info_score
+ 15from sklearn.model_selection import cross_val_score
+ 16from sklearn.preprocessing import OneHotEncoder
+ 17from sklearn.svm import SVC
+ 18
+ 19try:
+ 20    from outrank.algorithms.feature_ranking import ranking_mi_numba
+ 21
+ 22    numba_available = True
+ 23
+ 24except Exception as es:
+ 25    traceback.print_exc(0)
+ 26    numba_available = False
+ 27
+ 28
+ 29def sklearn_MI(vector_first: Any, vector_second: Any) -> float:
+ 30    estimate_feature_importance = mutual_info_classif(
+ 31        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
+ 32    )[0]
+ 33    return estimate_feature_importance
+ 34
+ 35
+ 36def sklearn_surrogate(
+ 37    vector_first: Any, vector_second: Any, surrogate_model: str,
+ 38) -> float:
+ 39    if surrogate_model == 'surrogate-LR':
+ 40        clf = LogisticRegression(max_iter=100000)
+ 41    elif surrogate_model == 'surrogate-SVM':
+ 42        clf = SVC(gamma='auto', probability=True)
+ 43
+ 44    transf = OneHotEncoder()
+ 45
+ 46    # They do not commute, swap if needed
+ 47    if len(np.unique(vector_second) > 2):
+ 48        vector_third = vector_second
+ 49        vector_second = vector_first
+ 50        vector_first = vector_third
+ 51        del vector_third
+ 52
+ 53    unique_values, counts = np.unique(vector_second, return_counts=True)
+ 54
+ 55    # Establish min support for this type of ranking.
+ 56    if counts[0] < len(unique_values) * (2**5):
+ 57        estimate_feature_importance = 0
+ 58
+ 59    else:
+ 60        vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
+ 61        estimate_feature_importance_list = cross_val_score(
+ 62            clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
+ 63        )
+ 64
+ 65        estimate_feature_importance = 1 + \
+ 66            np.median(estimate_feature_importance_list)
+ 67
+ 68    return estimate_feature_importance
+ 69
+ 70
+ 71def numba_mi(vector_first, vector_second, heuristic):
+ 72    if heuristic == 'MI-numba-randomized':
+ 73        cardinality_correction = True
+ 74
+ 75    else:
+ 76        cardinality_correction = False
+ 77
+ 78    estimate_feature_importance = ranking_mi_numba.mutual_info_estimator_numba(
+ 79        vector_first.reshape(-1).astype(np.int32),
+ 80        vector_second.reshape(-1).astype(np.int32),
+ 81        approximation_factor=np.float32(1.0),
+ 82        cardinality_correction=cardinality_correction,
+ 83    )
+ 84
+ 85    return estimate_feature_importance
+ 86
+ 87
+ 88def sklearn_mi_adj(vector_first, vector_second):
+ 89    # AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
+ 90    estimate_feature_importance = adjusted_mutual_info_score(
+ 91        vector_first.reshape(-1), vector_second.reshape(-1),
+ 92    )
+ 93    return estimate_feature_importance
+ 94
+ 95
+ 96def get_importances_estimate_pairwise(combination, args, tmp_df):
+ 97    """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
+ 98
+ 99    feature_one = combination[0]
+100    feature_two = combination[1]
+101
+102    vector_first = tmp_df[[feature_one]].values.ravel()
+103    vector_second = tmp_df[[feature_two]].values.ravel()
+104
+105    if len(vector_first) == 0 or len(vector_second) == 0:
+106        return [feature_one, feature_two, 0]
+107
+108    # Compute score based on the selected heuristic.
+109    if args.heuristic == 'MI':
+110        # Compute the infoGain
+111        estimate_feature_importance = sklearn_MI(vector_first, vector_second)
+112
+113    elif 'surrogate-' in args.heuristic:
+114        estimate_feature_importance = sklearn_surrogate(
+115            vector_first, vector_second, args.heuristic,
+116        )
+117
+118    elif 'MI-numba' in args.heuristic:
+119        estimate_feature_importance = numba_mi(
+120            vector_first, vector_second, args.heuristic,
+121        )
+122
+123    elif args.heuristic == 'AMI':
+124        estimate_feature_importance = sklearn_mi_adj(
+125            vector_first, vector_second,
+126        )
+127
+128    elif args.heuristic == 'correlation-Pearson':
+129        estimate_feature_importance = pearsonr(vector_first, vector_second)[0]
+130
+131    elif args.heuristic == 'Constant':
+132        estimate_feature_importance = 0.0
+133
+134    else:
+135        raise ValueError(
+136            'Please select one of the possible heuristics (MI, chi2)',
+137        )
+138
+139    return (feature_one, feature_two, estimate_feature_importance)
+140
+141
+142def rank_features_3MR(
+143    relevance_dict: dict[str, float],
+144    redundancy_dict: dict[tuple[Any, Any], Any],
+145    relational_dict: dict[tuple[Any, Any], Any],
+146    strategy: str = 'median',
+147    alpha: float = 1,
+148    beta: float = 1,
+149) -> pd.DataFrame:
+150    all_features = relevance_dict.keys()
+151    most_important_feature = max(
+152        relevance_dict.items(), key=operator.itemgetter(1),
+153    )[0]
+154    ranked_features = [most_important_feature]
+155
+156    def calc_higher_order(feature, is_redundancy=True):
+157        values = []
+158        for feat in ranked_features:
+159            if is_redundancy:
+160                values.append(redundancy_dict[(feat, feature)])
+161            else:
+162                values.append(relational_dict[(feat, feature)])
+163        if strategy == 'sum':
+164            return sum(values)
+165        if strategy == 'mean':
+166            return np.mean(values)
+167        return np.median(values)
+168
+169    while len(ranked_features) != len(all_features):
+170        top_importance = 0
+171        most_important_feature = ''
+172
+173        for ind, feat in enumerate(set(all_features) - set(ranked_features)):
+174            feature_redundancy = calc_higher_order(feat)
+175            feature_relation = calc_higher_order(feat, False)
+176            feature_relevance = relevance_dict[feat]
+177            importance = (
+178                feature_relevance - alpha * feature_redundancy + beta * feature_relation
+179            )
+180
+181            if (importance > top_importance) or (ind == 0):
+182                top_importance = importance
+183                most_important_feature = feat
+184        ranked_features.append(most_important_feature)
+185    return pd.DataFrame(
+186        {
+187            'Feature': ranked_features,
+188            '3mr_ranking': list(range(1, len(ranked_features) + 1)),
+189        },
+190    )
+191
+192
+193def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
+194    # TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
+195    # TODO - this is to be executed directly on df - no need for parallel kernel(s)
+196    pass
+
+ + +
+
+ +
+ + def + sklearn_MI(vector_first: Any, vector_second: Any) -> float: + + + +
+ +
30def sklearn_MI(vector_first: Any, vector_second: Any) -> float:
+31    estimate_feature_importance = mutual_info_classif(
+32        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
+33    )[0]
+34    return estimate_feature_importance
+
+ + + + +
+
+ +
+ + def + sklearn_surrogate(vector_first: Any, vector_second: Any, surrogate_model: str) -> float: + + + +
+ +
37def sklearn_surrogate(
+38    vector_first: Any, vector_second: Any, surrogate_model: str,
+39) -> float:
+40    if surrogate_model == 'surrogate-LR':
+41        clf = LogisticRegression(max_iter=100000)
+42    elif surrogate_model == 'surrogate-SVM':
+43        clf = SVC(gamma='auto', probability=True)
+44
+45    transf = OneHotEncoder()
+46
+47    # They do not commute, swap if needed
+48    if len(np.unique(vector_second) > 2):
+49        vector_third = vector_second
+50        vector_second = vector_first
+51        vector_first = vector_third
+52        del vector_third
+53
+54    unique_values, counts = np.unique(vector_second, return_counts=True)
+55
+56    # Establish min support for this type of ranking.
+57    if counts[0] < len(unique_values) * (2**5):
+58        estimate_feature_importance = 0
+59
+60    else:
+61        vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
+62        estimate_feature_importance_list = cross_val_score(
+63            clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
+64        )
+65
+66        estimate_feature_importance = 1 + \
+67            np.median(estimate_feature_importance_list)
+68
+69    return estimate_feature_importance
+
+ + + + +
+
+ +
+ + def + numba_mi(vector_first, vector_second, heuristic): + + + +
+ +
72def numba_mi(vector_first, vector_second, heuristic):
+73    if heuristic == 'MI-numba-randomized':
+74        cardinality_correction = True
+75
+76    else:
+77        cardinality_correction = False
+78
+79    estimate_feature_importance = ranking_mi_numba.mutual_info_estimator_numba(
+80        vector_first.reshape(-1).astype(np.int32),
+81        vector_second.reshape(-1).astype(np.int32),
+82        approximation_factor=np.float32(1.0),
+83        cardinality_correction=cardinality_correction,
+84    )
+85
+86    return estimate_feature_importance
+
+ + + + +
+
+ +
+ + def + sklearn_mi_adj(vector_first, vector_second): + + + +
+ +
89def sklearn_mi_adj(vector_first, vector_second):
+90    # AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
+91    estimate_feature_importance = adjusted_mutual_info_score(
+92        vector_first.reshape(-1), vector_second.reshape(-1),
+93    )
+94    return estimate_feature_importance
+
+ + + + +
+
+ +
+ + def + get_importances_estimate_pairwise(combination, args, tmp_df): + + + +
+ +
 97def get_importances_estimate_pairwise(combination, args, tmp_df):
+ 98    """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
+ 99
+100    feature_one = combination[0]
+101    feature_two = combination[1]
+102
+103    vector_first = tmp_df[[feature_one]].values.ravel()
+104    vector_second = tmp_df[[feature_two]].values.ravel()
+105
+106    if len(vector_first) == 0 or len(vector_second) == 0:
+107        return [feature_one, feature_two, 0]
+108
+109    # Compute score based on the selected heuristic.
+110    if args.heuristic == 'MI':
+111        # Compute the infoGain
+112        estimate_feature_importance = sklearn_MI(vector_first, vector_second)
+113
+114    elif 'surrogate-' in args.heuristic:
+115        estimate_feature_importance = sklearn_surrogate(
+116            vector_first, vector_second, args.heuristic,
+117        )
+118
+119    elif 'MI-numba' in args.heuristic:
+120        estimate_feature_importance = numba_mi(
+121            vector_first, vector_second, args.heuristic,
+122        )
+123
+124    elif args.heuristic == 'AMI':
+125        estimate_feature_importance = sklearn_mi_adj(
+126            vector_first, vector_second,
+127        )
+128
+129    elif args.heuristic == 'correlation-Pearson':
+130        estimate_feature_importance = pearsonr(vector_first, vector_second)[0]
+131
+132    elif args.heuristic == 'Constant':
+133        estimate_feature_importance = 0.0
+134
+135    else:
+136        raise ValueError(
+137            'Please select one of the possible heuristics (MI, chi2)',
+138        )
+139
+140    return (feature_one, feature_two, estimate_feature_importance)
+
+ + +

A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel.

+
+ + +
+
+ +
+ + def + rank_features_3MR( relevance_dict: dict[str, float], redundancy_dict: dict[tuple[typing.Any, typing.Any], typing.Any], relational_dict: dict[tuple[typing.Any, typing.Any], typing.Any], strategy: str = 'median', alpha: float = 1, beta: float = 1) -> pandas.core.frame.DataFrame: + + + +
+ +
143def rank_features_3MR(
+144    relevance_dict: dict[str, float],
+145    redundancy_dict: dict[tuple[Any, Any], Any],
+146    relational_dict: dict[tuple[Any, Any], Any],
+147    strategy: str = 'median',
+148    alpha: float = 1,
+149    beta: float = 1,
+150) -> pd.DataFrame:
+151    all_features = relevance_dict.keys()
+152    most_important_feature = max(
+153        relevance_dict.items(), key=operator.itemgetter(1),
+154    )[0]
+155    ranked_features = [most_important_feature]
+156
+157    def calc_higher_order(feature, is_redundancy=True):
+158        values = []
+159        for feat in ranked_features:
+160            if is_redundancy:
+161                values.append(redundancy_dict[(feat, feature)])
+162            else:
+163                values.append(relational_dict[(feat, feature)])
+164        if strategy == 'sum':
+165            return sum(values)
+166        if strategy == 'mean':
+167            return np.mean(values)
+168        return np.median(values)
+169
+170    while len(ranked_features) != len(all_features):
+171        top_importance = 0
+172        most_important_feature = ''
+173
+174        for ind, feat in enumerate(set(all_features) - set(ranked_features)):
+175            feature_redundancy = calc_higher_order(feat)
+176            feature_relation = calc_higher_order(feat, False)
+177            feature_relevance = relevance_dict[feat]
+178            importance = (
+179                feature_relevance - alpha * feature_redundancy + beta * feature_relation
+180            )
+181
+182            if (importance > top_importance) or (ind == 0):
+183                top_importance = importance
+184                most_important_feature = feat
+185        ranked_features.append(most_important_feature)
+186    return pd.DataFrame(
+187        {
+188            'Feature': ranked_features,
+189            '3mr_ranking': list(range(1, len(ranked_features) + 1)),
+190        },
+191    )
+
+ + + + +
+
+ +
+ + def + get_importances_estimate_nonmyopic(args: Any, tmp_df: pandas.core.frame.DataFrame): + + + +
+ +
194def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
+195    # TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
+196    # TODO - this is to be executed directly on df - no need for parallel kernel(s)
+197    pass
+
+ + + + +
+
+ + diff --git a/docs/outrank/algorithms/sketches.html b/docs/outrank/algorithms/sketches.html new file mode 100644 index 0000000..c8011bb --- /dev/null +++ b/docs/outrank/algorithms/sketches.html @@ -0,0 +1,237 @@ + + + + + + + outrank.algorithms.sketches API documentation + + + + + + + + + +
+
+

+outrank.algorithms.sketches

+ + + + + +
+
+ + diff --git a/docs/outrank/algorithms/sketches/counting_ultiloglog.html b/docs/outrank/algorithms/sketches/counting_ultiloglog.html new file mode 100644 index 0000000..d326f96 --- /dev/null +++ b/docs/outrank/algorithms/sketches/counting_ultiloglog.html @@ -0,0 +1,636 @@ + + + + + + + outrank.algorithms.sketches.counting_ultiloglog API documentation + + + + + + + + + +
+
+

+outrank.algorithms.sketches.counting_ultiloglog

+ +

This module implements a probabilistic data structure that estimates the cardinality of large multisets in a single pass using little auxiliary memory.

+
+ + + + + +
  1"""
+  2This module implements probabilistic data structure which is able to calculate the cardinality of large multisets in a single pass using little auxiliary memory
+  3"""
+  4from __future__ import annotations
+  5
+  6import numpy as np
+  7import xxhash
+  8
+  9
+ 10class HyperLogLogWCache:
+ 11    def __init__(self, error_rate=0.005):
+ 12        # int(np.ceil(np.log2((1.04 / error_rate) ** 2)))
+ 13        self.p = 19
+ 14        self.m = 1 << self.p
+ 15        self.warmup_set = set()
+ 16        self.warmup_size = int(self.m / 2)
+ 17        self.width = 64 - self.p
+ 18        self.hll_flag = False
+ 19
+ 20    def _hasher_update(self, value):
+ 21        self.hasher = xxhash.xxh32(seed=self.p)
+ 22        if isinstance(value, str):
+ 23            value = value.encode('utf-8')
+ 24            self.hasher.update(bytes(value))
+ 25        else:
+ 26            self.hasher.update(bytes(value))
+ 27
+ 28        x = self.hasher.intdigest()
+ 29        j = x & (self.m - 1)
+ 30        w = x >> self.p
+ 31
+ 32        rho = self.width - w.bit_length()
+ 33        self.M[j] = max(self.M[j], rho)
+ 34
+ 35    def add(self, value):
+ 36        if len(self.warmup_set) < self.warmup_size and not self.hll_flag:
+ 37            self.warmup_set.add(value)
+ 38        elif not self.hll_flag:
+ 39            if not self.hll_flag:
+ 40                self.M = np.zeros(self.m)
+ 41                for element in self.warmup_set:
+ 42                    self._hasher_update(element)
+ 43                self.warmup_set = {}
+ 44            self.hll_flag = True
+ 45        else:
+ 46            self._hasher_update(value)
+ 47
+ 48    def __len__(self):
+ 49        if self.hll_flag:
+ 50            basis = np.ceil(
+ 51                self.m *
+ 52                np.log(np.divide(self.m, len(np.where(self.M == 0)[0]))),
+ 53            )
+ 54            if basis != np.inf:
+ 55                return int(basis) - 1
+ 56            else:
+ 57                return 2**self.p
+ 58        else:
+ 59            return len(self.warmup_set)
+ 60
+ 61
+ 62if __name__ == '__main__':
+ 63    import random
+ 64    import string
+ 65    import time
+ 66
+ 67    import matplotlib.pyplot as plt
+ 68    import pandas as pd
+ 69    import seaborn as sns
+ 70    import tqdm
+ 71    from pympler import asizeof
+ 72
+ 73    def get_random_string(length):
+ 74        # choose from all lowercase letter
+ 75        letters = string.ascii_lowercase
+ 76        result_str = ''.join(random.choice(letters) for i in range(length))
+ 77        return result_str
+ 78
+ 79    # results_df = []
+ 80    # num_vals = 100000
+ 81    # nbits = 16
+ 82    # for _ in range(3):
+ 83    #     for j in tqdm.tqdm(range(1000000, 10000000, 1000)):
+ 84    #         ground = list(set(np.random.randint(0, j, num_vals).tolist()))
+ 85    #         ground = ground + [
+ 86    #             get_random_string(random.randint(1, 15)) for k in range(j)
+ 87    #         ]
+ 88
+ 89    #         start_time = time.time()
+ 90    #         GLOBAL_CARDINALITY_STORAGE = {}
+ 91    #         GLOBAL_CARDINALITY_STORAGE[1] = HyperLogLogWCache(0.005)
+ 92
+ 93    #         for j in ground:
+ 94    #             GLOBAL_CARDINALITY_STORAGE[1].add(j)
+ 95
+ 96    #         size1 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE)
+ 97    #         error1 = 100 * \
+ 98    #             (1 - len(GLOBAL_CARDINALITY_STORAGE[1]) / len(set(ground)))
+ 99    #         end_time = time.time()
+100    #         tp1 = end_time - start_time
+101
+102    #         import hyperloglog
+103
+104    #         start_time = time.time()
+105    #         GLOBAL_CARDINALITY_STORAGE = {}
+106    #         GLOBAL_CARDINALITY_STORAGE[1] = hyperloglog.HyperLogLog(0.005)
+107
+108    #         for j in ground:
+109    #             GLOBAL_CARDINALITY_STORAGE[1].add(j)
+110    #         size2 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE)
+111    #         error2 = 100 * \
+112    #             (1 - len(GLOBAL_CARDINALITY_STORAGE[1]) / len(set(ground)))
+113    #         end_time = time.time()
+114    #         tp2 = end_time - start_time
+115
+116    #         start_time = time.time()
+117    #         GLOBAL_CARDINALITY_STORAGE = set()
+118
+119    #         for j in ground:
+120    #             GLOBAL_CARDINALITY_STORAGE.add(j)
+121
+122    #         size3 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE)
+123    #         error3 = 100 * \
+124    #             (1 - len(GLOBAL_CARDINALITY_STORAGE) / len(set(ground)))
+125    #         end_time = time.time()
+126    #         tp3 = end_time - start_time
+127
+128    #         results_df.append(
+129    #             {
+130    #                 'num_samples': len(ground),
+131    #                 'time': tp3,
+132    #                 'algo': 'set',
+133    #                 'error': error3,
+134    #             },
+135    #         )
+136    #         results_df.append(
+137    #             {
+138    #                 'num_samples': len(ground),
+139    #                 'time': tp2,
+140    #                 'algo': 'default',
+141    #                 'error': error2,
+142    #             },
+143    #         )
+144    #         results_df.append(
+145    #             {
+146    #                 'num_samples': len(ground),
+147    #                 'time': tp1,
+148    #                 'algo': f'hllc ({nbits}, mixed)',
+149    #                 'error': error1,
+150    #             },
+151    #         )
+152
+153    # out_df = pd.DataFrame(results_df)
+154    # out_df.to_csv('backup.csv')
+155    # print(out_df)
+156    # print(out_df.groupby('algo').mean())
+157    # sns.lineplot(
+158    #     x=out_df.num_samples, y=out_df.error,
+159    #     hue=out_df.algo, alpha=0.5,
+160    # )
+161    # plt.tight_layout()
+162    # plt.ylabel('Num. of unique values in data')
+163    # plt.ylabel('Abs error')
+164    # plt.savefig('linep.pdf')
+165    # plt.clf()
+166    # plt.cla()
+167
+168    # sns.lineplot(
+169    #     x=out_df.num_samples.astype(
+170    #         float,
+171    #     ), y=out_df.time, hue=out_df.algo,
+172    # )
+173    # plt.tight_layout()
+174    # plt.ylabel('Time (s)')
+175    # plt.savefig('barp.pdf')
+176    # plt.clf()
+177    # plt.cla()
+
+ + +
+
+ +
+ + class + HyperLogLogWCache: + + + +
+ +
11class HyperLogLogWCache:
+12    def __init__(self, error_rate=0.005):
+13        # int(np.ceil(np.log2((1.04 / error_rate) ** 2)))
+14        self.p = 19
+15        self.m = 1 << self.p
+16        self.warmup_set = set()
+17        self.warmup_size = int(self.m / 2)
+18        self.width = 64 - self.p
+19        self.hll_flag = False
+20
+21    def _hasher_update(self, value):
+22        self.hasher = xxhash.xxh32(seed=self.p)
+23        if isinstance(value, str):
+24            value = value.encode('utf-8')
+25            self.hasher.update(bytes(value))
+26        else:
+27            self.hasher.update(bytes(value))
+28
+29        x = self.hasher.intdigest()
+30        j = x & (self.m - 1)
+31        w = x >> self.p
+32
+33        rho = self.width - w.bit_length()
+34        self.M[j] = max(self.M[j], rho)
+35
+36    def add(self, value):
+37        if len(self.warmup_set) < self.warmup_size and not self.hll_flag:
+38            self.warmup_set.add(value)
+39        elif not self.hll_flag:
+40            if not self.hll_flag:
+41                self.M = np.zeros(self.m)
+42                for element in self.warmup_set:
+43                    self._hasher_update(element)
+44                self.warmup_set = {}
+45            self.hll_flag = True
+46        else:
+47            self._hasher_update(value)
+48
+49    def __len__(self):
+50        if self.hll_flag:
+51            basis = np.ceil(
+52                self.m *
+53                np.log(np.divide(self.m, len(np.where(self.M == 0)[0]))),
+54            )
+55            if basis != np.inf:
+56                return int(basis) - 1
+57            else:
+58                return 2**self.p
+59        else:
+60            return len(self.warmup_set)
+
+ + + + +
+ +
+ + HyperLogLogWCache(error_rate=0.005) + + + +
+ +
12    def __init__(self, error_rate=0.005):
+13        # int(np.ceil(np.log2((1.04 / error_rate) ** 2)))
+14        self.p = 19
+15        self.m = 1 << self.p
+16        self.warmup_set = set()
+17        self.warmup_size = int(self.m / 2)
+18        self.width = 64 - self.p
+19        self.hll_flag = False
+
+ + + + +
+
+
+ p + + +
+ + + + +
+
+
+ m + + +
+ + + + +
+
+
+ warmup_set + + +
+ + + + +
+
+
+ warmup_size + + +
+ + + + +
+
+
+ width + + +
+ + + + +
+
+
+ hll_flag + + +
+ + + + +
+
+ +
+ + def + add(self, value): + + + +
+ +
36    def add(self, value):
+37        if len(self.warmup_set) < self.warmup_size and not self.hll_flag:
+38            self.warmup_set.add(value)
+39        elif not self.hll_flag:
+40            if not self.hll_flag:
+41                self.M = np.zeros(self.m)
+42                for element in self.warmup_set:
+43                    self._hasher_update(element)
+44                self.warmup_set = {}
+45            self.hll_flag = True
+46        else:
+47            self._hasher_update(value)
+
+ + + + +
+
+
+ + diff --git a/docs/outrank/algorithms/synthetic_data_generators.html b/docs/outrank/algorithms/synthetic_data_generators.html new file mode 100644 index 0000000..0a2063c --- /dev/null +++ b/docs/outrank/algorithms/synthetic_data_generators.html @@ -0,0 +1,237 @@ + + + + + + + outrank.algorithms.synthetic_data_generators API documentation + + + + + + + + + +
+
+

+outrank.algorithms.synthetic_data_generators

+ + + + + +
+
+ + diff --git a/docs/outrank/algorithms/synthetic_data_generators/generator_naive.html b/docs/outrank/algorithms/synthetic_data_generators/generator_naive.html new file mode 100644 index 0000000..cf1042c --- /dev/null +++ b/docs/outrank/algorithms/synthetic_data_generators/generator_naive.html @@ -0,0 +1,342 @@ + + + + + + + outrank.algorithms.synthetic_data_generators.generator_naive API documentation + + + + + + + + + +
+
+

+outrank.algorithms.synthetic_data_generators.generator_naive

+ + + + + + +
 1# This simplest thing we can do for now.
+ 2from __future__ import annotations
+ 3
+ 4import numpy as np
+ 5
+ 6np.random.seed(123)
+ 7
+ 8
+ 9def generate_random_matrix(num_features=100, size=20000):
+10    # random int matrix (categorical)
+11    sample = np.random.randint(10, 100, size=(size, num_features))
+12
+13    target = sample[:, 30]
+14    # Some noise
+15
+16    target[target < 20] = 0
+17    return sample, target
+18
+19
+20if __name__ == '__main__':
+21    import argparse
+22    import logging
+23    import os
+24    import shutil
+25
+26    import pandas as pd
+27
+28    logging.basicConfig(
+29        format='%(asctime)s - %(message)s',
+30        datefmt='%d-%b-%y %H:%M:%S',
+31    )
+32    logger = logging.getLogger('syn-logger')
+33    logger.setLevel(logging.DEBUG)
+34
+35    parser = argparse.ArgumentParser(
+36        description='Fast feature screening for sparse data sets.',
+37        formatter_class=argparse.RawTextHelpFormatter,
+38    )
+39
+40    parser.add_argument('--output_df_name', type=str, default=None)
+41
+42    parser.add_argument('--verify_outputs', type=str, default=None)
+43
+44    parser.add_argument('--num_features', type=int, default=300)
+45
+46    parser.add_argument('--size', type=int, default=1000)
+47
+48    args = parser.parse_args()
+49
+50    if args.output_df_name is not None:
+51        sample, target = generate_random_matrix(args.num_features, args.size)
+52        dfx = pd.DataFrame(sample)
+53        dfx.columns = [f'f{x}' for x in range(dfx.shape[1])]
+54        dfx['label'] = target
+55        if os.path.exists(args.output_df_name) and os.path.isdir(args.output_df_name):
+56            shutil.rmtree(args.output_df_name)
+57        os.mkdir(args.output_df_name)
+58        dfx.to_csv(f'./{args.output_df_name}/data.csv', index=False)
+59
+60        logging.info(f'Generated dataset {dfx.shape} in {args.output_df_name}')
+61    elif args.verify_outputs is not None:
+62        rankings = pd.read_csv(
+63            os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t',
+64        )
+65        if rankings.iloc[1]['Feature'] != 'f30-(81; 100)':
+66            raise Exception(
+67                f'Could not retrieve the appropriate feature needle in the haystack {rankings.iloc[1].Feature}, exiting',
+68            )
+69        else:
+70            logger.info(
+71                f'Identified the appropriate feature in the haystack ({rankings.iloc[1].Feature})',
+72            )
+
+ + +
+
+ +
+ + def + generate_random_matrix(num_features=100, size=20000): + + + +
+ +
10def generate_random_matrix(num_features=100, size=20000):
+11    # random int matrix (categorical)
+12    sample = np.random.randint(10, 100, size=(size, num_features))
+13
+14    target = sample[:, 30]
+15    # Some noise
+16
+17    target[target < 20] = 0
+18    return sample, target
+
+ + + + +
+
+ + diff --git a/docs/outrank/core_ranking.html b/docs/outrank/core_ranking.html new file mode 100644 index 0000000..230891d --- /dev/null +++ b/docs/outrank/core_ranking.html @@ -0,0 +1,2078 @@ + + + + + + + outrank.core_ranking API documentation + + + + + + + + + +
+
+

+outrank.core_ranking

+ + + + + + +
  1from __future__ import annotations
+  2
+  3import gzip
+  4import itertools
+  5import logging
+  6import os
+  7import random
+  8import time
+  9from collections import Counter
+ 10from collections import defaultdict
+ 11from collections import deque
+ 12from timeit import default_timer as timer
+ 13from typing import Any
+ 14from typing import Dict
+ 15from typing import List
+ 16from typing import Set
+ 17from typing import Tuple
+ 18from typing import Union
+ 19
+ 20import numpy as np
+ 21import pandas as pd
+ 22import tqdm
+ 23
+ 24from outrank.algorithms.importance_estimator import get_importances_estimate_pairwise
+ 25from outrank.algorithms.sketches.counting_ultiloglog import (
+ 26    HyperLogLogWCache as HyperLogLog,
+ 27)
+ 28from outrank.core_utils import BatchRankingSummary
+ 29from outrank.core_utils import extract_features_from_reference_JSON
+ 30from outrank.core_utils import generic_line_parser
+ 31from outrank.core_utils import internal_hash
+ 32from outrank.core_utils import NominalFeatureSummary
+ 33from outrank.core_utils import NumericFeatureSummary
+ 34from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric
+ 35from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise
+ 36
+ 37logger = logging.getLogger('syn-logger')
+ 38logger.setLevel(logging.DEBUG)
+ 39random.seed(a=123, version=2)
+ 40GLOBAL_CARDINALITY_STORAGE: dict[Any, Any] = dict()
+ 41GLOBAL_RARE_VALUE_STORAGE: dict[str, Any] = Counter()
+ 42
+ 43IGNORED_VALUES = set()
+ 44HYPERLL_ERROR_BOUND = 0.02
+ 45
+ 46
+ 47def encode_int_column(input_tuple: tuple[str, Any]) -> tuple[Any, list[int]]:
+ 48    """Encode column values as categoric (at a batch level!)"""
+ 49
+ 50    hashes, _ = pd.factorize(input_tuple[1])
+ 51    return input_tuple[0], hashes
+ 52
+ 53
+ 54def mixed_rank_graph(
+ 55    input_dataframe: pd.DataFrame, args: Any, cpu_pool: Any, pbar: Any,
+ 56) -> BatchRankingSummary:
+ 57    """Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic"""
+ 58
+ 59    all_columns = input_dataframe.columns
+ 60
+ 61    triplets = []
+ 62    tmp_df = input_dataframe.copy()
+ 63    out_time_struct = {}
+ 64
+ 65    # Handle cont. types prior to interaction evaluation
+ 66    pbar.set_description('Encoding columns')
+ 67    jobs = [(cname, tmp_df[cname]) for cname in all_columns]
+ 68    col_dots = '.'
+ 69    start_enc_timer = timer()
+ 70    with cpu_pool as p:
+ 71        results = p.amap(encode_int_column, jobs)
+ 72        while not results.ready():
+ 73            time.sleep(4)
+ 74            col_dots = col_dots + '.'
+ 75            pbar.set_description(f'Encoding columns .{col_dots}')
+ 76        tmp_df = pd.DataFrame({k: v for k, v in results.get()})
+ 77    end_enc_timer = timer()
+ 78    out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer
+ 79
+ 80    # Helper method for parallel estimation
+ 81    combinations = list(
+ 82        itertools.combinations_with_replacement(all_columns, 2),
+ 83    )
+ 84
+ 85    if '3mr' in args.heuristic:
+ 86        rel_columns = [
+ 87            column for column in all_columns if ' AND_REL ' in column
+ 88        ]
+ 89        non_rel_columns = list(set(all_columns) - set(rel_columns))
+ 90        combinations = list(
+ 91            itertools.combinations_with_replacement(non_rel_columns, 2),
+ 92        )
+ 93        combinations += [(column, args.label_column) for column in rel_columns]
+ 94    else:
+ 95        combinations = list(
+ 96            itertools.combinations_with_replacement(all_columns, 2),
+ 97        )
+ 98
+ 99    # Diagonal elements
+100    for individual_column in all_columns:
+101        if individual_column != args.label_column:
+102            combinations += [(individual_column, individual_column)]
+103
+104    # Some applications do not require the full feature-feature triangular matrix
+105    if (args.target_ranking_only == 'True') and ('3mr' not in args.heuristic):
+106        combinations = [x for x in combinations if args.label_column in x]
+107
+108    random.shuffle(combinations)
+109    combinations = combinations[: args.combination_number_upper_bound]
+110
+111    if args.heuristic == 'Constant':
+112        final_constant_imp = []
+113        for c1, c2 in combinations:
+114            final_constant_imp.append((c1, c2, 0.0))
+115
+116        out_time_struct['feature_score_computation'] = end_enc_timer - \
+117            start_enc_timer
+118        return BatchRankingSummary(final_constant_imp, out_time_struct)
+119
+120    # Map the scoring calls to the worker pool
+121    pbar.set_description('Allocating thread pool')
+122
+123    # starmap is an alternative that is slower unfortunately (but nicer)
+124    def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
+125        return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
+126
+127    start_enc_timer = timer()
+128    with cpu_pool as p:
+129        pbar.set_description(f'Computing (#ftr={len(combinations)})')
+130        results = p.amap(get_grounded_importances_estimate, combinations)
+131        while not results.ready():
+132            time.sleep(4)
+133        triplets = results.get()
+134    end_enc_timer = timer()
+135    out_time_struct['feature_score_computation'] = end_enc_timer - \
+136        start_enc_timer
+137
+138    # Gather the final triplets
+139    pbar.set_description('Aggregation of ranking results')
+140    final_triplets = []
+141    for triplet in triplets:
+142        inv = (triplet[1], triplet[0], triplet[2])
+143        final_triplets.append(inv)
+144        final_triplets.append(triplet)
+145        triplets = final_triplets
+146
+147    pbar.set_description('Proceeding to the next batch of data')
+148    return BatchRankingSummary(triplets, out_time_struct)
+149
+150
+151def enrich_with_transformations(
+152    input_dataframe: pd.DataFrame, num_col_types: set[str], logger: Any, args: Any,
+153) -> pd.DataFrame:
+154    """Construct a collection of new features based on pre-defined transformations/rules"""
+155
+156    transformer = FeatureTransformerGeneric(
+157        num_col_types, preset=args.transformers,
+158    )
+159    transformed_df = transformer.construct_new_features(input_dataframe)
+160    logger.info(
+161        f'Constructed {len(transformer.constructed_feature_names)} new features ..',
+162    )
+163
+164    return transformed_df
+165
+166
+167def compute_combined_features(
+168    input_dataframe: pd.DataFrame,
+169    logger: Any,
+170    args: Any,
+171    pbar: Any,
+172    is_3mr: bool = False,
+173) -> pd.DataFrame:
+174    """Compute higher order features via xxhash-based trick."""
+175
+176    all_columns = [
+177        x for x in input_dataframe.columns if x != args.label_column
+178    ]
+179    join_string = ' AND_REL ' if is_3mr else ' AND '
+180    interaction_order = 2 if is_3mr else args.interaction_order
+181
+182    full_combination_space = list(
+183        itertools.combinations(all_columns, interaction_order),
+184    )
+185
+186    if args.combination_number_upper_bound:
+187        random.shuffle(full_combination_space)
+188        full_combination_space = full_combination_space[
+189            : args.combination_number_upper_bound
+190        ]
+191
+192    com_counter = 0
+193    new_feature_hash = {}
+194    for new_combination in full_combination_space:
+195        pbar.set_description(
+196            f'Created {com_counter}/{len(full_combination_space)}',
+197        )
+198        combined_feature: list[str] = [str(0)] * input_dataframe.shape[0]
+199        for feature in new_combination:
+200            tmp_feature = input_dataframe[feature].tolist()
+201            for enx, el in enumerate(tmp_feature):
+202                combined_feature[enx] = str(
+203                    internal_hash(
+204                        str(combined_feature[enx]) + str(el),
+205                    ),
+206                )
+207        ftr_name = join_string.join(str(x) for x in new_combination)
+208        new_feature_hash[ftr_name] = combined_feature
+209        com_counter += 1
+210    tmp_df = pd.DataFrame(new_feature_hash)
+211    pbar.set_description('Concatenating into final frame ..')
+212    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
+213    del tmp_df
+214
+215    return input_dataframe
+216
+217
+218def compute_expanded_multivalue_features(
+219    input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any,
+220) -> pd.DataFrame:
+221    """Compute one-hot encoded feature space based on each designated multivalue feature. E.g., feature with value "a,b,c" becomes three features, values of which are presence of a given value in a mutlivalue feature of choice."""
+222
+223    considered_multivalue_features = args.explode_multivalue_features.split(
+224        ';',
+225    )
+226    new_feature_hash = {}
+227    missing_symbols = set(args.missing_value_symbols.split(','))
+228
+229    for multivalue_feature in considered_multivalue_features:
+230        multivalue_feature_vector = input_dataframe[multivalue_feature].values.tolist(
+231        )
+232        multivalue_feature_vector = [
+233            x.replace(',', '-') for x in multivalue_feature_vector
+234        ]
+235        multivalue_sets = [
+236            set(x.split('-'))
+237            for x in multivalue_feature_vector
+238        ]
+239        unique_values = set.union(*multivalue_sets)
+240
+241        for missing_symbol in missing_symbols:
+242            if missing_symbol in unique_values:
+243                unique_values.remove(missing_symbol)
+244
+245        for unique_value in unique_values:
+246            tmp_vec = []
+247            for enx, multivalue in enumerate(multivalue_sets):
+248                if unique_value in multivalue:
+249                    tmp_vec.append('1')
+250                else:
+251                    tmp_vec.append('')
+252
+253            new_feature_hash[f'MULTIEX-{multivalue_feature}-{unique_value}'] = tmp_vec
+254
+255    tmp_df = pd.DataFrame(new_feature_hash)
+256    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
+257    del tmp_df
+258
+259    return input_dataframe
+260
+261
+262def compute_subfeatures(
+263    input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any,
+264) -> pd.DataFrame:
+265    """Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.
+266    ->: One sided construction - every value from left side is fine, separate ones from the right side feature will be considered.
+267    <->: Two sided construction - two-sided values present. This means that each value from a is combined with each from b, forming |A|*|B| new features (one-hot encoded)
+268    """
+269
+270    all_subfeature_pair_seeds = args.subfeature_mapping.split(';')
+271    new_feature_hash = dict()
+272
+273    for seed_pair in all_subfeature_pair_seeds:
+274        if '<->' in seed_pair:
+275            feature_first, feature_second = seed_pair.split('<->')
+276
+277        elif '->' in seed_pair:
+278            feature_first, feature_second = seed_pair.split('->')
+279
+280        else:
+281            raise NotImplementedError(
+282                'Please specify valid subfeature operator (<-> or ->)',
+283            )
+284
+285        subframe = input_dataframe[[feature_first, feature_second]]
+286        unique_feature_second = subframe[feature_second].unique()
+287        feature_first_vec = subframe[feature_first].tolist()
+288        feature_second_vec = subframe[feature_second].tolist()
+289        out_template_feature = [
+290            (a, b) for a, b in zip(feature_first_vec, feature_second_vec)
+291        ]
+292
+293        if '<->' in seed_pair:
+294            unique_feature_first = subframe[feature_first].unique()
+295
+296            mask_types = []
+297            for unique_target_feature_value in unique_feature_second:
+298                for unique_seed_feature_value in unique_feature_first:
+299                    mask_types.append(
+300                        (unique_seed_feature_value, unique_target_feature_value),
+301                    )
+302
+303            for mask_type in mask_types:
+304                new_feature = []
+305                for value_tuple in out_template_feature:
+306                    if (
+307                        value_tuple[0] == mask_type[0]
+308                        and value_tuple[1] == mask_type[1]
+309                    ):
+310                        new_feature.append(str(1))
+311                    else:
+312                        new_feature.append(str(0))
+313                feature_name = (
+314                    f'SUBFEATURE|{feature_first}|{feature_second}-'
+315                    + mask_type[0]
+316                    + '&'
+317                    + mask_type[1]
+318                )
+319                new_feature_hash[feature_name] = new_feature
+320
+321            del new_feature
+322
+323        elif '->' in seed_pair:
+324            for unique_target_feature_value in unique_feature_second:
+325                tmp_new_feature = [
+326                    'AND'.join(
+327                        x,
+328                    ) if x[1] == unique_target_feature_value else ''
+329                    for x in out_template_feature
+330                ]
+331                feature_name_final = (
+332                    'SUBFEATURE-' + feature_first + '&' + unique_target_feature_value
+333                )
+334                new_feature_hash[feature_name_final] = tmp_new_feature
+335
+336    tmp_df = pd.DataFrame(new_feature_hash)
+337    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
+338
+339    del tmp_df
+340    return input_dataframe
+341
+342
+343def include_noisy_features(
+344    input_dataframe: pd.DataFrame, logger: Any, args: Any,
+345) -> pd.DataFrame:
+346    """Add randomized features that serve as a sanity check"""
+347
+348    transformer = FeatureTransformerNoise()
+349    transformed_df = transformer.construct_new_features(
+350        input_dataframe, args.label_column,
+351    )
+352
+353    return transformed_df
+354
+355
+356def compute_coverage(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]:
+357    """Compute coverage of features, incrementally"""
+358    output_storage_cov = defaultdict(set)
+359    all_missing_symbols = set(args.missing_value_symbols.split(','))
+360    for column in input_dataframe:
+361        all_missing = sum(
+362            [
+363                input_dataframe[column].values.tolist().count(x)
+364                for x in all_missing_symbols
+365            ],
+366        )
+367
+368        output_storage_cov[column] = (
+369            1 - (all_missing / input_dataframe.shape[0])
+370        ) * 100
+371
+372    return output_storage_cov
+373
+374
+375def compute_feature_memory_consumption(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]:
+376    """An approximation of how much feature take up"""
+377    output_storage_features = defaultdict(set)
+378    for col in input_dataframe.columns:
+379        specific_column = [
+380            str(x).strip() for x in input_dataframe[col].astype(str).values.tolist()
+381        ]
+382        col_size = sum(
+383            len(x.encode())
+384            for x in specific_column
+385        ) / input_dataframe.shape[0]
+386        output_storage_features[col] = col_size
+387    return output_storage_features
+388
+389
+390def compute_value_counts(input_dataframe: pd.DataFrame, args: Any):
+391    """Update the count structure"""
+392
+393    global GLOBAL_RARE_VALUE_STORAGE
+394    global IGNORED_VALUES
+395
+396    for column in input_dataframe.columns:
+397        main_values = input_dataframe[column].values
+398        for value in main_values:
+399            if value not in IGNORED_VALUES:
+400                GLOBAL_RARE_VALUE_STORAGE.update({(column, value): 1})
+401
+402    for key, val in GLOBAL_RARE_VALUE_STORAGE.items():
+403        if val > args.rare_value_count_upper_bound:
+404            IGNORED_VALUES.add(key)
+405
+406    for to_remove_val in IGNORED_VALUES:
+407        del GLOBAL_RARE_VALUE_STORAGE[to_remove_val]
+408
+409
+410def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any) -> None:
+411    """Compute cardinalities of features, incrementally"""
+412
+413    global GLOBAL_CARDINALITY_STORAGE
+414    output_storage_card = defaultdict(set)
+415    for enx, column in enumerate(input_dataframe):
+416        output_storage_card[column] = set(input_dataframe[column].unique())
+417        if column not in GLOBAL_CARDINALITY_STORAGE:
+418            GLOBAL_CARDINALITY_STORAGE[column] = HyperLogLog(
+419                HYPERLL_ERROR_BOUND,
+420            )
+421
+422        for unique_value in set(input_dataframe[column].unique()):
+423            if unique_value:
+424                GLOBAL_CARDINALITY_STORAGE[column].add(
+425                    internal_hash(unique_value),
+426                )
+427        pbar.set_description(
+428            f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}',
+429        )
+430
+431
+432def compute_bounds_increment(
+433    input_dataframe: pd.DataFrame, numeric_column_types: set[str],
+434) -> dict[str, Any]:
+435    all_features = input_dataframe.columns
+436    numeric_column_types = set(numeric_column_types)
+437    summary_object = {}
+438    summary_storage: Any = {}
+439    for feature in all_features:
+440        if feature in numeric_column_types:
+441            feature_vector = pd.to_numeric(
+442                input_dataframe[feature], errors='coerce',
+443            )
+444            minimum = np.min(feature_vector)
+445            maximum = np.max(feature_vector)
+446            mean = np.mean(feature_vector)
+447            summary_storage = NumericFeatureSummary(
+448                feature, minimum, maximum, mean, len(
+449                    np.unique(feature_vector),
+450                ),
+451            )
+452            summary_object[feature] = summary_storage
+453
+454        else:
+455            feature_vector = input_dataframe[feature].values
+456            summary_storage = NominalFeatureSummary(
+457                feature, len(np.unique(feature_vector)),
+458            )
+459            summary_object[feature] = summary_storage
+460
+461    return summary_object
+462
+463
+464def compute_batch_ranking(
+465    line_tmp_storage: list[list[Any]],
+466    numeric_column_types: set[str],
+467    args: Any,
+468    cpu_pool: Any,
+469    column_descriptions: list[str],
+470    logger: Any,
+471    pbar: Any,
+472) -> tuple[BatchRankingSummary, dict[str, Any], dict[str, set[str]], dict[str, set[str]]]:
+473    """Enrich the feature space and compute the batch importances"""
+474
+475    input_dataframe = pd.DataFrame(line_tmp_storage)
+476    input_dataframe.columns = column_descriptions
+477    pbar.set_description('Control features')
+478
+479    if args.feature_set_focus:
+480        if args.feature_set_focus == '_all_from_reference_JSON':
+481            focus_set = extract_features_from_reference_JSON(
+482                args.reference_model_JSON,
+483            )
+484
+485        else:
+486            focus_set = set(args.feature_set_focus.split(','))
+487
+488        focus_set.add(args.label_column)
+489        focus_set = {x for x in focus_set if x in input_dataframe.columns}
+490        input_dataframe = input_dataframe[focus_set]
+491
+492    if args.transformers != 'none':
+493        pbar.set_description('Adding transformations')
+494        input_dataframe = enrich_with_transformations(
+495            input_dataframe, numeric_column_types, logger, args,
+496        )
+497
+498    if args.explode_multivalue_features != 'False':
+499        pbar.set_description('Constructing new features from multivalue ones')
+500        input_dataframe = compute_expanded_multivalue_features(
+501            input_dataframe, logger, args, pbar,
+502        )
+503
+504    if args.subfeature_mapping != 'False':
+505        pbar.set_description('Constructing new (sub)features')
+506        input_dataframe = compute_subfeatures(
+507            input_dataframe, logger, args, pbar,
+508        )
+509
+510    if args.interaction_order > 1:
+511        pbar.set_description('Constructing new features')
+512        input_dataframe = compute_combined_features(
+513            input_dataframe, logger, args, pbar,
+514        )
+515
+516    # in case of 3mr we compute the score of combinations against the target
+517    if '3mr' in args.heuristic:
+518        pbar.set_description(
+519            'Constructing features for computing relations in 3mr',
+520        )
+521        input_dataframe = compute_combined_features(
+522            input_dataframe, logger, args, pbar, True,
+523        )
+524
+525    if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
+526        pbar.set_description('Computing baseline features')
+527        input_dataframe = include_noisy_features(input_dataframe, logger, args)
+528
+529    # Compute incremental statistic useful for data inspection/transformer generation
+530    pbar.set_description('Computing coverage')
+531    coverage_storage = compute_coverage(input_dataframe, args)
+532    feature_memory_consumption = compute_feature_memory_consumption(
+533        input_dataframe, args,
+534    )
+535    compute_cardinalities(input_dataframe, pbar)
+536
+537    if args.task == 'identify_rare_values':
+538        compute_value_counts(input_dataframe, args)
+539
+540    bounds_storage = compute_bounds_increment(
+541        input_dataframe, numeric_column_types,
+542    )
+543
+544    pbar.set_description(
+545        f'Computing ranks for {input_dataframe.shape[1]} features',
+546    )
+547
+548    return (
+549        mixed_rank_graph(input_dataframe, args, cpu_pool, pbar),
+550        bounds_storage,
+551        coverage_storage,
+552        feature_memory_consumption,
+553    )
+554
+555
+556def get_num_of_instances(fname: str) -> int:
+557    """Count the number of lines in a file, fast - useful for progress logging"""
+558
+559    def _make_gen(reader):
+560        while True:
+561            b = reader(2**16)
+562            if not b:
+563                break
+564            yield b
+565
+566    with open(fname, 'rb') as f:
+567        count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))
+568    return count
+569
+570
+571def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> pd.DataFrame:
+572    """A helper method that enables median-based aggregation after processing"""
+573
+574    importances_df = pd.DataFrame(importances_df_list)
+575    if len(importances_df) == 0:
+576        return None
+577    importances_df.columns = ['FeatureA', 'FeatureB', 'Score']
+578    grouped = importances_df.groupby(
+579        ['FeatureA', 'FeatureB'],
+580    ).median().reset_index()
+581    return grouped
+582
+583
+584def checkpoint_importances_df(importances_batch: list[tuple[str, str, float]]) -> None:
+585    """A helper which stores intermediary state - useful for longer runs"""
+586
+587    gdf = get_grouped_df(importances_batch)
+588    if gdf is not None:
+589        gdf.to_csv('ranking_checkpoint_tmp.tsv', sep='\t')
+590
+591
+592def estimate_importances_minibatches(
+593    input_file: str,
+594    column_descriptions: list,
+595    fw_col_mapping: dict[str, str],
+596    numeric_column_types: set,
+597    batch_size: int = 100000,
+598    args: Any = None,
+599    data_encoding: str = 'utf-8',
+600    cpu_pool: Any = None,
+601    delimiter: str = '\t',
+602    feature_construction_mode: bool = False,
+603    logger: Any = None,
+604) -> tuple[list[dict[str, Any]], Any, dict[Any, Any], list[dict[str, Any]], list[dict[str, set[str]]], defaultdict[str, list[set[str]]], dict[str, Any]]:
+605    """Interaction score estimator - suitable for example for csv-like input data types.
+606    This type of data is normally a single large csv, meaning that minibatch processing needs to
+607    happen during incremental handling of the file (that"s not the case for pre-separated ob data)
+608    """
+609
+610    invalid_line_queue: Any = deque([], maxlen=2**5)
+611
+612    invalid_lines = 0
+613    line_counter = 0
+614
+615    importances_df: list[Any] = []
+616    line_tmp_storage = []
+617    bounds_storage_batch = []
+618    memory_storage_batch = []
+619    step_timing_checkpoints = []
+620
+621    local_coverage_object = defaultdict(list)
+622    local_pbar = tqdm.tqdm(
+623        total=get_num_of_instances(input_file) - 1, position=0,
+624    )
+625
+626    file_name, file_extension = os.path.splitext(input_file)
+627
+628    if file_extension == '.gz':
+629        file_stream = gzip.open(input_file, 'rt', encoding=data_encoding)
+630
+631    else:
+632        file_stream = open(input_file, encoding=data_encoding)
+633
+634    file_stream.readline()
+635
+636    local_pbar.set_description('Starting ranking computation')
+637    for line in file_stream:
+638        line_counter += 1
+639        local_pbar.update(1)
+640
+641        if line_counter % args.subsampling != 0:
+642            continue
+643
+644        parsed_line = generic_line_parser(
+645            line, delimiter, args, fw_col_mapping, column_descriptions,
+646        )
+647
+648        if len(parsed_line) == len(column_descriptions):
+649            line_tmp_storage.append(parsed_line)
+650
+651        else:
+652            invalid_line_queue.appendleft(str(parsed_line))
+653            invalid_lines += 1
+654
+655        # Batches need to be processed on-the-fly
+656        if len(line_tmp_storage) >= args.minibatch_size:
+657
+658            importances_batch, bounds_storage, coverage_storage, memory_storage = compute_batch_ranking(
+659                line_tmp_storage,
+660                numeric_column_types,
+661                args,
+662                cpu_pool,
+663                column_descriptions,
+664                logger,
+665                local_pbar,
+666            )
+667
+668            bounds_storage_batch.append(bounds_storage)
+669            memory_storage_batch.append(memory_storage)
+670            for k, v in coverage_storage.items():
+671                local_coverage_object[k].append(v)
+672
+673            del coverage_storage
+674
+675            line_tmp_storage = []
+676            step_timing_checkpoints.append(importances_batch.step_times)
+677            importances_df += importances_batch.triplet_scores
+678
+679            if args.heuristic != 'Constant':
+680                local_pbar.set_description('Creating checkpoint')
+681                checkpoint_importances_df(importances_df)
+682
+683    file_stream.close()
+684
+685    local_pbar.set_description('Parsing the remainder')
+686    if invalid_lines > 0:
+687        logger.info(
+688            f"Detected {invalid_lines} invalid lines. If this number is very high, it's possible your header is off - re-check your data/attribute-feature mappings please!",
+689        )
+690
+691        invalid_lines_log = '\n INVALID_LINE ====> '.join(
+692            list(invalid_line_queue)[0:5],
+693        )
+694        logger.info(
+695            f'5 samples of invalid lines are printed below\n {invalid_lines_log}',
+696        )
+697
+698    remaining_batch_size = len(line_tmp_storage)
+699
+700    if remaining_batch_size > 2**10:
+701        line_tmp_storage = line_tmp_storage[: args.minibatch_size]
+702        importances_batch, bounds_storage, coverage_storage, _ = compute_batch_ranking(
+703            line_tmp_storage,
+704            numeric_column_types,
+705            args,
+706            cpu_pool,
+707            column_descriptions,
+708            logger,
+709            local_pbar,
+710        )
+711
+712        for k, v in coverage_storage.items():
+713            local_coverage_object[k].append(v)
+714
+715        step_timing_checkpoints.append(importances_batch.step_times)
+716        importances_df += importances_batch.triplet_scores
+717        bounds_storage = dict()
+718        bounds_storage_batch.append(bounds_storage)
+719        checkpoint_importances_df(importances_df)
+720
+721    local_pbar.set_description('Wrapping up')
+722    local_pbar.close()
+723
+724    return (
+725        step_timing_checkpoints,
+726        get_grouped_df(importances_df),
+727        GLOBAL_CARDINALITY_STORAGE,
+728        bounds_storage_batch,
+729        memory_storage_batch,
+730        local_coverage_object,
+731        GLOBAL_RARE_VALUE_STORAGE,
+732    )
+
+ + +
+
+
+ logger = +<Logger syn-logger (DEBUG)> + + +
+ + + + +
+
+
+ GLOBAL_CARDINALITY_STORAGE: dict[typing.Any, typing.Any] = +{} + + +
+ + + + +
+
+
+ GLOBAL_RARE_VALUE_STORAGE: dict[str, typing.Any] = +Counter() + + +
+ + + + +
+
+
+ IGNORED_VALUES = +set() + + +
+ + + + +
+
+
+ HYPERLL_ERROR_BOUND = +0.02 + + +
+ + + + +
+
+ +
+ + def + encode_int_column(input_tuple: tuple[str, typing.Any]) -> tuple[typing.Any, list[int]]: + + + +
+ +
48def encode_int_column(input_tuple: tuple[str, Any]) -> tuple[Any, list[int]]:
+49    """Encode column values as categoric (at a batch level!)"""
+50
+51    hashes, _ = pd.factorize(input_tuple[1])
+52    return input_tuple[0], hashes
+
+ + +

Encode column values as categoric (at a batch level!)

+
+ + +
+
+ +
+ + def + mixed_rank_graph( input_dataframe: pandas.core.frame.DataFrame, args: Any, cpu_pool: Any, pbar: Any) -> outrank.core_utils.BatchRankingSummary: + + + +
+ +
 55def mixed_rank_graph(
+ 56    input_dataframe: pd.DataFrame, args: Any, cpu_pool: Any, pbar: Any,
+ 57) -> BatchRankingSummary:
+ 58    """Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic"""
+ 59
+ 60    all_columns = input_dataframe.columns
+ 61
+ 62    triplets = []
+ 63    tmp_df = input_dataframe.copy()
+ 64    out_time_struct = {}
+ 65
+ 66    # Handle cont. types prior to interaction evaluation
+ 67    pbar.set_description('Encoding columns')
+ 68    jobs = [(cname, tmp_df[cname]) for cname in all_columns]
+ 69    col_dots = '.'
+ 70    start_enc_timer = timer()
+ 71    with cpu_pool as p:
+ 72        results = p.amap(encode_int_column, jobs)
+ 73        while not results.ready():
+ 74            time.sleep(4)
+ 75            col_dots = col_dots + '.'
+ 76            pbar.set_description(f'Encoding columns .{col_dots}')
+ 77        tmp_df = pd.DataFrame({k: v for k, v in results.get()})
+ 78    end_enc_timer = timer()
+ 79    out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer
+ 80
+ 81    # Helper method for parallel estimation
+ 82    combinations = list(
+ 83        itertools.combinations_with_replacement(all_columns, 2),
+ 84    )
+ 85
+ 86    if '3mr' in args.heuristic:
+ 87        rel_columns = [
+ 88            column for column in all_columns if ' AND_REL ' in column
+ 89        ]
+ 90        non_rel_columns = list(set(all_columns) - set(rel_columns))
+ 91        combinations = list(
+ 92            itertools.combinations_with_replacement(non_rel_columns, 2),
+ 93        )
+ 94        combinations += [(column, args.label_column) for column in rel_columns]
+ 95    else:
+ 96        combinations = list(
+ 97            itertools.combinations_with_replacement(all_columns, 2),
+ 98        )
+ 99
+100    # Diagonal elements
+101    for individual_column in all_columns:
+102        if individual_column != args.label_column:
+103            combinations += [(individual_column, individual_column)]
+104
+105    # Some applications do not require the full feature-feature triangular matrix
+106    if (args.target_ranking_only == 'True') and ('3mr' not in args.heuristic):
+107        combinations = [x for x in combinations if args.label_column in x]
+108
+109    random.shuffle(combinations)
+110    combinations = combinations[: args.combination_number_upper_bound]
+111
+112    if args.heuristic == 'Constant':
+113        final_constant_imp = []
+114        for c1, c2 in combinations:
+115            final_constant_imp.append((c1, c2, 0.0))
+116
+117        out_time_struct['feature_score_computation'] = end_enc_timer - \
+118            start_enc_timer
+119        return BatchRankingSummary(final_constant_imp, out_time_struct)
+120
+121    # Map the scoring calls to the worker pool
+122    pbar.set_description('Allocating thread pool')
+123
+124    # starmap is an alternative that is slower unfortunately (but nicer)
+125    def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
+126        return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
+127
+128    start_enc_timer = timer()
+129    with cpu_pool as p:
+130        pbar.set_description(f'Computing (#ftr={len(combinations)})')
+131        results = p.amap(get_grounded_importances_estimate, combinations)
+132        while not results.ready():
+133            time.sleep(4)
+134        triplets = results.get()
+135    end_enc_timer = timer()
+136    out_time_struct['feature_score_computation'] = end_enc_timer - \
+137        start_enc_timer
+138
+139    # Gather the final triplets
+140    pbar.set_description('Aggregation of ranking results')
+141    final_triplets = []
+142    for triplet in triplets:
+143        inv = (triplet[1], triplet[0], triplet[2])
+144        final_triplets.append(inv)
+145        final_triplets.append(triplet)
+146        triplets = final_triplets
+147
+148    pbar.set_description('Proceeding to the next batch of data')
+149    return BatchRankingSummary(triplets, out_time_struct)
+
+ + +

Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic

+
+ + +
+
+ +
+ + def + enrich_with_transformations( input_dataframe: pandas.core.frame.DataFrame, num_col_types: set[str], logger: Any, args: Any) -> pandas.core.frame.DataFrame: + + + +
+ +
152def enrich_with_transformations(
+153    input_dataframe: pd.DataFrame, num_col_types: set[str], logger: Any, args: Any,
+154) -> pd.DataFrame:
+155    """Construct a collection of new features based on pre-defined transformations/rules"""
+156
+157    transformer = FeatureTransformerGeneric(
+158        num_col_types, preset=args.transformers,
+159    )
+160    transformed_df = transformer.construct_new_features(input_dataframe)
+161    logger.info(
+162        f'Constructed {len(transformer.constructed_feature_names)} new features ..',
+163    )
+164
+165    return transformed_df
+
+ + +

Construct a collection of new features based on pre-defined transformations/rules

+
+ + +
+
+ +
+ + def + compute_combined_features( input_dataframe: pandas.core.frame.DataFrame, logger: Any, args: Any, pbar: Any, is_3mr: bool = False) -> pandas.core.frame.DataFrame: + + + +
+ +
168def compute_combined_features(
+169    input_dataframe: pd.DataFrame,
+170    logger: Any,
+171    args: Any,
+172    pbar: Any,
+173    is_3mr: bool = False,
+174) -> pd.DataFrame:
+175    """Compute higher order features via xxhash-based trick."""
+176
+177    all_columns = [
+178        x for x in input_dataframe.columns if x != args.label_column
+179    ]
+180    join_string = ' AND_REL ' if is_3mr else ' AND '
+181    interaction_order = 2 if is_3mr else args.interaction_order
+182
+183    full_combination_space = list(
+184        itertools.combinations(all_columns, interaction_order),
+185    )
+186
+187    if args.combination_number_upper_bound:
+188        random.shuffle(full_combination_space)
+189        full_combination_space = full_combination_space[
+190            : args.combination_number_upper_bound
+191        ]
+192
+193    com_counter = 0
+194    new_feature_hash = {}
+195    for new_combination in full_combination_space:
+196        pbar.set_description(
+197            f'Created {com_counter}/{len(full_combination_space)}',
+198        )
+199        combined_feature: list[str] = [str(0)] * input_dataframe.shape[0]
+200        for feature in new_combination:
+201            tmp_feature = input_dataframe[feature].tolist()
+202            for enx, el in enumerate(tmp_feature):
+203                combined_feature[enx] = str(
+204                    internal_hash(
+205                        str(combined_feature[enx]) + str(el),
+206                    ),
+207                )
+208        ftr_name = join_string.join(str(x) for x in new_combination)
+209        new_feature_hash[ftr_name] = combined_feature
+210        com_counter += 1
+211    tmp_df = pd.DataFrame(new_feature_hash)
+212    pbar.set_description('Concatenating into final frame ..')
+213    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
+214    del tmp_df
+215
+216    return input_dataframe
+
+ + +

Compute higher order features via xxhash-based trick.

+
+ + +
+
+ +
+ + def + compute_expanded_multivalue_features( input_dataframe: pandas.core.frame.DataFrame, logger: Any, args: Any, pbar: Any) -> pandas.core.frame.DataFrame: + + + +
+ +
219def compute_expanded_multivalue_features(
+220    input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any,
+221) -> pd.DataFrame:
+222    """Compute one-hot encoded feature space based on each designated multivalue feature. E.g., feature with value "a,b,c" becomes three features, values of which are presence of a given value in a mutlivalue feature of choice."""
+223
+224    considered_multivalue_features = args.explode_multivalue_features.split(
+225        ';',
+226    )
+227    new_feature_hash = {}
+228    missing_symbols = set(args.missing_value_symbols.split(','))
+229
+230    for multivalue_feature in considered_multivalue_features:
+231        multivalue_feature_vector = input_dataframe[multivalue_feature].values.tolist(
+232        )
+233        multivalue_feature_vector = [
+234            x.replace(',', '-') for x in multivalue_feature_vector
+235        ]
+236        multivalue_sets = [
+237            set(x.split('-'))
+238            for x in multivalue_feature_vector
+239        ]
+240        unique_values = set.union(*multivalue_sets)
+241
+242        for missing_symbol in missing_symbols:
+243            if missing_symbol in unique_values:
+244                unique_values.remove(missing_symbol)
+245
+246        for unique_value in unique_values:
+247            tmp_vec = []
+248            for enx, multivalue in enumerate(multivalue_sets):
+249                if unique_value in multivalue:
+250                    tmp_vec.append('1')
+251                else:
+252                    tmp_vec.append('')
+253
+254            new_feature_hash[f'MULTIEX-{multivalue_feature}-{unique_value}'] = tmp_vec
+255
+256    tmp_df = pd.DataFrame(new_feature_hash)
+257    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
+258    del tmp_df
+259
+260    return input_dataframe
+
+ + +

Compute one-hot encoded feature space based on each designated multivalue feature. E.g., feature with value "a,b,c" becomes three features, values of which are presence of a given value in a mutlivalue feature of choice.

+
+ + +
+
+ +
+ + def + compute_subfeatures( input_dataframe: pandas.core.frame.DataFrame, logger: Any, args: Any, pbar: Any) -> pandas.core.frame.DataFrame: + + + +
+ +
263def compute_subfeatures(
+264    input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any,
+265) -> pd.DataFrame:
+266    """Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.
+267    ->: One sided construction - every value from left side is fine, separate ones from the right side feature will be considered.
+268    <->: Two sided construction - two-sided values present. This means that each value from a is combined with each from b, forming |A|*|B| new features (one-hot encoded)
+269    """
+270
+271    all_subfeature_pair_seeds = args.subfeature_mapping.split(';')
+272    new_feature_hash = dict()
+273
+274    for seed_pair in all_subfeature_pair_seeds:
+275        if '<->' in seed_pair:
+276            feature_first, feature_second = seed_pair.split('<->')
+277
+278        elif '->' in seed_pair:
+279            feature_first, feature_second = seed_pair.split('->')
+280
+281        else:
+282            raise NotImplementedError(
+283                'Please specify valid subfeature operator (<-> or ->)',
+284            )
+285
+286        subframe = input_dataframe[[feature_first, feature_second]]
+287        unique_feature_second = subframe[feature_second].unique()
+288        feature_first_vec = subframe[feature_first].tolist()
+289        feature_second_vec = subframe[feature_second].tolist()
+290        out_template_feature = [
+291            (a, b) for a, b in zip(feature_first_vec, feature_second_vec)
+292        ]
+293
+294        if '<->' in seed_pair:
+295            unique_feature_first = subframe[feature_first].unique()
+296
+297            mask_types = []
+298            for unique_target_feature_value in unique_feature_second:
+299                for unique_seed_feature_value in unique_feature_first:
+300                    mask_types.append(
+301                        (unique_seed_feature_value, unique_target_feature_value),
+302                    )
+303
+304            for mask_type in mask_types:
+305                new_feature = []
+306                for value_tuple in out_template_feature:
+307                    if (
+308                        value_tuple[0] == mask_type[0]
+309                        and value_tuple[1] == mask_type[1]
+310                    ):
+311                        new_feature.append(str(1))
+312                    else:
+313                        new_feature.append(str(0))
+314                feature_name = (
+315                    f'SUBFEATURE|{feature_first}|{feature_second}-'
+316                    + mask_type[0]
+317                    + '&'
+318                    + mask_type[1]
+319                )
+320                new_feature_hash[feature_name] = new_feature
+321
+322            del new_feature
+323
+324        elif '->' in seed_pair:
+325            for unique_target_feature_value in unique_feature_second:
+326                tmp_new_feature = [
+327                    'AND'.join(
+328                        x,
+329                    ) if x[1] == unique_target_feature_value else ''
+330                    for x in out_template_feature
+331                ]
+332                feature_name_final = (
+333                    'SUBFEATURE-' + feature_first + '&' + unique_target_feature_value
+334                )
+335                new_feature_hash[feature_name_final] = tmp_new_feature
+336
+337    tmp_df = pd.DataFrame(new_feature_hash)
+338    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
+339
+340    del tmp_df
+341    return input_dataframe
+
+ + +

Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction. +->: One sided construction - every value from left side is fine, separate ones from the right side feature will be considered. +<->: Two sided construction - two-sided values present. This means that each value from a is combined with each from b, forming |A|*|B| new features (one-hot encoded)

+
+ + +
+
+ +
+ + def + include_noisy_features( input_dataframe: pandas.core.frame.DataFrame, logger: Any, args: Any) -> pandas.core.frame.DataFrame: + + + +
+ +
344def include_noisy_features(
+345    input_dataframe: pd.DataFrame, logger: Any, args: Any,
+346) -> pd.DataFrame:
+347    """Add randomized features that serve as a sanity check"""
+348
+349    transformer = FeatureTransformerNoise()
+350    transformed_df = transformer.construct_new_features(
+351        input_dataframe, args.label_column,
+352    )
+353
+354    return transformed_df
+
+ + +

Add randomized features that serve as a sanity check

+
+ + +
+
+ +
+ + def + compute_coverage( input_dataframe: pandas.core.frame.DataFrame, args: Any) -> dict[str, set[str]]: + + + +
+ +
357def compute_coverage(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]:
+358    """Compute coverage of features, incrementally"""
+359    output_storage_cov = defaultdict(set)
+360    all_missing_symbols = set(args.missing_value_symbols.split(','))
+361    for column in input_dataframe:
+362        all_missing = sum(
+363            [
+364                input_dataframe[column].values.tolist().count(x)
+365                for x in all_missing_symbols
+366            ],
+367        )
+368
+369        output_storage_cov[column] = (
+370            1 - (all_missing / input_dataframe.shape[0])
+371        ) * 100
+372
+373    return output_storage_cov
+
+ + +

Compute coverage of features, incrementally

+
+ + +
+
+ +
+ + def + compute_feature_memory_consumption( input_dataframe: pandas.core.frame.DataFrame, args: Any) -> dict[str, set[str]]: + + + +
+ +
376def compute_feature_memory_consumption(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]:
+377    """An approximation of how much feature take up"""
+378    output_storage_features = defaultdict(set)
+379    for col in input_dataframe.columns:
+380        specific_column = [
+381            str(x).strip() for x in input_dataframe[col].astype(str).values.tolist()
+382        ]
+383        col_size = sum(
+384            len(x.encode())
+385            for x in specific_column
+386        ) / input_dataframe.shape[0]
+387        output_storage_features[col] = col_size
+388    return output_storage_features
+
+ + +

An approximation of how much feature take up

+
+ + +
+
+ +
+ + def + compute_value_counts(input_dataframe: pandas.core.frame.DataFrame, args: Any): + + + +
+ +
391def compute_value_counts(input_dataframe: pd.DataFrame, args: Any):
+392    """Update the count structure"""
+393
+394    global GLOBAL_RARE_VALUE_STORAGE
+395    global IGNORED_VALUES
+396
+397    for column in input_dataframe.columns:
+398        main_values = input_dataframe[column].values
+399        for value in main_values:
+400            if value not in IGNORED_VALUES:
+401                GLOBAL_RARE_VALUE_STORAGE.update({(column, value): 1})
+402
+403    for key, val in GLOBAL_RARE_VALUE_STORAGE.items():
+404        if val > args.rare_value_count_upper_bound:
+405            IGNORED_VALUES.add(key)
+406
+407    for to_remove_val in IGNORED_VALUES:
+408        del GLOBAL_RARE_VALUE_STORAGE[to_remove_val]
+
+ + +

Update the count structure

+
+ + +
+
+ +
+ + def + compute_cardinalities(input_dataframe: pandas.core.frame.DataFrame, pbar: Any) -> None: + + + +
+ +
411def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any) -> None:
+412    """Compute cardinalities of features, incrementally"""
+413
+414    global GLOBAL_CARDINALITY_STORAGE
+415    output_storage_card = defaultdict(set)
+416    for enx, column in enumerate(input_dataframe):
+417        output_storage_card[column] = set(input_dataframe[column].unique())
+418        if column not in GLOBAL_CARDINALITY_STORAGE:
+419            GLOBAL_CARDINALITY_STORAGE[column] = HyperLogLog(
+420                HYPERLL_ERROR_BOUND,
+421            )
+422
+423        for unique_value in set(input_dataframe[column].unique()):
+424            if unique_value:
+425                GLOBAL_CARDINALITY_STORAGE[column].add(
+426                    internal_hash(unique_value),
+427                )
+428        pbar.set_description(
+429            f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}',
+430        )
+
+ + +

Compute cardinalities of features, incrementally

+
+ + +
+
+ +
+ + def + compute_bounds_increment( input_dataframe: pandas.core.frame.DataFrame, numeric_column_types: set[str]) -> dict[str, typing.Any]: + + + +
+ +
433def compute_bounds_increment(
+434    input_dataframe: pd.DataFrame, numeric_column_types: set[str],
+435) -> dict[str, Any]:
+436    all_features = input_dataframe.columns
+437    numeric_column_types = set(numeric_column_types)
+438    summary_object = {}
+439    summary_storage: Any = {}
+440    for feature in all_features:
+441        if feature in numeric_column_types:
+442            feature_vector = pd.to_numeric(
+443                input_dataframe[feature], errors='coerce',
+444            )
+445            minimum = np.min(feature_vector)
+446            maximum = np.max(feature_vector)
+447            mean = np.mean(feature_vector)
+448            summary_storage = NumericFeatureSummary(
+449                feature, minimum, maximum, mean, len(
+450                    np.unique(feature_vector),
+451                ),
+452            )
+453            summary_object[feature] = summary_storage
+454
+455        else:
+456            feature_vector = input_dataframe[feature].values
+457            summary_storage = NominalFeatureSummary(
+458                feature, len(np.unique(feature_vector)),
+459            )
+460            summary_object[feature] = summary_storage
+461
+462    return summary_object
+
+ + + + +
+
+ +
+ + def + compute_batch_ranking( line_tmp_storage: list[list[typing.Any]], numeric_column_types: set[str], args: Any, cpu_pool: Any, column_descriptions: list[str], logger: Any, pbar: Any) -> tuple[outrank.core_utils.BatchRankingSummary, dict[str, typing.Any], dict[str, set[str]], dict[str, set[str]]]: + + + +
+ +
465def compute_batch_ranking(
+466    line_tmp_storage: list[list[Any]],
+467    numeric_column_types: set[str],
+468    args: Any,
+469    cpu_pool: Any,
+470    column_descriptions: list[str],
+471    logger: Any,
+472    pbar: Any,
+473) -> tuple[BatchRankingSummary, dict[str, Any], dict[str, set[str]], dict[str, set[str]]]:
+474    """Enrich the feature space and compute the batch importances"""
+475
+476    input_dataframe = pd.DataFrame(line_tmp_storage)
+477    input_dataframe.columns = column_descriptions
+478    pbar.set_description('Control features')
+479
+480    if args.feature_set_focus:
+481        if args.feature_set_focus == '_all_from_reference_JSON':
+482            focus_set = extract_features_from_reference_JSON(
+483                args.reference_model_JSON,
+484            )
+485
+486        else:
+487            focus_set = set(args.feature_set_focus.split(','))
+488
+489        focus_set.add(args.label_column)
+490        focus_set = {x for x in focus_set if x in input_dataframe.columns}
+491        input_dataframe = input_dataframe[focus_set]
+492
+493    if args.transformers != 'none':
+494        pbar.set_description('Adding transformations')
+495        input_dataframe = enrich_with_transformations(
+496            input_dataframe, numeric_column_types, logger, args,
+497        )
+498
+499    if args.explode_multivalue_features != 'False':
+500        pbar.set_description('Constructing new features from multivalue ones')
+501        input_dataframe = compute_expanded_multivalue_features(
+502            input_dataframe, logger, args, pbar,
+503        )
+504
+505    if args.subfeature_mapping != 'False':
+506        pbar.set_description('Constructing new (sub)features')
+507        input_dataframe = compute_subfeatures(
+508            input_dataframe, logger, args, pbar,
+509        )
+510
+511    if args.interaction_order > 1:
+512        pbar.set_description('Constructing new features')
+513        input_dataframe = compute_combined_features(
+514            input_dataframe, logger, args, pbar,
+515        )
+516
+517    # in case of 3mr we compute the score of combinations against the target
+518    if '3mr' in args.heuristic:
+519        pbar.set_description(
+520            'Constructing features for computing relations in 3mr',
+521        )
+522        input_dataframe = compute_combined_features(
+523            input_dataframe, logger, args, pbar, True,
+524        )
+525
+526    if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
+527        pbar.set_description('Computing baseline features')
+528        input_dataframe = include_noisy_features(input_dataframe, logger, args)
+529
+530    # Compute incremental statistic useful for data inspection/transformer generation
+531    pbar.set_description('Computing coverage')
+532    coverage_storage = compute_coverage(input_dataframe, args)
+533    feature_memory_consumption = compute_feature_memory_consumption(
+534        input_dataframe, args,
+535    )
+536    compute_cardinalities(input_dataframe, pbar)
+537
+538    if args.task == 'identify_rare_values':
+539        compute_value_counts(input_dataframe, args)
+540
+541    bounds_storage = compute_bounds_increment(
+542        input_dataframe, numeric_column_types,
+543    )
+544
+545    pbar.set_description(
+546        f'Computing ranks for {input_dataframe.shape[1]} features',
+547    )
+548
+549    return (
+550        mixed_rank_graph(input_dataframe, args, cpu_pool, pbar),
+551        bounds_storage,
+552        coverage_storage,
+553        feature_memory_consumption,
+554    )
+
+ + +

Enrich the feature space and compute the batch importances

+
+ + +
+
+ +
+ + def + get_num_of_instances(fname: str) -> int: + + + +
+ +
557def get_num_of_instances(fname: str) -> int:
+558    """Count the number of lines in a file, fast - useful for progress logging"""
+559
+560    def _make_gen(reader):
+561        while True:
+562            b = reader(2**16)
+563            if not b:
+564                break
+565            yield b
+566
+567    with open(fname, 'rb') as f:
+568        count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))
+569    return count
+
+ + +

Count the number of lines in a file, fast - useful for progress logging

+
+ + +
+
+ +
+ + def + get_grouped_df( importances_df_list: list[tuple[str, str, float]]) -> pandas.core.frame.DataFrame: + + + +
+ +
572def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> pd.DataFrame:
+573    """A helper method that enables median-based aggregation after processing"""
+574
+575    importances_df = pd.DataFrame(importances_df_list)
+576    if len(importances_df) == 0:
+577        return None
+578    importances_df.columns = ['FeatureA', 'FeatureB', 'Score']
+579    grouped = importances_df.groupby(
+580        ['FeatureA', 'FeatureB'],
+581    ).median().reset_index()
+582    return grouped
+
+ + +

A helper method that enables median-based aggregation after processing

+
+ + +
+
+ +
+ + def + checkpoint_importances_df(importances_batch: list[tuple[str, str, float]]) -> None: + + + +
+ +
585def checkpoint_importances_df(importances_batch: list[tuple[str, str, float]]) -> None:
+586    """A helper which stores intermediary state - useful for longer runs"""
+587
+588    gdf = get_grouped_df(importances_batch)
+589    if gdf is not None:
+590        gdf.to_csv('ranking_checkpoint_tmp.tsv', sep='\t')
+
+ + +

A helper which stores intermediary state - useful for longer runs

+
+ + +
+
+ +
+ + def + estimate_importances_minibatches( input_file: str, column_descriptions: list, fw_col_mapping: dict[str, str], numeric_column_types: set, batch_size: int = 100000, args: Any = None, data_encoding: str = 'utf-8', cpu_pool: Any = None, delimiter: str = '\t', feature_construction_mode: bool = False, logger: Any = None) -> tuple[list[dict[str, typing.Any]], typing.Any, dict[typing.Any, typing.Any], list[dict[str, typing.Any]], list[dict[str, set[str]]], collections.defaultdict[str, list[set[str]]], dict[str, typing.Any]]: + + + +
+ +
593def estimate_importances_minibatches(
+594    input_file: str,
+595    column_descriptions: list,
+596    fw_col_mapping: dict[str, str],
+597    numeric_column_types: set,
+598    batch_size: int = 100000,
+599    args: Any = None,
+600    data_encoding: str = 'utf-8',
+601    cpu_pool: Any = None,
+602    delimiter: str = '\t',
+603    feature_construction_mode: bool = False,
+604    logger: Any = None,
+605) -> tuple[list[dict[str, Any]], Any, dict[Any, Any], list[dict[str, Any]], list[dict[str, set[str]]], defaultdict[str, list[set[str]]], dict[str, Any]]:
+606    """Interaction score estimator - suitable for example for csv-like input data types.
+607    This type of data is normally a single large csv, meaning that minibatch processing needs to
+608    happen during incremental handling of the file (that"s not the case for pre-separated ob data)
+609    """
+610
+611    invalid_line_queue: Any = deque([], maxlen=2**5)
+612
+613    invalid_lines = 0
+614    line_counter = 0
+615
+616    importances_df: list[Any] = []
+617    line_tmp_storage = []
+618    bounds_storage_batch = []
+619    memory_storage_batch = []
+620    step_timing_checkpoints = []
+621
+622    local_coverage_object = defaultdict(list)
+623    local_pbar = tqdm.tqdm(
+624        total=get_num_of_instances(input_file) - 1, position=0,
+625    )
+626
+627    file_name, file_extension = os.path.splitext(input_file)
+628
+629    if file_extension == '.gz':
+630        file_stream = gzip.open(input_file, 'rt', encoding=data_encoding)
+631
+632    else:
+633        file_stream = open(input_file, encoding=data_encoding)
+634
+635    file_stream.readline()
+636
+637    local_pbar.set_description('Starting ranking computation')
+638    for line in file_stream:
+639        line_counter += 1
+640        local_pbar.update(1)
+641
+642        if line_counter % args.subsampling != 0:
+643            continue
+644
+645        parsed_line = generic_line_parser(
+646            line, delimiter, args, fw_col_mapping, column_descriptions,
+647        )
+648
+649        if len(parsed_line) == len(column_descriptions):
+650            line_tmp_storage.append(parsed_line)
+651
+652        else:
+653            invalid_line_queue.appendleft(str(parsed_line))
+654            invalid_lines += 1
+655
+656        # Batches need to be processed on-the-fly
+657        if len(line_tmp_storage) >= args.minibatch_size:
+658
+659            importances_batch, bounds_storage, coverage_storage, memory_storage = compute_batch_ranking(
+660                line_tmp_storage,
+661                numeric_column_types,
+662                args,
+663                cpu_pool,
+664                column_descriptions,
+665                logger,
+666                local_pbar,
+667            )
+668
+669            bounds_storage_batch.append(bounds_storage)
+670            memory_storage_batch.append(memory_storage)
+671            for k, v in coverage_storage.items():
+672                local_coverage_object[k].append(v)
+673
+674            del coverage_storage
+675
+676            line_tmp_storage = []
+677            step_timing_checkpoints.append(importances_batch.step_times)
+678            importances_df += importances_batch.triplet_scores
+679
+680            if args.heuristic != 'Constant':
+681                local_pbar.set_description('Creating checkpoint')
+682                checkpoint_importances_df(importances_df)
+683
+684    file_stream.close()
+685
+686    local_pbar.set_description('Parsing the remainder')
+687    if invalid_lines > 0:
+688        logger.info(
+689            f"Detected {invalid_lines} invalid lines. If this number is very high, it's possible your header is off - re-check your data/attribute-feature mappings please!",
+690        )
+691
+692        invalid_lines_log = '\n INVALID_LINE ====> '.join(
+693            list(invalid_line_queue)[0:5],
+694        )
+695        logger.info(
+696            f'5 samples of invalid lines are printed below\n {invalid_lines_log}',
+697        )
+698
+699    remaining_batch_size = len(line_tmp_storage)
+700
+701    if remaining_batch_size > 2**10:
+702        line_tmp_storage = line_tmp_storage[: args.minibatch_size]
+703        importances_batch, bounds_storage, coverage_storage, _ = compute_batch_ranking(
+704            line_tmp_storage,
+705            numeric_column_types,
+706            args,
+707            cpu_pool,
+708            column_descriptions,
+709            logger,
+710            local_pbar,
+711        )
+712
+713        for k, v in coverage_storage.items():
+714            local_coverage_object[k].append(v)
+715
+716        step_timing_checkpoints.append(importances_batch.step_times)
+717        importances_df += importances_batch.triplet_scores
+718        bounds_storage = dict()
+719        bounds_storage_batch.append(bounds_storage)
+720        checkpoint_importances_df(importances_df)
+721
+722    local_pbar.set_description('Wrapping up')
+723    local_pbar.close()
+724
+725    return (
+726        step_timing_checkpoints,
+727        get_grouped_df(importances_df),
+728        GLOBAL_CARDINALITY_STORAGE,
+729        bounds_storage_batch,
+730        memory_storage_batch,
+731        local_coverage_object,
+732        GLOBAL_RARE_VALUE_STORAGE,
+733    )
+
+ + +

Interaction score estimator - suitable for example for csv-like input data types. +This type of data is normally a single large csv, meaning that minibatch processing needs to +happen during incremental handling of the file (that"s not the case for pre-separated ob data)

+
+ + +
+
+ + diff --git a/docs/outrank/core_selftest.html b/docs/outrank/core_selftest.html new file mode 100644 index 0000000..47af91f --- /dev/null +++ b/docs/outrank/core_selftest.html @@ -0,0 +1,239 @@ + + + + + + + outrank.core_selftest API documentation + + + + + + + + + +
+
+

+outrank.core_selftest

+ + + + + + +
1# helper set of methods that enable anywhere verification of core functions
+2from __future__ import annotations
+
+ + +
+
+ + diff --git a/docs/outrank/core_utils.html b/docs/outrank/core_utils.html new file mode 100644 index 0000000..e640a6a --- /dev/null +++ b/docs/outrank/core_utils.html @@ -0,0 +1,2209 @@ + + + + + + + outrank.core_utils API documentation + + + + + + + + + +
+
+

+outrank.core_utils

+ + + + + + +
  1from __future__ import annotations
+  2
+  3import csv
+  4import glob
+  5import json
+  6import logging
+  7import os
+  8from collections import Counter
+  9from collections import defaultdict
+ 10from dataclasses import dataclass
+ 11from typing import Any
+ 12from typing import Dict
+ 13from typing import List
+ 14from typing import Optional
+ 15from typing import Set
+ 16from typing import Tuple
+ 17from typing import Union
+ 18
+ 19import numpy as np
+ 20import pandas as pd
+ 21import xxhash
+ 22
+ 23logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
+ 24
+ 25pro_tips = [
+ 26    'OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"',
+ 27    'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).',
+ 28    'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!',
+ 29    'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).',
+ 30    'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)',
+ 31    'Visualization part also includes clustering - this might be very insightful!',
+ 32    'By default OutRank includes feature cardinality and coverage in feature names (card; cov)',
+ 33    'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.',
+ 34    'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).',
+ 35    'Give it as many threads as physically possible (--num_threads).',
+ 36    'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.',
+ 37    'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).',
+ 38    'Your target can be any feature! (explaining one feature with others)',
+ 39    'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).',
+ 40    'Each feature is named as featureName(cardinality, coverage in percents) in the final files.',
+ 41    'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.',
+ 42]
+ 43
+ 44
+ 45def internal_hash(input_obj: str) -> str:
+ 46    """A generic internal hash used throughout ranking procedure - let's hardcode seed here for sure"""
+ 47    return xxhash.xxh32(input_obj, seed=20141025).hexdigest()
+ 48
+ 49
+ 50@dataclass
+ 51class DatasetInformationStorage:
+ 52    """A generic class for holding properties of a given type of dataset"""
+ 53
+ 54    data_path: str
+ 55    column_names: list[str]
+ 56    column_types: set[str]
+ 57    col_delimiter: str | None
+ 58    encoding: str
+ 59    fw_map: dict[str, str] | None
+ 60
+ 61
+ 62@dataclass
+ 63class NumericFeatureSummary:
+ 64    """A generic class storing numeric feature statistics"""
+ 65
+ 66    feature_name: str
+ 67    minimum: float
+ 68    maximum: float
+ 69    median: float
+ 70    num_unique: int
+ 71
+ 72
+ 73@dataclass
+ 74class NominalFeatureSummary:
+ 75    """A generic class storing numeric feature statistics"""
+ 76
+ 77    feature_name: str
+ 78    num_unique: int
+ 79
+ 80
+ 81@dataclass
+ 82class BatchRankingSummary:
+ 83    """A generic class representing batched ranking results"""
+ 84
+ 85    triplet_scores: list[tuple[str, str, float]]
+ 86    step_times: dict[str, Any]
+ 87
+ 88
+ 89def display_random_tip() -> None:
+ 90    TIP_CONTENT = np.random.choice(pro_tips)
+ 91    tip_core = f"""
+ 92=====>
+ 93Random tip: {TIP_CONTENT}
+ 94=====>
+ 95    """
+ 96
+ 97    print(tip_core)
+ 98
+ 99
+100def get_dataset_info(args: Any):
+101    if args.data_source == 'ob-raw-dump':
+102        dataset_info = parse_ob_raw_feature_information(args.data_path)
+103
+104    elif args.data_source == 'ob-vw':
+105        dataset_info = parse_ob_vw_feature_information(args.data_path)
+106
+107    elif args.data_source == 'ob-csv':
+108        dataset_info = parse_csv_with_description_information(args.data_path)
+109
+110    elif args.data_source == 'csv-raw':
+111        dataset_info = parse_csv_raw(args.data_path)
+112    else:
+113        raise NotImplementedError(
+114            'Plase, select a supported data source. Possible sources: {csv-raw, ob-vw, ob-csv}',
+115        )
+116
+117    return dataset_info
+118
+119
+120def display_tool_name() -> None:
+121    tool_name = """
+122
+123
+124                        *///////////////.
+125                     //////////////////////*
+126                   */////////////////////////.
+127                  ////////////// */////////////
+128                  /////////*          /////////
+129                 //////   /////   ////,   /////
+130                  ////////     ///    /////////
+131                  /////   /////  ./////   ////*
+132                   ,////                 ////
+133                     *////             ////.
+134                         ///////*///////
+135
+136
+137    ░█████╗░██╗░░░██╗████████╗██████╗░░█████╗░███╗░░██╗██╗░░██╗
+138    ██╔══██╗██║░░░██║╚══██╔══╝██╔══██╗██╔══██╗████╗░██║██║░██╔╝
+139    ██║░░██║██║░░░██║░░░██║░░░██████╔╝███████║██╔██╗██║█████═╝░
+140    ██║░░██║██║░░░██║░░░██║░░░██╔══██╗██╔══██║██║╚████║██╔═██╗░
+141    ╚█████╔╝╚██████╔╝░░░██║░░░██║░░██║██║░░██║██║░╚███║██║░╚██╗
+142    ░╚════╝░░╚═════╝░░░░╚═╝░░░╚═╝░░╚═╝╚═╝░░╚═╝╚═╝░░╚══╝╚═╝░░╚═╝
+143
+144
+145    """
+146
+147    print(tool_name)
+148
+149
+150def parse_ob_line(
+151    line_string: str, delimiter: str = '\t', args: Any = None,
+152) -> list[str]:
+153    """Outbrain line parsing - generic TSVs"""
+154
+155    line_string = line_string.strip()
+156    parts = line_string.split(delimiter)
+157    return parts
+158
+159
+160def parse_ob_line_vw(
+161    line_string: str,
+162    delimiter: str,
+163    args: Any = None,
+164    fw_col_mapping  = None,
+165    table_header  = None,
+166    include_namespace_info = False,
+167) -> list[str | None]:
+168    """Parse a sparse vw line into a pandas df with pre-defined namespace"""
+169
+170    all_line_parts = line_string.strip().split('|')
+171    label_part = all_line_parts[0].split(' ')[0]
+172    remainder = all_line_parts[1:]
+173    label = label_part
+174    remainder_hash = dict()
+175
+176    # Hash multi-value tuples and store name-val mappings
+177    for remaining_part in remainder:
+178        core_parts = remaining_part.split(' ')
+179        namespace_part = core_parts[0]
+180        other_parts = '-'.join(x for x in core_parts[1:] if x != '')
+181        if namespace_part in fw_col_mapping:
+182            remainder_hash[fw_col_mapping[namespace_part]] = other_parts
+183
+184    # Construct the consistently-mapped instance based on the remainder mapping
+185    the_real_instance = [
+186        remainder_hash.get(
+187            el, None,
+188        ) for el in table_header[1:]
+189    ]
+190    if not include_namespace_info:
+191        the_real_instance = [
+192            x[2:] if not x is None else None for x in the_real_instance
+193        ]
+194
+195    parts = [label] + the_real_instance
+196    return parts
+197
+198
+199def parse_ob_csv_line(
+200    line_string: str, delimiter: str = ',', args: Any = None,
+201) -> list[str]:
+202    """Data can have commas within JSON field dumps"""
+203
+204    clx = list(csv.reader([line_string])).pop()
+205    return clx
+206
+207
+208def generic_line_parser(
+209    line_string: str,
+210    delimiter: str,
+211    args: Any = None,
+212    fw_col_mapping: Any = None,
+213    table_header: Any = None,
+214) -> list[Any]:
+215    """A generic method aimed to parse data from different sources."""
+216
+217    if args.data_source == 'ob-raw-dump':
+218        return parse_ob_line(line_string, delimiter, args)
+219
+220    elif args.data_source == 'ob-vw':
+221        return parse_ob_line_vw(
+222            line_string, delimiter, args, fw_col_mapping, table_header,
+223        )
+224
+225    elif args.data_source == 'ob-csv' or args.data_source == 'csv-raw':
+226        return parse_ob_csv_line(line_string, delimiter, args)
+227
+228    else:
+229        raise NotImplementedError(
+230            'Please, specify a valid --data_source argument!',
+231        )
+232
+233
+234def read_reference_json(json_path) -> dict[str, dict]:
+235    """A helper method for reading a JSON"""
+236    with open(json_path) as jp:
+237        return json.load(jp)
+238
+239
+240def parse_namespace(namespace_path: str) -> tuple[set[str], dict[str, str]]:
+241    """Parse the feature namespace for type awareness"""
+242
+243    float_set = set()
+244    id_feature_map = {}
+245
+246    with open(namespace_path) as nm:
+247        for line in nm:
+248            try:
+249                namespace_parts = line.strip().split(',')
+250                if len(namespace_parts) == 2 and '_' not in namespace_parts[0]:
+251                    fw_id, feature = namespace_parts
+252                    type_name = 'generic'
+253
+254                else:
+255                    fw_id, feature, type_name = namespace_parts
+256
+257                id_feature_map[fw_id] = feature
+258                if type_name == 'f32':
+259                    float_set.add(feature)
+260            except Exception as es:
+261                logging.error(f'\U0001F631 {es} -- {namespace_parts}')
+262
+263    return float_set, id_feature_map
+264
+265
+266def read_column_names(mapping_file: str) -> list[str]:
+267    """Read the col. header"""
+268
+269    with open(mapping_file, encoding='utf-8') as mf:
+270        columns = mf.read().strip().split('\t')
+271    return columns
+272
+273
+274def parse_ob_vw_feature_information(data_path) -> DatasetInformationStorage:
+275    """A generic parser of ob-based data"""
+276
+277    # Get column names
+278    column_descriptions = os.path.join(data_path, 'vw_namespace_map.csv')
+279    column_types, fw_map = parse_namespace(column_descriptions)
+280
+281    # We establish column order here
+282    column_names = ['label'] + list(fw_map.values())
+283
+284    data_path = os.path.join(data_path, 'data.vw.gz')
+285    col_delimiter = None
+286    encoding = 'utf-8'
+287
+288    return DatasetInformationStorage(
+289        data_path, column_names, column_types, col_delimiter, encoding, fw_map,
+290    )
+291
+292
+293def parse_ob_raw_feature_information(data_path) -> DatasetInformationStorage:
+294    """A generic parser of ob-based data"""
+295
+296    # Get column names
+297    column_types: list[str] = []
+298
+299    # Get set of numeric columns
+300    table_header_path = os.path.join(data_path, 'raw_data/0_header/header.csv')
+301    table_header = read_column_names(table_header_path)
+302
+303    data_path_train = os.path.join(data_path, 'raw_data/1_train/*')
+304    col_delimiter = '\t'
+305    encoding = 'utf-8'
+306
+307    final_df = []
+308    core_data_folders = glob.glob(data_path_train)
+309    for actual_data in core_data_folders:
+310        for dump in glob.glob(actual_data + '/*'):
+311            tmp_df = pd.read_csv(
+312                dump, sep='\t', low_memory=True, dtype='object',
+313            )
+314            assert tmp_df.shape[1] == len(table_header)
+315            tmp_df.columns = table_header
+316            final_df.append(tmp_df)
+317
+318    final_df_concat = pd.concat(final_df, axis=0)
+319    final_path = os.path.join(data_path, 'raw_dump.tsv')
+320    logging.info(
+321        f'Stored data dump of dimension {final_df_concat.shape} to {final_path}',
+322    )
+323    final_df_concat.to_csv(final_path, sep='\t', index=False)
+324    data_path = os.path.join(data_path, 'raw_dump.tsv')
+325
+326    return DatasetInformationStorage(
+327        data_path, table_header, set(column_types), col_delimiter, encoding, None,
+328    )
+329
+330
+331def parse_ob_feature_information(data_path) -> DatasetInformationStorage:
+332    """A generic parser of ob-based data"""
+333
+334    # Get column names
+335    column_names = os.path.join(data_path, 'vw_namespace_map.csv')
+336    column_types, _ = parse_namespace(column_names)
+337
+338    # Get set of numeric columns
+339    table_header_path = os.path.join(data_path, 'raw_data/0_header/header.csv')
+340    table_header = read_column_names(table_header_path)
+341
+342    data_path = os.path.join(data_path, 'raw_data/1_train/*')
+343    col_delimiter = '\t'
+344    encoding = 'utf-8'
+345
+346    return DatasetInformationStorage(
+347        data_path, table_header, column_types, col_delimiter, encoding, None,
+348    )
+349
+350
+351def parse_csv_with_description_information(data_path) -> DatasetInformationStorage:
+352    dataset_description = read_reference_json(
+353        os.path.join(data_path, 'dataset_desc.json'),
+354    )
+355    column_names = []
+356    column_types = set()
+357    for feature in dataset_description.get('data_features', []):
+358        feature_name = feature.get('name')
+359        column_names.append(feature_name)
+360        feature_type = feature.get('type', '')
+361        if 'float' in feature_type or 'Float' in feature_type:
+362            column_types.add(feature_name)
+363    col_delimiter = ','
+364    data_path = os.path.join(data_path, 'data.csv')
+365    encoding = 'latin1'
+366    return DatasetInformationStorage(
+367        data_path, column_names, column_types, col_delimiter, encoding, None,
+368    )
+369
+370
+371def parse_csv_raw(data_path) -> DatasetInformationStorage:
+372    column_types: set[str] = set()
+373
+374    data_path = os.path.join(data_path, 'data.csv')
+375    with open(data_path) as inp_data:
+376        header = inp_data.readline()
+377    col_delimiter = ','
+378    column_names = header.strip().split(col_delimiter)
+379    encoding = 'latin1'
+380    return DatasetInformationStorage(
+381        data_path, column_names, column_types, col_delimiter, encoding, None,
+382    )
+383
+384
+385def extract_features_from_reference_JSON(json_path: str) -> set[Any]:
+386    """Given a model's JSON, extract unique features"""
+387
+388    with open(json_path) as jp:
+389        content = json.load(jp)
+390
+391    unique_features = set()
+392    feature_space = content['desc'].get('features', [])
+393    fields_space = content['desc'].get('fields', [])
+394    joint_space = feature_space + fields_space
+395
+396    for feature_tuple in joint_space:
+397        for individual_feature in feature_tuple.split(','):
+398            unique_features.add(individual_feature)
+399
+400    return unique_features
+401
+402
+403def summarize_feature_bounds_for_transformers(
+404    bounds_object_storage: Any,
+405    feature_types: list[str],
+406    task_name: str,
+407    label_name: str,
+408    granularity: int = 15,
+409    output_summary_table_only: bool = False,
+410):
+411    """summarization auxilliary method for generating JSON-based specs"""
+412
+413    if bounds_object_storage is None:
+414        logging.info('Bounds storage object is empty.')
+415        exit()
+416
+417    final_storage = defaultdict(list)
+418    for el in bounds_object_storage:
+419        if isinstance(el, dict):
+420            for k, v in el.items():
+421                final_storage[k].append(v)
+422
+423    summary_table_rows = []
+424    for k, v in final_storage.items():
+425        # Conduct local aggregation + bound changes
+426        if k in feature_types and k != label_name:
+427            minima, maxima, medians, uniques = [], [], [], []
+428            for feature_summary in v:
+429                minima.append(feature_summary.minimum)
+430                maxima.append(feature_summary.maximum)
+431                medians.append(feature_summary.median)
+432                uniques.append(feature_summary.num_unique)
+433            summary_table_rows.append(
+434                [
+435                    k,
+436                    round(np.min(minima), 2),
+437                    round(np.max(maxima), 2),
+438                    round(np.median(medians), 2),
+439                    int(np.mean(uniques)),
+440                ],
+441            )
+442
+443    if len(summary_table_rows) == 0:
+444        logging.info('No numeric features to summarize.')
+445        return None
+446
+447    summary_table: pd.Dataframe = pd.DataFrame(summary_table_rows)
+448    summary_table.columns = [
+449        'Feature',
+450        'Minimum',
+451        'Maximum',
+452        'Median',
+453        'Num avg. unique (batch)',
+454    ]
+455
+456    if output_summary_table_only:
+457        return summary_table
+458
+459    if len(summary_table) == 0:
+460        logging.info('Summary table empty, skipping transformer generation ..')
+461        return
+462
+463    if task_name == 'feature_summary_transformers':
+464        transformers_per_feature = defaultdict(list)
+465
+466        # Take care of weights first -> range is pre-defined
+467        for k, v in final_storage.items():
+468            if label_name in k or 'dummy' in k:
+469                continue
+470
+471            weight_template = {
+472                'feature': k,
+473                'src_features': [k],
+474                'transformations': ['Weight'],
+475                'weights': [0, 0.5, 1.5, 2, 3, 10],
+476            }
+477            transformers_per_feature[k].append(weight_template)
+478
+479        # Consider numeric transformations - pairs and single ones
+480        for enx, row in summary_table.iterrows():
+481            if row.Feature == 'dummy':
+482                continue
+483            try:
+484                actual_range = (
+485                    np.arange(
+486                        row['Minimum'],
+487                        row['Maximum'],
+488                        (row['Maximum'] - row['Minimum']) / granularity,
+489                    )
+490                    .round(2)
+491                    .tolist()
+492                )
+493                binner_template = {
+494                    'feature': f'{row.Feature}',
+495                    'src_features': [row.Feature],
+496                    'transformations': [
+497                        'BinnerSqrt',
+498                        'BinnerLog',
+499                        'BinnerSqrtPlain',
+500                        'BinnerLogPlain',
+501                    ],
+502                    'n': actual_range,
+503                    'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
+504                }
+505
+506            except Exception as es:
+507                logging.info(
+508                    f'\U0001F631 Encountered {es}. The problematic feature is: {row}, skipping transformer for this feature ..',
+509                )
+510
+511            transformers_per_feature[row.Feature].append(binner_template)
+512
+513            # We want the full loop here, due to asymmetry of transformation(s)
+514            for enx_second, row_second in summary_table.iterrows():
+515                if enx_second < enx:
+516                    continue
+517
+518                # The n values are defined based on maxima of the second feature
+519                if row_second.Feature != row.Feature:
+520                    n_bound = round(row_second['Median'] + row['Median'], 2)
+521                    max_bound = round(
+522                        min(row_second['Maximum'], row['Maximum']), 2,
+523                    )
+524                    min_bound = round(
+525                        row_second['Minimum'] + row['Minimum'], 2,
+526                    )
+527                    range_spectrum = sorted(
+528                        list(
+529                            {
+530                                0.0,
+531                                min_bound,
+532                                n_bound / 10,
+533                                n_bound / 5,
+534                                n_bound,
+535                                max_bound,
+536                            },
+537                        ),
+538                    )
+539
+540                    range_spectrum = [x for x in range_spectrum if x >= 0]
+541                    binner_pair_template = {
+542                        'feature': f'{row.Feature}Ratio{row_second.Feature}',
+543                        'src_features': [row.Feature, row_second.Feature],
+544                        'transformations': ['BinnerLogRatioPlain', 'BinnerLogRatio'],
+545                        'n': range_spectrum,
+546                        'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
+547                    }
+548
+549                    binner_pair_template_second = {
+550                        'feature': f'{row_second.Feature}Ratio{row.Feature}',
+551                        'src_features': [row_second.Feature, row.Feature],
+552                        'transformations': ['BinnerLogRatioPlain', 'BinnerLogRatio'],
+553                        'n': range_spectrum,
+554                        'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
+555                    }
+556
+557                    transformers_per_feature[row.Feature].append(
+558                        binner_pair_template,
+559                    )
+560                    transformers_per_feature[row.Feature].append(
+561                        binner_pair_template_second,
+562                    )
+563
+564        binner_templates = []
+565        for k, v in transformers_per_feature.items():
+566            for transformer_struct in v:
+567                binner_templates.append(transformer_struct)
+568
+569        logging.info(
+570            f'Generated {len(binner_templates)} transformation search specifications.\n',
+571        )
+572        namespace_full = f'"random_grid_feature_transform": {json.dumps(binner_templates)}, "random_grid_epochs": 512'
+573        logging.info('Generated transformations below:\n')
+574        print(namespace_full)
+575
+576
+577def summarize_rare_counts(
+578    term_counter: Any,
+579    args: Any,
+580    cardinality_object: Any,
+581    object_info: DatasetInformationStorage,
+582) -> None:
+583    """Write rare values"""
+584
+585    out_df_rows = []
+586    logging.info(
+587        f'Rare value summary (freq <= {args.rare_value_count_upper_bound}) follows ..',
+588    )
+589
+590    for namespace_tuple, count in term_counter.items():
+591        namespace, value = namespace_tuple
+592        out_df_rows.append([namespace, value, count])
+593    out_df: pd.DataFrame = pd.DataFrame(out_df_rows)
+594    out_df.columns = ['Namespace', 'value', 'Count']
+595    out_df.to_csv(
+596        os.path.join(args.output_folder, 'rare_values.tsv'), sep='\t', index=False,
+597    )
+598    logging.info(f'Wrote rare values to {args.output_folder}/rare_values.tsv')
+599
+600    overall_rare_counts = Counter(out_df.Namespace.values)
+601    sorted_counts = sorted(
+602        overall_rare_counts.items(), key=lambda pair: pair[1], reverse=True,
+603    )
+604    for k, v in sorted_counts:
+605        logging.info(f'Namespace: {k} ---- Rare values observed: {v}')
+606
+607    final_df_rows = []
+608    for k, v in sorted_counts:
+609        cardinality = len(cardinality_object[k])
+610        rare_proportion = np.round(100 * (v / cardinality), 2)
+611        col_type = 'nominal'
+612        if k in object_info.column_types:
+613            col_type = 'numeric'
+614        final_df_rows.append(
+615            {
+616                'rare_proportion': rare_proportion,
+617                'feature_type': col_type,
+618                'feature_name': k,
+619            },
+620        )
+621
+622    final_df: pd.DataFrame = pd.DataFrame(final_df_rows)
+623    final_df = final_df.sort_values(by=['rare_proportion'])
+624    logging.info(
+625        f'Wrote feature sparsity summary to {args.output_folder}/feature_sparsity_summary.tsv',
+626    )
+627    final_df.to_csv(
+628        f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t',
+629    )
+
+ + +
+
+
+ pro_tips = + + ['OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"', 'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).', 'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!', 'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).', 'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)', 'Visualization part also includes clustering - this might be very insightful!', 'By default OutRank includes feature cardinality and coverage in feature names (card; cov)', 'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.', 'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).', 'Give it as many threads as physically possible (--num_threads).', 'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.', 'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).', 'Your target can be any feature! (explaining one feature with others)', 'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).', 'Each feature is named as featureName(cardinality, coverage in percents) in the final files.', 'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.'] + + +
+ + + + +
+
+ +
+ + def + internal_hash(input_obj: str) -> str: + + + +
+ +
46def internal_hash(input_obj: str) -> str:
+47    """A generic internal hash used throughout ranking procedure - let's hardcode seed here for sure"""
+48    return xxhash.xxh32(input_obj, seed=20141025).hexdigest()
+
+ + +

A generic internal hash used throughout the ranking procedure; the seed is deliberately hard-coded.

+
+ + +
+
+ +
+
@dataclass
+ + class + DatasetInformationStorage: + + + +
+ +
51@dataclass
+52class DatasetInformationStorage:
+53    """A generic class for holding properties of a given type of dataset"""
+54
+55    data_path: str
+56    column_names: list[str]
+57    column_types: set[str]
+58    col_delimiter: str | None
+59    encoding: str
+60    fw_map: dict[str, str] | None
+
+ + +

A generic class for holding properties of a given type of dataset

+
+ + +
+
+ + DatasetInformationStorage( data_path: str, column_names: list[str], column_types: set[str], col_delimiter: str | None, encoding: str, fw_map: dict[str, str] | None) + + +
+ + + + +
+
+
+ data_path: str + + +
+ + + + +
+
+
+ column_names: list[str] + + +
+ + + + +
+
+
+ column_types: set[str] + + +
+ + + + +
+
+
+ col_delimiter: str | None + + +
+ + + + +
+
+
+ encoding: str + + +
+ + + + +
+
+
+ fw_map: dict[str, str] | None + + +
+ + + + +
+
+
+ +
+
@dataclass
+ + class + NumericFeatureSummary: + + + +
+ +
63@dataclass
+64class NumericFeatureSummary:
+65    """A generic class storing numeric feature statistics"""
+66
+67    feature_name: str
+68    minimum: float
+69    maximum: float
+70    median: float
+71    num_unique: int
+
+ + +

A generic class storing numeric feature statistics

+
+ + +
+
+ + NumericFeatureSummary( feature_name: str, minimum: float, maximum: float, median: float, num_unique: int) + + +
+ + + + +
+
+
+ feature_name: str + + +
+ + + + +
+
+
+ minimum: float + + +
+ + + + +
+
+
+ maximum: float + + +
+ + + + +
+
+
+ median: float + + +
+ + + + +
+
+
+ num_unique: int + + +
+ + + + +
+
+
+ +
+
@dataclass
+ + class + NominalFeatureSummary: + + + +
+ +
74@dataclass
+75class NominalFeatureSummary:
+76    """A generic class storing numeric feature statistics"""
+77
+78    feature_name: str
+79    num_unique: int
+
+ + +

A generic class storing nominal feature statistics

+
+ + +
+
+ + NominalFeatureSummary(feature_name: str, num_unique: int) + + +
+ + + + +
+
+
+ feature_name: str + + +
+ + + + +
+
+
+ num_unique: int + + +
+ + + + +
+
+
+ +
+
@dataclass
+ + class + BatchRankingSummary: + + + +
+ +
82@dataclass
+83class BatchRankingSummary:
+84    """A generic class representing batched ranking results"""
+85
+86    triplet_scores: list[tuple[str, str, float]]
+87    step_times: dict[str, Any]
+
+ + +

A generic class representing batched ranking results

+
+ + +
+
+ + BatchRankingSummary( triplet_scores: list[tuple[str, str, float]], step_times: dict[str, typing.Any]) + + +
+ + + + +
+
+
+ triplet_scores: list[tuple[str, str, float]] + + +
+ + + + +
+
+
+ step_times: dict[str, typing.Any] + + +
+ + + + +
+
+
+ +
+ + def + display_random_tip() -> None: + + + +
+ +
90def display_random_tip() -> None:
+91    TIP_CONTENT = np.random.choice(pro_tips)
+92    tip_core = f"""
+93=====>
+94Random tip: {TIP_CONTENT}
+95=====>
+96    """
+97
+98    print(tip_core)
+
+ + + + +
+
+ +
+ + def + get_dataset_info(args: Any): + + + +
+ +
101def get_dataset_info(args: Any):
+102    if args.data_source == 'ob-raw-dump':
+103        dataset_info = parse_ob_raw_feature_information(args.data_path)
+104
+105    elif args.data_source == 'ob-vw':
+106        dataset_info = parse_ob_vw_feature_information(args.data_path)
+107
+108    elif args.data_source == 'ob-csv':
+109        dataset_info = parse_csv_with_description_information(args.data_path)
+110
+111    elif args.data_source == 'csv-raw':
+112        dataset_info = parse_csv_raw(args.data_path)
+113    else:
+114        raise NotImplementedError(
+115            'Plase, select a supported data source. Possible sources: {csv-raw, ob-vw, ob-csv}',
+116        )
+117
+118    return dataset_info
+
+ + + + +
+
+ +
+ + def + display_tool_name() -> None: + + + +
+ +
121def display_tool_name() -> None:
+122    tool_name = """
+123
+124
+125                        *///////////////.
+126                     //////////////////////*
+127                   */////////////////////////.
+128                  ////////////// */////////////
+129                  /////////*          /////////
+130                 //////   /////   ////,   /////
+131                  ////////     ///    /////////
+132                  /////   /////  ./////   ////*
+133                   ,////                 ////
+134                     *////             ////.
+135                         ///////*///////
+136
+137
+138    ░█████╗░██╗░░░██╗████████╗██████╗░░█████╗░███╗░░██╗██╗░░██╗
+139    ██╔══██╗██║░░░██║╚══██╔══╝██╔══██╗██╔══██╗████╗░██║██║░██╔╝
+140    ██║░░██║██║░░░██║░░░██║░░░██████╔╝███████║██╔██╗██║█████═╝░
+141    ██║░░██║██║░░░██║░░░██║░░░██╔══██╗██╔══██║██║╚████║██╔═██╗░
+142    ╚█████╔╝╚██████╔╝░░░██║░░░██║░░██║██║░░██║██║░╚███║██║░╚██╗
+143    ░╚════╝░░╚═════╝░░░░╚═╝░░░╚═╝░░╚═╝╚═╝░░╚═╝╚═╝░░╚══╝╚═╝░░╚═╝
+144
+145
+146    """
+147
+148    print(tool_name)
+
+ + + + +
+
+ +
+ + def + parse_ob_line(line_string: str, delimiter: str = '\t', args: Any = None) -> list[str]: + + + +
+ +
151def parse_ob_line(
+152    line_string: str, delimiter: str = '\t', args: Any = None,
+153) -> list[str]:
+154    """Outbrain line parsing - generic TSVs"""
+155
+156    line_string = line_string.strip()
+157    parts = line_string.split(delimiter)
+158    return parts
+
+ + +

Outbrain line parsing - generic TSVs

+
+ + +
+
+ +
+ + def + parse_ob_line_vw( line_string: str, delimiter: str, args: Any = None, fw_col_mapping=None, table_header=None, include_namespace_info=False) -> list[str | None]: + + + +
+ +
161def parse_ob_line_vw(
+162    line_string: str,
+163    delimiter: str,
+164    args: Any = None,
+165    fw_col_mapping  = None,
+166    table_header  = None,
+167    include_namespace_info = False,
+168) -> list[str | None]:
+169    """Parse a sparse vw line into a pandas df with pre-defined namespace"""
+170
+171    all_line_parts = line_string.strip().split('|')
+172    label_part = all_line_parts[0].split(' ')[0]
+173    remainder = all_line_parts[1:]
+174    label = label_part
+175    remainder_hash = dict()
+176
+177    # Hash multi-value tuples and store name-val mappings
+178    for remaining_part in remainder:
+179        core_parts = remaining_part.split(' ')
+180        namespace_part = core_parts[0]
+181        other_parts = '-'.join(x for x in core_parts[1:] if x != '')
+182        if namespace_part in fw_col_mapping:
+183            remainder_hash[fw_col_mapping[namespace_part]] = other_parts
+184
+185    # Construct the consistently-mapped instance based on the remainder mapping
+186    the_real_instance = [
+187        remainder_hash.get(
+188            el, None,
+189        ) for el in table_header[1:]
+190    ]
+191    if not include_namespace_info:
+192        the_real_instance = [
+193            x[2:] if not x is None else None for x in the_real_instance
+194        ]
+195
+196    parts = [label] + the_real_instance
+197    return parts
+
+ + +

Parse a sparse vw line into a list of values (label first), ordered by the pre-defined namespace given in the table header

+
+ + +
+
+ +
+ + def + parse_ob_csv_line(line_string: str, delimiter: str = ',', args: Any = None) -> list[str]: + + + +
+ +
200def parse_ob_csv_line(
+201    line_string: str, delimiter: str = ',', args: Any = None,
+202) -> list[str]:
+203    """Data can have commas within JSON field dumps"""
+204
+205    clx = list(csv.reader([line_string])).pop()
+206    return clx
+
+ + +

Data can have commas within JSON field dumps

+
+ + +
+
+ +
+ + def + generic_line_parser( line_string: str, delimiter: str, args: Any = None, fw_col_mapping: Any = None, table_header: Any = None) -> list[typing.Any]: + + + +
+ +
209def generic_line_parser(
+210    line_string: str,
+211    delimiter: str,
+212    args: Any = None,
+213    fw_col_mapping: Any = None,
+214    table_header: Any = None,
+215) -> list[Any]:
+216    """A generic method aimed to parse data from different sources."""
+217
+218    if args.data_source == 'ob-raw-dump':
+219        return parse_ob_line(line_string, delimiter, args)
+220
+221    elif args.data_source == 'ob-vw':
+222        return parse_ob_line_vw(
+223            line_string, delimiter, args, fw_col_mapping, table_header,
+224        )
+225
+226    elif args.data_source == 'ob-csv' or args.data_source == 'csv-raw':
+227        return parse_ob_csv_line(line_string, delimiter, args)
+228
+229    else:
+230        raise NotImplementedError(
+231            'Please, specify a valid --data_source argument!',
+232        )
+
+ + +

A generic method aimed to parse data from different sources.

+
+ + +
+
+ +
+ + def + read_reference_json(json_path) -> dict[str, dict]: + + + +
+ +
235def read_reference_json(json_path) -> dict[str, dict]:
+236    """A helper method for reading a JSON"""
+237    with open(json_path) as jp:
+238        return json.load(jp)
+
+ + +

A helper method for reading a JSON

+
+ + +
+
+ +
+ + def + parse_namespace(namespace_path: str) -> tuple[set[str], dict[str, str]]: + + + +
+ +
241def parse_namespace(namespace_path: str) -> tuple[set[str], dict[str, str]]:
+242    """Parse the feature namespace for type awareness"""
+243
+244    float_set = set()
+245    id_feature_map = {}
+246
+247    with open(namespace_path) as nm:
+248        for line in nm:
+249            try:
+250                namespace_parts = line.strip().split(',')
+251                if len(namespace_parts) == 2 and '_' not in namespace_parts[0]:
+252                    fw_id, feature = namespace_parts
+253                    type_name = 'generic'
+254
+255                else:
+256                    fw_id, feature, type_name = namespace_parts
+257
+258                id_feature_map[fw_id] = feature
+259                if type_name == 'f32':
+260                    float_set.add(feature)
+261            except Exception as es:
+262                logging.error(f'\U0001F631 {es} -- {namespace_parts}')
+263
+264    return float_set, id_feature_map
+
+ + +

Parse the feature namespace for type awareness

+
+ + +
+
+ +
+ + def + read_column_names(mapping_file: str) -> list[str]: + + + +
+ +
267def read_column_names(mapping_file: str) -> list[str]:
+268    """Read the col. header"""
+269
+270    with open(mapping_file, encoding='utf-8') as mf:
+271        columns = mf.read().strip().split('\t')
+272    return columns
+
+ + +

Read the col. header

+
+ + +
+
+ +
+ + def + parse_ob_vw_feature_information(data_path) -> DatasetInformationStorage: + + + +
+ +
275def parse_ob_vw_feature_information(data_path) -> DatasetInformationStorage:
+276    """A generic parser of ob-based data"""
+277
+278    # Get column names
+279    column_descriptions = os.path.join(data_path, 'vw_namespace_map.csv')
+280    column_types, fw_map = parse_namespace(column_descriptions)
+281
+282    # We establish column order here
+283    column_names = ['label'] + list(fw_map.values())
+284
+285    data_path = os.path.join(data_path, 'data.vw.gz')
+286    col_delimiter = None
+287    encoding = 'utf-8'
+288
+289    return DatasetInformationStorage(
+290        data_path, column_names, column_types, col_delimiter, encoding, fw_map,
+291    )
+
+ + +

A generic parser of ob-based data

+
+ + +
+
+ +
+ + def + parse_ob_raw_feature_information(data_path) -> DatasetInformationStorage: + + + +
+ +
294def parse_ob_raw_feature_information(data_path) -> DatasetInformationStorage:
+295    """A generic parser of ob-based data"""
+296
+297    # Get column names
+298    column_types: list[str] = []
+299
+300    # Get set of numeric columns
+301    table_header_path = os.path.join(data_path, 'raw_data/0_header/header.csv')
+302    table_header = read_column_names(table_header_path)
+303
+304    data_path_train = os.path.join(data_path, 'raw_data/1_train/*')
+305    col_delimiter = '\t'
+306    encoding = 'utf-8'
+307
+308    final_df = []
+309    core_data_folders = glob.glob(data_path_train)
+310    for actual_data in core_data_folders:
+311        for dump in glob.glob(actual_data + '/*'):
+312            tmp_df = pd.read_csv(
+313                dump, sep='\t', low_memory=True, dtype='object',
+314            )
+315            assert tmp_df.shape[1] == len(table_header)
+316            tmp_df.columns = table_header
+317            final_df.append(tmp_df)
+318
+319    final_df_concat = pd.concat(final_df, axis=0)
+320    final_path = os.path.join(data_path, 'raw_dump.tsv')
+321    logging.info(
+322        f'Stored data dump of dimension {final_df_concat.shape} to {final_path}',
+323    )
+324    final_df_concat.to_csv(final_path, sep='\t', index=False)
+325    data_path = os.path.join(data_path, 'raw_dump.tsv')
+326
+327    return DatasetInformationStorage(
+328        data_path, table_header, set(column_types), col_delimiter, encoding, None,
+329    )
+
+ + +

A generic parser of ob-based data

+
+ + +
+
+ +
+ + def + parse_ob_feature_information(data_path) -> DatasetInformationStorage: + + + +
+ +
332def parse_ob_feature_information(data_path) -> DatasetInformationStorage:
+333    """A generic parser of ob-based data"""
+334
+335    # Get column names
+336    column_names = os.path.join(data_path, 'vw_namespace_map.csv')
+337    column_types, _ = parse_namespace(column_names)
+338
+339    # Get set of numeric columns
+340    table_header_path = os.path.join(data_path, 'raw_data/0_header/header.csv')
+341    table_header = read_column_names(table_header_path)
+342
+343    data_path = os.path.join(data_path, 'raw_data/1_train/*')
+344    col_delimiter = '\t'
+345    encoding = 'utf-8'
+346
+347    return DatasetInformationStorage(
+348        data_path, table_header, column_types, col_delimiter, encoding, None,
+349    )
+
+ + +

A generic parser of ob-based data

+
+ + +
+
+ +
+ + def + parse_csv_with_description_information(data_path) -> DatasetInformationStorage: + + + +
+ +
352def parse_csv_with_description_information(data_path) -> DatasetInformationStorage:
+353    dataset_description = read_reference_json(
+354        os.path.join(data_path, 'dataset_desc.json'),
+355    )
+356    column_names = []
+357    column_types = set()
+358    for feature in dataset_description.get('data_features', []):
+359        feature_name = feature.get('name')
+360        column_names.append(feature_name)
+361        feature_type = feature.get('type', '')
+362        if 'float' in feature_type or 'Float' in feature_type:
+363            column_types.add(feature_name)
+364    col_delimiter = ','
+365    data_path = os.path.join(data_path, 'data.csv')
+366    encoding = 'latin1'
+367    return DatasetInformationStorage(
+368        data_path, column_names, column_types, col_delimiter, encoding, None,
+369    )
+
+ + + + +
+
+ +
+ + def + parse_csv_raw(data_path) -> DatasetInformationStorage: + + + +
+ +
372def parse_csv_raw(data_path) -> DatasetInformationStorage:
+373    column_types: set[str] = set()
+374
+375    data_path = os.path.join(data_path, 'data.csv')
+376    with open(data_path) as inp_data:
+377        header = inp_data.readline()
+378    col_delimiter = ','
+379    column_names = header.strip().split(col_delimiter)
+380    encoding = 'latin1'
+381    return DatasetInformationStorage(
+382        data_path, column_names, column_types, col_delimiter, encoding, None,
+383    )
+
+ + + + +
+
+ +
+ + def + extract_features_from_reference_JSON(json_path: str) -> set[typing.Any]: + + + +
+ +
386def extract_features_from_reference_JSON(json_path: str) -> set[Any]:
+387    """Given a model's JSON, extract unique features"""
+388
+389    with open(json_path) as jp:
+390        content = json.load(jp)
+391
+392    unique_features = set()
+393    feature_space = content['desc'].get('features', [])
+394    fields_space = content['desc'].get('fields', [])
+395    joint_space = feature_space + fields_space
+396
+397    for feature_tuple in joint_space:
+398        for individual_feature in feature_tuple.split(','):
+399            unique_features.add(individual_feature)
+400
+401    return unique_features
+
+ + +

Given a model's JSON, extract unique features

+
+ + +
+
+ +
+ + def + summarize_feature_bounds_for_transformers( bounds_object_storage: Any, feature_types: list[str], task_name: str, label_name: str, granularity: int = 15, output_summary_table_only: bool = False): + + + +
+ +
404def summarize_feature_bounds_for_transformers(
+405    bounds_object_storage: Any,
+406    feature_types: list[str],
+407    task_name: str,
+408    label_name: str,
+409    granularity: int = 15,
+410    output_summary_table_only: bool = False,
+411):
+412    """summarization auxilliary method for generating JSON-based specs"""
+413
+414    if bounds_object_storage is None:
+415        logging.info('Bounds storage object is empty.')
+416        exit()
+417
+418    final_storage = defaultdict(list)
+419    for el in bounds_object_storage:
+420        if isinstance(el, dict):
+421            for k, v in el.items():
+422                final_storage[k].append(v)
+423
+424    summary_table_rows = []
+425    for k, v in final_storage.items():
+426        # Conduct local aggregation + bound changes
+427        if k in feature_types and k != label_name:
+428            minima, maxima, medians, uniques = [], [], [], []
+429            for feature_summary in v:
+430                minima.append(feature_summary.minimum)
+431                maxima.append(feature_summary.maximum)
+432                medians.append(feature_summary.median)
+433                uniques.append(feature_summary.num_unique)
+434            summary_table_rows.append(
+435                [
+436                    k,
+437                    round(np.min(minima), 2),
+438                    round(np.max(maxima), 2),
+439                    round(np.median(medians), 2),
+440                    int(np.mean(uniques)),
+441                ],
+442            )
+443
+444    if len(summary_table_rows) == 0:
+445        logging.info('No numeric features to summarize.')
+446        return None
+447
+448    summary_table: pd.Dataframe = pd.DataFrame(summary_table_rows)
+449    summary_table.columns = [
+450        'Feature',
+451        'Minimum',
+452        'Maximum',
+453        'Median',
+454        'Num avg. unique (batch)',
+455    ]
+456
+457    if output_summary_table_only:
+458        return summary_table
+459
+460    if len(summary_table) == 0:
+461        logging.info('Summary table empty, skipping transformer generation ..')
+462        return
+463
+464    if task_name == 'feature_summary_transformers':
+465        transformers_per_feature = defaultdict(list)
+466
+467        # Take care of weights first -> range is pre-defined
+468        for k, v in final_storage.items():
+469            if label_name in k or 'dummy' in k:
+470                continue
+471
+472            weight_template = {
+473                'feature': k,
+474                'src_features': [k],
+475                'transformations': ['Weight'],
+476                'weights': [0, 0.5, 1.5, 2, 3, 10],
+477            }
+478            transformers_per_feature[k].append(weight_template)
+479
+480        # Consider numeric transformations - pairs and single ones
+481        for enx, row in summary_table.iterrows():
+482            if row.Feature == 'dummy':
+483                continue
+484            try:
+485                actual_range = (
+486                    np.arange(
+487                        row['Minimum'],
+488                        row['Maximum'],
+489                        (row['Maximum'] - row['Minimum']) / granularity,
+490                    )
+491                    .round(2)
+492                    .tolist()
+493                )
+494                binner_template = {
+495                    'feature': f'{row.Feature}',
+496                    'src_features': [row.Feature],
+497                    'transformations': [
+498                        'BinnerSqrt',
+499                        'BinnerLog',
+500                        'BinnerSqrtPlain',
+501                        'BinnerLogPlain',
+502                    ],
+503                    'n': actual_range,
+504                    'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
+505                }
+506
+507            except Exception as es:
+508                logging.info(
+509                    f'\U0001F631 Encountered {es}. The problematic feature is: {row}, skipping transformer for this feature ..',
+510                )
+511
+512            transformers_per_feature[row.Feature].append(binner_template)
+513
+514            # We want the full loop here, due to asymmetry of transformation(s)
+515            for enx_second, row_second in summary_table.iterrows():
+516                if enx_second < enx:
+517                    continue
+518
+519                # The n values are defined based on maxima of the second feature
+520                if row_second.Feature != row.Feature:
+521                    n_bound = round(row_second['Median'] + row['Median'], 2)
+522                    max_bound = round(
+523                        min(row_second['Maximum'], row['Maximum']), 2,
+524                    )
+525                    min_bound = round(
+526                        row_second['Minimum'] + row['Minimum'], 2,
+527                    )
+528                    range_spectrum = sorted(
+529                        list(
+530                            {
+531                                0.0,
+532                                min_bound,
+533                                n_bound / 10,
+534                                n_bound / 5,
+535                                n_bound,
+536                                max_bound,
+537                            },
+538                        ),
+539                    )
+540
+541                    range_spectrum = [x for x in range_spectrum if x >= 0]
+542                    binner_pair_template = {
+543                        'feature': f'{row.Feature}Ratio{row_second.Feature}',
+544                        'src_features': [row.Feature, row_second.Feature],
+545                        'transformations': ['BinnerLogRatioPlain', 'BinnerLogRatio'],
+546                        'n': range_spectrum,
+547                        'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
+548                    }
+549
+550                    binner_pair_template_second = {
+551                        'feature': f'{row_second.Feature}Ratio{row.Feature}',
+552                        'src_features': [row_second.Feature, row.Feature],
+553                        'transformations': ['BinnerLogRatioPlain', 'BinnerLogRatio'],
+554                        'n': range_spectrum,
+555                        'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
+556                    }
+557
+558                    transformers_per_feature[row.Feature].append(
+559                        binner_pair_template,
+560                    )
+561                    transformers_per_feature[row.Feature].append(
+562                        binner_pair_template_second,
+563                    )
+564
+565        binner_templates = []
+566        for k, v in transformers_per_feature.items():
+567            for transformer_struct in v:
+568                binner_templates.append(transformer_struct)
+569
+570        logging.info(
+571            f'Generated {len(binner_templates)} transformation search specifications.\n',
+572        )
+573        namespace_full = f'"random_grid_feature_transform": {json.dumps(binner_templates)}, "random_grid_epochs": 512'
+574        logging.info('Generated transformations below:\n')
+575        print(namespace_full)
+
+ + +

Summarization auxiliary method for generating JSON-based specs

+
+ + +
+
+ +
+ + def + summarize_rare_counts( term_counter: Any, args: Any, cardinality_object: Any, object_info: DatasetInformationStorage) -> None: + + + +
+ +
def summarize_rare_counts(
    term_counter: Any,
    args: Any,
    cardinality_object: Any,
    object_info: DatasetInformationStorage,
) -> None:
    """Write the rare-value report and the per-feature sparsity summary.

    Two tab-separated files are written into ``args.output_folder``:

    * ``rare_values.tsv`` -- one row per ``(namespace, value)`` key of
      ``term_counter`` together with its count.
    * ``feature_sparsity_summary.tsv`` -- one row per namespace with
      ``rare_proportion`` (100 * number of rare values of the namespace /
      ``len(cardinality_object[namespace])``, rounded to two decimals),
      sorted by ``rare_proportion`` in ascending order.

    Args:
        term_counter: Mapping from ``(namespace, value)`` tuples to counts.
        args: Object exposing ``output_folder`` and
            ``rare_value_count_upper_bound`` (the latter is only logged).
        cardinality_object: Mapping from namespace to an object supporting
            ``len()``; that length is used as the namespace cardinality.
        object_info: Object exposing ``column_types``; namespaces contained in
            it are reported as ``'numeric'``, all others as ``'nominal'``.
    """

    logging.info(
        f'Rare value summary (freq <= {args.rare_value_count_upper_bound}) follows ..',
    )

    out_df_rows = [
        [namespace, value, count]
        for (namespace, value), count in term_counter.items()
    ]
    # Column names are passed to the constructor rather than assigned
    # afterwards: assigning three names to the zero-column frame produced by
    # an empty counter raises a length-mismatch ValueError.
    out_df: pd.DataFrame = pd.DataFrame(
        out_df_rows, columns=['Namespace', 'value', 'Count'],
    )
    out_df.to_csv(
        os.path.join(args.output_folder, 'rare_values.tsv'), sep='\t', index=False,
    )
    logging.info(f'Wrote rare values to {args.output_folder}/rare_values.tsv')

    # Number of distinct rare values per namespace, most affected first.
    overall_rare_counts = Counter(out_df.Namespace.values)
    sorted_counts = sorted(
        overall_rare_counts.items(), key=lambda pair: pair[1], reverse=True,
    )
    for namespace, rare_count in sorted_counts:
        logging.info(
            f'Namespace: {namespace} ---- Rare values observed: {rare_count}',
        )

    final_df_rows = []
    for namespace, rare_count in sorted_counts:
        cardinality = len(cardinality_object[namespace])
        final_df_rows.append(
            {
                'rare_proportion': np.round(100 * (rare_count / cardinality), 2),
                'feature_type': (
                    'numeric' if namespace in object_info.column_types else 'nominal'
                ),
                'feature_name': namespace,
            },
        )

    # Explicit columns keep ``sort_values`` (and the TSV header) valid when
    # there is nothing to report.
    final_df: pd.DataFrame = pd.DataFrame(
        final_df_rows, columns=['rare_proportion', 'feature_type', 'feature_name'],
    )
    final_df = final_df.sort_values(by=['rare_proportion'])
    final_df.to_csv(
        os.path.join(args.output_folder, 'feature_sparsity_summary.tsv'),
        index=False,
        sep='\t',
    )
    # Logged only after the file has actually been written.
    logging.info(
        f'Wrote feature sparsity summary to {args.output_folder}/feature_sparsity_summary.tsv',
    )
+
+ + +

Write rare values

+
+ + +
+
+ + diff --git a/docs/outrank/feature_transformations.html b/docs/outrank/feature_transformations.html new file mode 100644 index 0000000..77c10c6 --- /dev/null +++ b/docs/outrank/feature_transformations.html @@ -0,0 +1,238 @@ + + + + + + + outrank.feature_transformations API documentation + + + + + + + + + +
+
+

+outrank.feature_transformations

+ + + + + +
+
+ + diff --git a/docs/outrank/feature_transformations/feature_transformer_vault.html b/docs/outrank/feature_transformations/feature_transformer_vault.html new file mode 100644 index 0000000..3219c64 --- /dev/null +++ b/docs/outrank/feature_transformations/feature_transformer_vault.html @@ -0,0 +1,259 @@ + + + + + + + outrank.feature_transformations.feature_transformer_vault API documentation + + + + + + + + + +
+
+

+outrank.feature_transformations.feature_transformer_vault

+ + + + + + +
 1from __future__ import annotations
+ 2
+ 3from outrank.feature_transformations.feature_transformer_vault.default_transformers import DEFAULT_TRANSFORMERS
+ 4from outrank.feature_transformations.feature_transformer_vault.default_transformers import MINIMAL_TRANSFORMERS
+ 5from outrank.feature_transformations.feature_transformer_vault.fw_transformers import (
+ 6    FW_TRANSFORMERS,
+ 7)
+ 8
# Lookup table from preset name to the transformer dictionary registered
# under that name.
_tr_global_namespace = {
    'default': DEFAULT_TRANSFORMERS,
    'minimal': MINIMAL_TRANSFORMERS,
    'fw-transformers': FW_TRANSFORMERS,
}
+
+ + +
+
+ + diff --git a/docs/outrank/feature_transformations/feature_transformer_vault/default_transformers.html b/docs/outrank/feature_transformations/feature_transformer_vault/default_transformers.html new file mode 100644 index 0000000..e9db800 --- /dev/null +++ b/docs/outrank/feature_transformations/feature_transformer_vault/default_transformers.html @@ -0,0 +1,312 @@ + + + + + + + outrank.feature_transformations.feature_transformer_vault.default_transformers API documentation + + + + + + + + + +
+
+

+outrank.feature_transformations.feature_transformer_vault.default_transformers

+ + + + + + +
 1# Some boilerplate transformations people tend to use
+ 2from __future__ import annotations
# Each entry maps a feature-name suffix to an expression string written in
# terms of an array named ``X`` and NumPy imported as ``np`` (see the demo at
# the bottom of this module for how an expression is evaluated).
MINIMAL_TRANSFORMERS = {
    '_tr_sqrt': 'np.sqrt(X)',
    '_tr_log(x+1)': 'np.log(X + 1)',
    '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))',
    '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)',
}

# The default preset is the minimal preset (same keys, same order) followed
# by six further expressions.
# NOTE: the key '_tr_log(x + sqrt(pow(x,2), 1)' has unbalanced parentheses; it
# is kept verbatim because the key is a runtime identifier.
DEFAULT_TRANSFORMERS = {
    **MINIMAL_TRANSFORMERS,
    '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))',
    '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))',
    '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)',
    '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)',
    '_tr_nonzero': 'np.where(X != 0, 1, 0)',
    '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)',
}

if __name__ == '__main__':
    import numpy as np

    # generate some input (call it X)
    X = np.random.random(100)

    # get some transformer; direct indexing raises a clear KeyError for an
    # unknown name instead of falling back to '' (eval('') is a SyntaxError)
    some_transformer = DEFAULT_TRANSFORMERS['_tr_nonzero']

    # evaluate to get output (the expression comes from the table above)
    output = eval(some_transformer)

    # check output somehow
    print(output)
+
+ + +
+
+
+ MINIMAL_TRANSFORMERS = + + {'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)'} + + +
+ + + + +
+
+
+ DEFAULT_TRANSFORMERS = + + {'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)'} + + +
+ + + + +
+
+ + diff --git a/docs/outrank/feature_transformations/feature_transformer_vault/fw_transformers.html b/docs/outrank/feature_transformations/feature_transformer_vault/fw_transformers.html new file mode 100644 index 0000000..a3e5a60 --- /dev/null +++ b/docs/outrank/feature_transformations/feature_transformer_vault/fw_transformers.html @@ -0,0 +1,324 @@ + + + + + + + outrank.feature_transformations.feature_transformer_vault.fw_transformers API documentation + + + + + + + + + +
+
+

+outrank.feature_transformations.feature_transformer_vault.fw_transformers

+ + + + + + +
 1from __future__ import annotations
+ 2
+ 3import numpy as np
+ 4
+ 5from outrank.feature_transformations.feature_transformer_vault.default_transformers import (
+ 6    DEFAULT_TRANSFORMERS,
+ 7)
+ 8
# Work on a copy so the default preset itself is not modified.
FW_TRANSFORMERS = DEFAULT_TRANSFORMERS.copy()
resolution_range = [1, 10, 50, 100]
greater_than_range = [1, 2, 4, 8, 16, 32, 64, 96]

# Integer thresholds: values below the threshold are kept, values above it
# are replaced by a rounded sqrt/log of the excess scaled by the resolution,
# and everything else becomes 0. The whitespace inside the expression
# strings is intentionally left exactly as it was.
for resolution in resolution_range:
    for greater_than in greater_than_range:
        FW_TRANSFORMERS[
            '_tr_fw_sqrt_res_{}_gt_{}'.format(resolution, greater_than)
        ] = (
            'np.where(X < {gt}, X, np.where(X>{gt} ,'
            'np.round(np.sqrt(X-{gt})*{res},0), 0))'
        ).format(gt=greater_than, res=resolution)

        FW_TRANSFORMERS[
            '_tr_fw_log_res_{}_gt_{}'.format(resolution, greater_than)
        ] = (
            'np.where(X <{gt}, X, np.where(X >{gt}, '
            'np.round(np.log(X-{gt})*{res},0), 0))'
        ).format(gt=greater_than, res=resolution)

# Same construction with every threshold divided by 100.
for resolution in resolution_range:
    for greater_than in (np.divide(x, 100) for x in greater_than_range):
        FW_TRANSFORMERS[
            '_tr_fw_prob_sqrt_res_{}_gt_{}'.format(resolution, greater_than)
        ] = (
            'np.where(X < {gt}, X, np.where(X>{gt}, '
            'np.round(np.sqrt(X-{gt})*{res},0), 0))'
        ).format(gt=greater_than, res=resolution)

        FW_TRANSFORMERS[
            '_tr_fw_prob_log_res_{}_gt_{}'.format(resolution, greater_than)
        ] = (
            'np.where(X <{gt},X, np.where(X>{gt}, '
            'np.round(np.log(X-{gt})*{res},0), 0))'
        ).format(gt=greater_than, res=resolution)

if __name__ == '__main__':
    print(len(FW_TRANSFORMERS))
+
+ + +
+
+
+ FW_TRANSFORMERS = + + {'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)', '_tr_fw_sqrt_res_1_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*1,0), 0))', '_tr_fw_log_res_1_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*1,0), 0))', '_tr_fw_log_res_1_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*1,0), 0))', '_tr_fw_log_res_1_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*1,0), 0))', '_tr_fw_log_res_1_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*1,0), 0))', '_tr_fw_log_res_1_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*1,0), 0))', '_tr_fw_log_res_1_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*1,0), 0))', '_tr_fw_log_res_1_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*1,0), 0))', '_tr_fw_log_res_1_gt_96': 'np.where(X <96, 
X, np.where(X >96, np.round(np.log(X-96)*1,0), 0))', '_tr_fw_sqrt_res_10_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*10,0), 0))', '_tr_fw_log_res_10_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*10,0), 0))', '_tr_fw_log_res_10_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*10,0), 0))', '_tr_fw_log_res_10_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*10,0), 0))', '_tr_fw_log_res_10_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*10,0), 0))', '_tr_fw_log_res_10_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*10,0), 0))', '_tr_fw_log_res_10_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*10,0), 0))', '_tr_fw_log_res_10_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*10,0), 0))', '_tr_fw_log_res_10_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*10,0), 0))', '_tr_fw_sqrt_res_50_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*50,0), 0))', '_tr_fw_log_res_50_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*50,0), 0))', '_tr_fw_log_res_50_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*50,0), 0))', 
'_tr_fw_sqrt_res_50_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*50,0), 0))', '_tr_fw_log_res_50_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*50,0), 0))', '_tr_fw_log_res_50_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*50,0), 0))', '_tr_fw_log_res_50_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*50,0), 0))', '_tr_fw_log_res_50_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*50,0), 0))', '_tr_fw_log_res_50_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*50,0), 0))', '_tr_fw_log_res_50_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*50,0), 0))', '_tr_fw_sqrt_res_100_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*100,0), 0))', '_tr_fw_log_res_100_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*100,0), 0))', '_tr_fw_log_res_100_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*100,0), 0))', '_tr_fw_log_res_100_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*100,0), 0))', '_tr_fw_log_res_100_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_16': 'np.where(X < 16, X, 
np.where(X>16 ,np.round(np.sqrt(X-16)*100,0), 0))', '_tr_fw_log_res_100_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*100,0), 0))', '_tr_fw_log_res_100_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*100,0), 0))', '_tr_fw_log_res_100_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*100,0), 0))', '_tr_fw_log_res_100_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*100,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.32': 
'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*1,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.64': 
'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*10,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.96': 
'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*50,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*100,0), 0))'} + + +
+ + + + +
+
+
+ resolution_range = +[1, 10, 50, 100] + + +
+ + + + +
+
+
+ greater_than_range = +[1, 2, 4, 8, 16, 32, 64, 96] + + +
+ + + + +
+
+ + diff --git a/docs/outrank/feature_transformations/ranking_transformers.html b/docs/outrank/feature_transformations/ranking_transformers.html new file mode 100644 index 0000000..458b822 --- /dev/null +++ b/docs/outrank/feature_transformations/ranking_transformers.html @@ -0,0 +1,914 @@ + + + + + + + outrank.feature_transformations.ranking_transformers API documentation + + + + + + + + + +
+
+

+outrank.feature_transformations.ranking_transformers

+ + + + + + +
  1# A collection of feature transformers that can be considered
+  2from __future__ import annotations
+  3
+  4import logging
+  5from typing import Any
+  6from typing import Dict
+  7from typing import List
+  8from typing import Set
+  9
+ 10import numpy as np
+ 11import pandas as pd
+ 12
+ 13import outrank.feature_transformations.feature_transformer_vault as transformer_vault
+ 14from outrank.core_utils import internal_hash
+ 15
+ 16logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
+ 17
+ 18
class FeatureTransformerNoise:
    """Append synthetic ``CONTROL-*`` columns to a data frame."""

    def __init__(self):
        # Only the 'default' preset adds columns; any other value adds none.
        self.noise_preset = 'default'

    def construct_new_features(self, dataframe: pd.DataFrame, label_column=None):
        """Generate a few standard noise distributions

        For the ``'default'`` preset the following columns are appended: a
        constant zero, Gaussian and uniform noise, random integers drawn from
        ranges of several sizes, a row sequence, a copy of ``label_column``
        (only when that column exists) and ``CONTROL-volume``, the
        ``internal_hash`` of each row's string representation.

        Args:
            dataframe: Input frame; it is not modified in place.
            label_column: Name of the target column. When it is not among the
                frame's columns a warning is logged and ``CONTROL-target`` is
                omitted.

        Returns:
            A new frame consisting of the input columns followed by the
            control columns.
        """

        n_rows = dataframe.shape[0]
        new_columns = dict()
        if self.noise_preset == 'default':
            new_columns['CONTROL-constant0'] = np.array([0] * n_rows)
            new_columns['CONTROL-gaussian'] = np.random.normal(size=n_rows)
            new_columns['CONTROL-uniform'] = np.random.random(n_rows)
            new_columns['CONTROL-random-binary'] = np.random.randint(
                0, 2, n_rows,
            )
            new_columns['CONTROL-random-card100'] = np.random.randint(
                0, 1 + 1 * 10**2, n_rows,
            )
            new_columns['CONTROL-random-card2k'] = np.random.randint(
                0, 1 + 2 * 10**3, n_rows,
            )
            new_columns['CONTROL-random-card10k'] = np.random.randint(
                0, 1 + 10 * 10**3, n_rows,
            )
            new_columns['CONTROL-random-card50k'] = np.random.randint(
                0, 1 + 50 * 10**3, n_rows,
            )
            new_columns['CONTROL-int-sequence'] = np.arange(0, n_rows, 1.0)

            if label_column not in dataframe.columns:
                # logging.warn is a deprecated alias of logging.warning.
                logging.warning(
                    'Could not find target feature in your data set - please inspect the columns if doing targeted ranking!',
                )
            else:
                # Plain array: row alignment is handled by the explicit index
                # given to the control frame below.
                new_columns['CONTROL-target'] = dataframe[label_column].to_numpy()

            new_columns['CONTROL-volume'] = np.array([
                internal_hash(str(x)) for _, x in dataframe.iterrows()
            ])
        else:
            # Not relevant yet; will be if this is useful.
            pass

        if new_columns:
            # Reuse the input index so concat(axis=1) pairs rows by position;
            # a fresh RangeIndex would be outer-joined against a non-default
            # input index and pad both sides with NaN rows.
            tmp_df = pd.DataFrame(new_columns, index=dataframe.index)
            dataframe = pd.concat([dataframe, tmp_df], axis=1)

        return dataframe
+ 74
+ 75
class FeatureTransformerGeneric:
    """Derive candidate features by evaluating transformer expressions.

    Every expression of the selected preset(s) is evaluated on every numeric
    column; candidates that are (almost) constant or mostly ``nan`` are
    discarded.
    """

    def __init__(self, numeric_column_names: set[str], preset: str = 'default'):
        """Collect the transformer expressions for ``preset``.

        Args:
            numeric_column_names: Names of the columns to transform.
            preset: Comma-separated transformer namespaces looked up in
                ``transformer_vault._tr_global_namespace``.

        Raises:
            NotImplementedError: If a namespace is unknown or empty.
        """
        # Created once, before the loop: re-creating it per namespace kept
        # only the expressions of the last namespace in a multi-preset string.
        self.transformer_collection: dict[str, str] = dict()
        for transformer_namespace in preset.split(','):
            transformer_subspace = transformer_vault._tr_global_namespace.get(
                transformer_namespace.strip(), None,
            )
            if not transformer_subspace:
                raise NotImplementedError(
                    'Please, specify valid transformer namespaces (e.g., default, minimal etc.)',
                )
            self.transformer_collection.update(transformer_subspace)

        self.numeric_column_names = set(numeric_column_names)
        self.constructed_feature_names: set[str] = set()

        # If 80% of values are the same, don't consider a transformation
        self.max_maj_support = 0.80

        # If more than 75% of vals are missing, don't consider a transformation
        self.nan_prop_support = 0.75

    def get_vals(self, tmp_df: pd.DataFrame, col_name: str) -> Any:
        """Return column ``col_name`` as a float array.

        Values are stringified, double quotes are removed and empty strings
        become ``0.0`` before the conversion to ``float``.
        """
        cleaned = (str(x).replace('"', '') for x in tmp_df[col_name].values.tolist())
        return np.array([0.0 if len(x) == 0 else float(x) for x in cleaned])

    def construct_baseline_features(self, dataframe: Any) -> pd.DataFrame:
        """Add ``BASELINE-MISSING-PROPORTION`` and ``BASELINE-DUMMY`` in place.

        The missing proportion of a row is the share of its cells equal to
        the empty string, rounded to one decimal.
        """
        n_columns = dataframe.shape[1]
        fvals = [
            np.round(row.values.tolist().count('') / n_columns, 1)
            for _, row in dataframe.iterrows()
        ]

        dataframe['BASELINE-MISSING-PROPORTION'] = fvals
        dataframe['BASELINE-DUMMY'] = 0

        return dataframe

    def construct_new_features(self, dataframe: Any) -> pd.DataFrame:
        """Append every transformation that passes the support filters.

        A candidate is kept when it has more than one distinct value, its
        most frequent value covers less than ``max_maj_support`` of the rows
        and less than ``nan_prop_support`` of its values are ``'nan'``. Kept
        columns are stored as strings and named ``<column><transformer key>``;
        their names are recorded in ``constructed_feature_names``.

        Raises:
            AssertionError: If a numeric column yields no values.
        """
        new_numeric = set()
        logging.info(
            f'Considering {len(self.transformer_collection)} transformations for {len(self.numeric_column_names)} features ({len(self.transformer_collection) * len(self.numeric_column_names)} new features will be considered).',
        )

        invalid_transforms = 0
        new_columns = dict()
        for numeric_column in self.numeric_column_names:
            # The name ``X`` is part of the contract: the expression strings
            # refer to the column values by exactly this name.
            X = self.get_vals(dataframe, numeric_column)

            if len(X) == 0:
                raise AssertionError(
                    f"Could not retrieve the column {numeric_column}'s values. Please check the data.",
                )

            for k, v in self.transformer_collection.items():
                feature_name = f'{numeric_column}{k}'
                # NOTE(security): eval runs the expression string as code; the
                # strings come from the transformer vault and must never be
                # taken from untrusted input.
                transformed_array = eval(v).astype(str)
                u, c = np.unique(transformed_array, return_counts=True)
                nan_prop = np.count_nonzero(transformed_array == 'nan') / len(
                    transformed_array,
                )
                cfreq = np.divide(np.max(c), np.sum(c))
                if (
                    len(u) > 1
                    and cfreq < self.max_maj_support
                    and nan_prop < self.nan_prop_support
                ):
                    new_columns[feature_name] = transformed_array
                    new_numeric.add(feature_name)
                else:
                    invalid_transforms += 1

        if new_columns:
            # Reuse the input index so concat(axis=1) pairs rows by position
            # instead of outer-joining a fresh RangeIndex with the input's.
            tmp_df = pd.DataFrame(new_columns, index=dataframe.index)
            dataframe = pd.concat([dataframe, tmp_df], axis=1)

        logging.info(
            f'{invalid_transforms} invalid transformations were skipped.',
        )
        self.constructed_feature_names = new_numeric
        return dataframe
+
+ + +
+
+ +
+ + class + FeatureTransformerNoise: + + + +
+ +
class FeatureTransformerNoise:
    """Append synthetic ``CONTROL-*`` columns to a data frame."""

    def __init__(self):
        # Only the 'default' preset adds columns; any other value adds none.
        self.noise_preset = 'default'

    def construct_new_features(self, dataframe: pd.DataFrame, label_column=None):
        """Generate a few standard noise distributions

        For the ``'default'`` preset the following columns are appended: a
        constant zero, Gaussian and uniform noise, random integers drawn from
        ranges of several sizes, a row sequence, a copy of ``label_column``
        (only when that column exists) and ``CONTROL-volume``, the
        ``internal_hash`` of each row's string representation.

        Args:
            dataframe: Input frame; it is not modified in place.
            label_column: Name of the target column. When it is not among the
                frame's columns a warning is logged and ``CONTROL-target`` is
                omitted.

        Returns:
            A new frame consisting of the input columns followed by the
            control columns.
        """

        n_rows = dataframe.shape[0]
        new_columns = dict()
        if self.noise_preset == 'default':
            new_columns['CONTROL-constant0'] = np.array([0] * n_rows)
            new_columns['CONTROL-gaussian'] = np.random.normal(size=n_rows)
            new_columns['CONTROL-uniform'] = np.random.random(n_rows)
            new_columns['CONTROL-random-binary'] = np.random.randint(
                0, 2, n_rows,
            )
            new_columns['CONTROL-random-card100'] = np.random.randint(
                0, 1 + 1 * 10**2, n_rows,
            )
            new_columns['CONTROL-random-card2k'] = np.random.randint(
                0, 1 + 2 * 10**3, n_rows,
            )
            new_columns['CONTROL-random-card10k'] = np.random.randint(
                0, 1 + 10 * 10**3, n_rows,
            )
            new_columns['CONTROL-random-card50k'] = np.random.randint(
                0, 1 + 50 * 10**3, n_rows,
            )
            new_columns['CONTROL-int-sequence'] = np.arange(0, n_rows, 1.0)

            if label_column not in dataframe.columns:
                # logging.warn is a deprecated alias of logging.warning.
                logging.warning(
                    'Could not find target feature in your data set - please inspect the columns if doing targeted ranking!',
                )
            else:
                # Plain array: row alignment is handled by the explicit index
                # given to the control frame below.
                new_columns['CONTROL-target'] = dataframe[label_column].to_numpy()

            new_columns['CONTROL-volume'] = np.array([
                internal_hash(str(x)) for _, x in dataframe.iterrows()
            ])
        else:
            # Not relevant yet; will be if this is useful.
            pass

        if new_columns:
            # Reuse the input index so concat(axis=1) pairs rows by position;
            # a fresh RangeIndex would be outer-joined against a non-default
            # input index and pad both sides with NaN rows.
            tmp_df = pd.DataFrame(new_columns, index=dataframe.index)
            dataframe = pd.concat([dataframe, tmp_df], axis=1)

        return dataframe
+
+ + + + +
+
+ noise_preset + + +
+ + + + +
+
+ +
+ + def + construct_new_features(self, dataframe: pandas.core.frame.DataFrame, label_column=None): + + + +
+ +
24    def construct_new_features(self, dataframe: pd.DataFrame, label_column=None):
+25        """Generate a few standard noise distributions"""
+26
+27        new_columns = dict()
+28        if self.noise_preset == 'default':
+29            new_columns['CONTROL-constant0'] = np.array([0] * dataframe.shape[0])
+30            new_columns['CONTROL-gaussian'] = np.random.normal(
+31                size=dataframe.shape[0],
+32            )
+33            new_columns['CONTROL-uniform'] = np.random.random(
+34                dataframe.shape[0],
+35            )
+36            new_columns['CONTROL-random-binary'] = np.random.randint(
+37                0, 2, dataframe.shape[0],
+38            )
+39            new_columns['CONTROL-random-card100'] = np.random.randint(
+40                0, 1 + 1 * 10**2, dataframe.shape[0],
+41            )
+42            new_columns['CONTROL-random-card2k'] = np.random.randint(
+43                0, 1 + 2 * 10**3, dataframe.shape[0],
+44            )
+45            new_columns['CONTROL-random-card10k'] = np.random.randint(
+46                0, 1 + 10 * 10**3, dataframe.shape[0],
+47            )
+48            new_columns['CONTROL-random-card50k'] = np.random.randint(
+49                0, 1 + 50 * 10**3, dataframe.shape[0],
+50            )
+51            new_columns['CONTROL-int-sequence'] = np.arange(
+52                0, dataframe.shape[0], 1.0,
+53            )
+54
+55            if label_column not in dataframe.columns:
+56                logging.warn(
+57                    'Could not find target feature in your data set - please inspect the columns if doing targeted ranking!',
+58                )
+59            else:
+60                new_columns['CONTROL-target'] = dataframe[label_column]
+61
+62            new_columns['CONTROL-volume'] = np.array([
+63                internal_hash(str(x)) for _, x in dataframe.iterrows()
+64            ])
+65        else:
+66            # Not relevant yet; will be if this is useful.
+67            pass
+68
+69        if len(new_columns) > 0:
+70            tmp_df = pd.DataFrame(new_columns)
+71            dataframe = pd.concat([dataframe, tmp_df], axis=1)
+72            del tmp_df
+73
+74        return dataframe
+
+ + +

Generate a few standard noise distributions

+
+ + +
+
+
+ +
+ + class + FeatureTransformerGeneric: + + + +
+ +
 77class FeatureTransformerGeneric:
+ 78    def __init__(self, numeric_column_names: set[str], preset: str = 'default'):
+ 79        for transformer_namespace in preset.split(','):
+ 80            self.transformer_collection: dict[str, str] = dict()
+ 81            transformer_subspace = transformer_vault._tr_global_namespace.get(
+ 82                transformer_namespace, None,
+ 83            )
+ 84            if transformer_subspace:
+ 85                self.transformer_collection = {
+ 86                    **self.transformer_collection,
+ 87                    **transformer_subspace,
+ 88                }
+ 89
+ 90            if len(self.transformer_collection) == 0:
+ 91                raise NotImplementedError(
+ 92                    'Please, specify valid transformer namespaces (e.g., default, minimal etc.)',
+ 93                )
+ 94
+ 95        self.numeric_column_names = set(numeric_column_names)
+ 96        self.constructed_feature_names: set[str] = set()
+ 97
+ 98        # If 80% of values are the same, don't consider a transformation
+ 99        self.max_maj_support = 0.80
+100
+101        # If more than 75% of vals are missing, don't consider a transformation
+102        self.nan_prop_support = 0.75
+103
+104    def get_vals(self, tmp_df: pd.DataFrame, col_name: str) -> Any:
+105        cvals = tmp_df[col_name].values.tolist()
+106        cvals = [str(x).replace('"', '') for x in cvals]
+107        cvals = [0.0 if len(x) == 0 else float(x) for x in cvals]
+108
+109        return np.array(cvals)
+110
+111    def construct_baseline_features(self, dataframe: Any) -> pd.DataFrame:
+112        fvals = []
+113        for enx, row in dataframe.iterrows():
+114            missing_prop = np.round(
+115                row.values.tolist().count('') / dataframe.shape[1], 1,
+116            )
+117            fvals.append(missing_prop)
+118
+119        dataframe['BASELINE-MISSING-PROPORTION'] = fvals
+120        dataframe['BASELINE-DUMMY'] = 0
+121
+122        return dataframe
+123
+124    def construct_new_features(self, dataframe: Any) -> pd.DataFrame:
+125        new_numeric = set()
+126        logging.info(
+127            f'Considering {len(self.transformer_collection)} transformations for {len(self.numeric_column_names)} features ({len(self.transformer_collection) * len(self.numeric_column_names)} new features will be considered).',
+128        )
+129
+130        invalid_transforms = 0
+131        new_columns = dict()
+132        for numeric_column in self.numeric_column_names:
+133            X = self.get_vals(dataframe, numeric_column)
+134
+135            if len(X) == 0:
+136                raise AssertionError(
+137                    f"Could not retrieve the colomn {numeric_column}'s values. Please check the data.",
+138                )
+139
+140            for k, v in self.transformer_collection.items():
+141                feature_name = f'{numeric_column}{k}'
+142                transformed_array = eval(v).astype(str)
+143                u, c = np.unique(transformed_array, return_counts=True)
+144                nan_prop = np.count_nonzero(transformed_array == 'nan') / len(
+145                    transformed_array,
+146                )
+147                cfreq = np.divide(np.max(c), np.sum(c))
+148                if (
+149                    len(u) > 1
+150                    and cfreq < self.max_maj_support
+151                    and nan_prop < self.nan_prop_support
+152                ):
+153                    new_columns[feature_name] = transformed_array
+154                    new_numeric.add(feature_name)
+155
+156                else:
+157                    invalid_transforms += 1
+158
+159        if len(new_columns) > 0:
+160            tmp_df = pd.DataFrame(new_columns)
+161            dataframe = pd.concat([dataframe, tmp_df], axis=1)
+162            del tmp_df
+163
+164        logging.info(
+165            f'{invalid_transforms} invalid transformations were skipped.',
+166        )
+167        self.numeric_column_names = self.numeric_column_names
+168        self.constructed_feature_names = new_numeric
+169        return dataframe
+
+ + + + +
+ +
+ + FeatureTransformerGeneric(numeric_column_names: set[str], preset: str = 'default') + + + +
+ +
 78    def __init__(self, numeric_column_names: set[str], preset: str = 'default'):
+ 79        for transformer_namespace in preset.split(','):
+ 80            self.transformer_collection: dict[str, str] = dict()
+ 81            transformer_subspace = transformer_vault._tr_global_namespace.get(
+ 82                transformer_namespace, None,
+ 83            )
+ 84            if transformer_subspace:
+ 85                self.transformer_collection = {
+ 86                    **self.transformer_collection,
+ 87                    **transformer_subspace,
+ 88                }
+ 89
+ 90            if len(self.transformer_collection) == 0:
+ 91                raise NotImplementedError(
+ 92                    'Please, specify valid transformer namespaces (e.g., default, minimal etc.)',
+ 93                )
+ 94
+ 95        self.numeric_column_names = set(numeric_column_names)
+ 96        self.constructed_feature_names: set[str] = set()
+ 97
+ 98        # If 80% of values are the same, don't consider a transformation
+ 99        self.max_maj_support = 0.80
+100
+101        # If more than 75% of vals are missing, don't consider a transformation
+102        self.nan_prop_support = 0.75
+
+ + + + +
+
+
+ numeric_column_names + + +
+ + + + +
+
+
+ constructed_feature_names: set[str] + + +
+ + + + +
+
+
+ max_maj_support + + +
+ + + + +
+
+
+ nan_prop_support + + +
+ + + + +
+
+ +
+ + def + get_vals(self, tmp_df: pandas.core.frame.DataFrame, col_name: str) -> Any: + + + +
+ +
104    def get_vals(self, tmp_df: pd.DataFrame, col_name: str) -> Any:
+105        cvals = tmp_df[col_name].values.tolist()
+106        cvals = [str(x).replace('"', '') for x in cvals]
+107        cvals = [0.0 if len(x) == 0 else float(x) for x in cvals]
+108
+109        return np.array(cvals)
+
+ + + + +
+
+ +
+ + def + construct_baseline_features(self, dataframe: Any) -> pandas.core.frame.DataFrame: + + + +
+ +
111    def construct_baseline_features(self, dataframe: Any) -> pd.DataFrame:
+112        fvals = []
+113        for enx, row in dataframe.iterrows():
+114            missing_prop = np.round(
+115                row.values.tolist().count('') / dataframe.shape[1], 1,
+116            )
+117            fvals.append(missing_prop)
+118
+119        dataframe['BASELINE-MISSING-PROPORTION'] = fvals
+120        dataframe['BASELINE-DUMMY'] = 0
+121
+122        return dataframe
+
+ + + + +
+
+ +
+ + def + construct_new_features(self, dataframe: Any) -> pandas.core.frame.DataFrame: + + + +
+ +
124    def construct_new_features(self, dataframe: Any) -> pd.DataFrame:
+125        new_numeric = set()
+126        logging.info(
+127            f'Considering {len(self.transformer_collection)} transformations for {len(self.numeric_column_names)} features ({len(self.transformer_collection) * len(self.numeric_column_names)} new features will be considered).',
+128        )
+129
+130        invalid_transforms = 0
+131        new_columns = dict()
+132        for numeric_column in self.numeric_column_names:
+133            X = self.get_vals(dataframe, numeric_column)
+134
+135            if len(X) == 0:
+136                raise AssertionError(
+137                    f"Could not retrieve the colomn {numeric_column}'s values. Please check the data.",
+138                )
+139
+140            for k, v in self.transformer_collection.items():
+141                feature_name = f'{numeric_column}{k}'
+142                transformed_array = eval(v).astype(str)
+143                u, c = np.unique(transformed_array, return_counts=True)
+144                nan_prop = np.count_nonzero(transformed_array == 'nan') / len(
+145                    transformed_array,
+146                )
+147                cfreq = np.divide(np.max(c), np.sum(c))
+148                if (
+149                    len(u) > 1
+150                    and cfreq < self.max_maj_support
+151                    and nan_prop < self.nan_prop_support
+152                ):
+153                    new_columns[feature_name] = transformed_array
+154                    new_numeric.add(feature_name)
+155
+156                else:
+157                    invalid_transforms += 1
+158
+159        if len(new_columns) > 0:
+160            tmp_df = pd.DataFrame(new_columns)
+161            dataframe = pd.concat([dataframe, tmp_df], axis=1)
+162            del tmp_df
+163
+164        logging.info(
+165            f'{invalid_transforms} invalid transformations were skipped.',
+166        )
+167        self.numeric_column_names = self.numeric_column_names
+168        self.constructed_feature_names = new_numeric
+169        return dataframe
+
+ + + + +
+
+
+ + diff --git a/docs/outrank/task_generators.html b/docs/outrank/task_generators.html new file mode 100644 index 0000000..17abb63 --- /dev/null +++ b/docs/outrank/task_generators.html @@ -0,0 +1,349 @@ + + + + + + + outrank.task_generators API documentation + + + + + + + + + +
+
+

+outrank.task_generators

+ + + + + + +
 1# OutRank is also capable of generating data sets.
+ 2from __future__ import annotations
+ 3
+ 4import logging
+ 5import os
+ 6import shutil
+ 7
+ 8import pandas as pd
+ 9
+10from outrank.algorithms.synthetic_data_generators import generator_naive
+11
+12logging.basicConfig(
+13    format='%(asctime)s - %(message)s',
+14    datefmt='%d-%b-%y %H:%M:%S',
+15)
+16logger = logging.getLogger('syn-logger')
+17logger.setLevel(logging.DEBUG)
+18
+19
+20def outrank_task_generate_data_set(args):
+21    """Core method for generating data sets"""
+22
+23    if args.generator_type == 'naive':
+24        sample, target = generator_naive.generate_random_matrix(
+25            args.num_synthetic_features, args.num_synthetic_rows,
+26        )
+27    else:
+28        raise ValueError(f'Generator {args.generator_type} not implemented.')
+29
+30    dfx = pd.DataFrame(sample)
+31    dfx.columns = [f'f{x}' for x in range(dfx.shape[1])]
+32    dfx['label'] = target
+33    if os.path.exists(args.output_synthetic_df_name) and os.path.isdir(
+34        args.output_synthetic_df_name,
+35    ):
+36        logger.info(
+37            f'Found existing: {args.output_synthetic_df_name}, removing first ..',
+38        )
+39        shutil.rmtree(args.output_synthetic_df_name)
+40    os.mkdir(args.output_synthetic_df_name)
+41    dfx.to_csv(f'./{args.output_synthetic_df_name}/data.csv', index=False)
+42
+43    logger.info(
+44        f'Generated data set of shape {dfx.shape} in {args.output_synthetic_df_name}',
+45    )
+
+ + +
+
+
+ logger = +<Logger syn-logger (DEBUG)> + + +
+ + + + +
+
+ +
+ + def + outrank_task_generate_data_set(args): + + + +
+ +
21def outrank_task_generate_data_set(args):
+22    """Core method for generating data sets"""
+23
+24    if args.generator_type == 'naive':
+25        sample, target = generator_naive.generate_random_matrix(
+26            args.num_synthetic_features, args.num_synthetic_rows,
+27        )
+28    else:
+29        raise ValueError(f'Generator {args.generator_type} not implemented.')
+30
+31    dfx = pd.DataFrame(sample)
+32    dfx.columns = [f'f{x}' for x in range(dfx.shape[1])]
+33    dfx['label'] = target
+34    if os.path.exists(args.output_synthetic_df_name) and os.path.isdir(
+35        args.output_synthetic_df_name,
+36    ):
+37        logger.info(
+38            f'Found existing: {args.output_synthetic_df_name}, removing first ..',
+39        )
+40        shutil.rmtree(args.output_synthetic_df_name)
+41    os.mkdir(args.output_synthetic_df_name)
+42    dfx.to_csv(f'./{args.output_synthetic_df_name}/data.csv', index=False)
+43
+44    logger.info(
+45        f'Generated data set of shape {dfx.shape} in {args.output_synthetic_df_name}',
+46    )
+
+ + +

Core method for generating data sets

+
+ + +
+
+ + diff --git a/docs/outrank/task_ranking.html b/docs/outrank/task_ranking.html new file mode 100644 index 0000000..e7288da --- /dev/null +++ b/docs/outrank/task_ranking.html @@ -0,0 +1,797 @@ + + + + + + + outrank.task_ranking API documentation + + + + + + + + + +
+
+

+outrank.task_ranking

+ + + + + + +
  1from __future__ import annotations
+  2
+  3import glob
+  4import logging
+  5import os
+  6import signal
+  7from typing import Any
+  8
+  9import numpy as np
+ 10import pandas as pd
+ 11
+ 12from outrank.algorithms.importance_estimator import rank_features_3MR
+ 13from outrank.core_ranking import estimate_importances_minibatches
+ 14from outrank.core_utils import display_random_tip
+ 15from outrank.core_utils import display_tool_name
+ 16from outrank.core_utils import get_dataset_info
+ 17from outrank.core_utils import summarize_feature_bounds_for_transformers
+ 18from outrank.core_utils import summarize_rare_counts
+ 19
+ 20logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
+ 21signal.signal(signal.SIGINT, signal.default_int_handler)
+ 22
+ 23try:
+ 24    # pathos enables proper pickling during parallelization (multiprocessing does not)
+ 25    from pathos.multiprocessing import ProcessingPool as Pool
+ 26
+ 27except Exception as es:
+ 28    logging.info(
+ 29        f'\U0001F631 Please install the "pathos" library (pip install pathos) for required multithreading capabilities. {es}',
+ 30    )
+ 31
+ 32
+ 33def outrank_task_conduct_ranking(args: Any):
+ 34    # Data source = folder structure + relevant file specifications
+ 35
+ 36    # No need for full-blown ranking in this case
+ 37    if args.task in ['identify_rare_values', 'feature_summary_transformers']:
+ 38        args.heuristic = 'Constant'
+ 39
+ 40    display_tool_name()
+ 41    display_random_tip()
+ 42
+ 43    dataset_info = get_dataset_info(args)
+ 44
+ 45    for arg in vars(args):
+ 46        logging.info(f'{arg} set to: {getattr(args, arg)}')
+ 47
+ 48    # Generate output folders (if not present)
+ 49    output_dir = os.path.dirname(
+ 50        os.path.join(
+ 51            args.output_folder, 'pairwise_ranks.tsv',
+ 52        ),
+ 53    )
+ 54    if not os.path.exists(output_dir):
+ 55        os.makedirs(output_dir)
+ 56
+ 57    # Initialize the global pool
+ 58    GLOBAL_CPU_POOL = Pool(args.num_threads)
+ 59    global_mutual_information_estimates = []
+ 60    global_bounds_storage = []
+ 61    global_memory_storage = []
+ 62    all_timings = []
+ 63    # Traverse the batches
+ 64    for raw_dump in glob.glob(dataset_info.data_path):
+ 65
+ 66        if (
+ 67            args.data_source == 'ob-vw'
+ 68            or args.data_source == 'ob-csv'
+ 69            or args.data_source == 'csv-raw'
+ 70            or args.data_source == 'ob-raw-dump'
+ 71        ):
+ 72            all_subfiles = [raw_dump]
+ 73
+ 74        for partial_data in all_subfiles:
+ 75            cmd_arguments = {
+ 76                'input_file': partial_data,
+ 77                'fw_col_mapping': dataset_info.fw_map,
+ 78                'column_descriptions': dataset_info.column_names,
+ 79                'numeric_column_types': dataset_info.column_types,
+ 80                'args': args,
+ 81                'data_encoding': dataset_info.encoding,
+ 82                'cpu_pool': GLOBAL_CPU_POOL,
+ 83                'delimiter': dataset_info.col_delimiter,
+ 84                'logger': logging,
+ 85            }
+ 86
+ 87            if (
+ 88                args.data_source == 'ob-csv'
+ 89                or args.data_source == 'ob-vw'
+ 90                or args.data_source == 'csv-raw'
+ 91                or args.data_source == 'ob-raw-dump'
+ 92            ):
+ 93                (
+ 94                    checkpoint_timings,
+ 95                    mutual_information_estimates,
+ 96                    cardinality_object,
+ 97                    bounds_object_storage,
+ 98                    memory_object_storage,
+ 99                    coverage_object,
+100                    RARE_VALUE_STORAGE,
+101                ) = estimate_importances_minibatches(**cmd_arguments)
+102
+103            global_bounds_storage += bounds_object_storage
+104            global_memory_storage += memory_object_storage
+105            all_timings += checkpoint_timings
+106
+107            if cardinality_object is None:
+108                continue
+109
+110            if coverage_object is None:
+111                continue
+112
+113            if mutual_information_estimates is not None:
+114                global_mutual_information_estimates.append(
+115                    mutual_information_estimates,
+116                )
+117
+118    if args.task == 'identify_rare_values':
+119        logging.info('Summarizing rare values ..')
+120        summarize_rare_counts(
+121            RARE_VALUE_STORAGE, args, cardinality_object, dataset_info,
+122        )
+123        exit()
+124
+125    if args.task == 'feature_summary_transformers':
+126        summarize_feature_bounds_for_transformers(
+127            bounds_object_storage,
+128            dataset_info.column_types,
+129            args.task,
+130            args.label_column,
+131        )
+132        exit()
+133    else:
+134        summary_of_numeric_features = summarize_feature_bounds_for_transformers(
+135            bounds_object_storage,
+136            dataset_info.column_types,
+137            args.task,
+138            args.label_column,
+139            output_summary_table_only=True,
+140        )
+141        if summary_of_numeric_features is not None:
+142            num_out = os.path.join(
+143                args.output_folder, 'numeric_feature_statistics.tsv',
+144            )
+145            summary_of_numeric_features.to_csv(num_out, sep='\t', index=False)
+146            logging.info(
+147                f'Stored statistics of numeric features to {num_out} ..',
+148            )
+149
+150    # Just in case.
+151    GLOBAL_CPU_POOL.close()
+152    GLOBAL_CPU_POOL.join()
+153
+154    if len(global_mutual_information_estimates) == 0:
+155        logging.info('No rankings were obtained, exiting ..')
+156        exit()
+157
+158    # Compute median imps across batches
+159    triplets = pd.concat(global_mutual_information_estimates, axis=0)
+160    triplets.columns = ['FeatureA', 'FeatureB', 'Score']
+161
+162    if '3mr' in args.heuristic:
+163        # relevance include MI-scores of features w.r.t. labels
+164        relevance_df = triplets[triplets.FeatureB == args.label_column].copy()
+165        relevance_df = relevance_df[
+166            relevance_df.FeatureA.map(lambda x: ' AND_REL ' not in x)
+167        ][['FeatureA', 'Score']]
+168        relevance_df = relevance_df[relevance_df.FeatureA != args.label_column]
+169
+170        # relations include MI-scores of combinations w.r.t. label
+171        relations_df = triplets[triplets.FeatureB == args.label_column][
+172            ['FeatureA', 'Score']
+173        ].copy()
+174        relations_df = relations_df[
+175            relations_df.FeatureA.map(lambda x: ' AND_REL ' in x)
+176        ]
+177        relations_df['FeatureB'] = relations_df.FeatureA.map(
+178            lambda x: x.split(' AND_REL ')[1],
+179        )
+180        relations_df['FeatureA'] = relations_df.FeatureA.map(
+181            lambda x: x.split(' AND_REL ')[0],
+182        )
+183
+184        # redundancies include MI-scores of features w.r.t. non-label features
+185        redundancies_df = triplets[(
+186            triplets.FeatureB != args.label_column
+187        )].copy()
+188        redundancies_df = redundancies_df[
+189            redundancies_df.FeatureA !=
+190            args.label_column
+191        ]
+192        redundancies_df = redundancies_df[
+193            redundancies_df.apply(
+194                lambda x: (' AND_REL ' not in x.FeatureA)
+195                and (' AND_REL ' not in x.FeatureB),
+196                axis=1,
+197            )
+198        ]
+199
+200        # normalize
+201        relevance_df['score'] = (relevance_df.Score - relevance_df.Score.min()) / (
+202            relevance_df.Score.max() - relevance_df.Score.min()
+203        )
+204        relations_df['score'] = (relations_df.Score - relations_df.Score.min()) / (
+205            relations_df.Score.max() - relations_df.Score.min()
+206        )
+207        redundancies_df['score'] = (
+208            redundancies_df.Score - redundancies_df.Score.min()
+209        ) / (redundancies_df.Score.max() - redundancies_df.Score.min())
+210
+211        # create dicts
+212        relevance_dict = {
+213            row.FeatureA: row.score for _,
+214            row in relevance_df.iterrows()
+215        }
+216        relations_dict = {
+217            (row.FeatureA, row.FeatureB): row.score
+218            for _, row in relations_df.iterrows()
+219        }
+220        relations_dict.update(
+221            {
+222                (row.FeatureB, row.FeatureA): row.score
+223                for _, row in relations_df.iterrows()
+224            },
+225        )
+226        redundancy_dict = {
+227            (row.FeatureA, row.FeatureB): row.score
+228            for _, row in redundancies_df.iterrows()
+229        }
+230
+231        # compute 3mr ranks
+232        mrmrmr_ranking = rank_features_3MR(
+233            relevance_dict, redundancy_dict, relations_dict,
+234        )
+235        mrmrmr_ranking.to_csv(
+236            os.path.join(args.output_folder, '3mr_ranks.tsv'), sep='\t', index=False,
+237        )
+238
+239    feature_first_modified = []
+240    feature_second_modified = []
+241
+242    if args.include_cardinality_in_feature_names == 'True':
+243        for enx in range(triplets.shape[0]):
+244            feature_first = triplets.iloc[enx]['FeatureA']
+245            feature_second = triplets.iloc[enx]['FeatureB']
+246            card_first = str(len(cardinality_object[feature_first]))
+247            card_second = str(len(cardinality_object[feature_second]))
+248            cov_first = int(
+249                round((np.mean(np.array(coverage_object[feature_first]))), 1),
+250            )
+251            cov_second = int(
+252                round(np.mean(np.array(coverage_object[feature_second])), 1),
+253            )
+254
+255            feature_first_modified.append(
+256                feature_first + f'-({card_first}; {cov_first})',
+257            )
+258            feature_second_modified.append(
+259                feature_second + f'-({card_second}; {cov_second})',
+260            )
+261
+262        triplets['FeatureA'] = feature_first_modified
+263        triplets['FeatureB'] = feature_second_modified
+264
+265    feature_memory_df = pd.DataFrame(global_memory_storage).mean()
+266    feature_memory_df.columns = ['NormalizedSize']
+267    feature_memory_df.to_csv(
+268        f'{args.output_folder}/memory.tsv', sep='\t', index=True,
+269    )
+270
+271    triplets = triplets.sort_values(by=['Score'])
+272
+273    triplets.to_csv(
+274        os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t', index=False,
+275    )
+276
+277    dfx = pd.DataFrame(all_timings)
+278    dfx.to_json(f'{args.output_folder}/timings.json')
+279
+280    logging.info(
+281        f'Finished with ranking! Result stored as: {args.output_folder}/pairwise_ranks.tsv.',
+282    )
+283
+284    os.remove('ranking_checkpoint_tmp.tsv')
+
+ + +
+
+ +
+ + def + outrank_task_conduct_ranking(args: Any): + + + +
+ +
 34def outrank_task_conduct_ranking(args: Any):
+ 35    # Data source = folder structure + relevant file specifications
+ 36
+ 37    # No need for full-blown ranking in this case
+ 38    if args.task in ['identify_rare_values', 'feature_summary_transformers']:
+ 39        args.heuristic = 'Constant'
+ 40
+ 41    display_tool_name()
+ 42    display_random_tip()
+ 43
+ 44    dataset_info = get_dataset_info(args)
+ 45
+ 46    for arg in vars(args):
+ 47        logging.info(f'{arg} set to: {getattr(args, arg)}')
+ 48
+ 49    # Generate output folders (if not present)
+ 50    output_dir = os.path.dirname(
+ 51        os.path.join(
+ 52            args.output_folder, 'pairwise_ranks.tsv',
+ 53        ),
+ 54    )
+ 55    if not os.path.exists(output_dir):
+ 56        os.makedirs(output_dir)
+ 57
+ 58    # Initialize the global pool
+ 59    GLOBAL_CPU_POOL = Pool(args.num_threads)
+ 60    global_mutual_information_estimates = []
+ 61    global_bounds_storage = []
+ 62    global_memory_storage = []
+ 63    all_timings = []
+ 64    # Traverse the batches
+ 65    for raw_dump in glob.glob(dataset_info.data_path):
+ 66
+ 67        if (
+ 68            args.data_source == 'ob-vw'
+ 69            or args.data_source == 'ob-csv'
+ 70            or args.data_source == 'csv-raw'
+ 71            or args.data_source == 'ob-raw-dump'
+ 72        ):
+ 73            all_subfiles = [raw_dump]
+ 74
+ 75        for partial_data in all_subfiles:
+ 76            cmd_arguments = {
+ 77                'input_file': partial_data,
+ 78                'fw_col_mapping': dataset_info.fw_map,
+ 79                'column_descriptions': dataset_info.column_names,
+ 80                'numeric_column_types': dataset_info.column_types,
+ 81                'args': args,
+ 82                'data_encoding': dataset_info.encoding,
+ 83                'cpu_pool': GLOBAL_CPU_POOL,
+ 84                'delimiter': dataset_info.col_delimiter,
+ 85                'logger': logging,
+ 86            }
+ 87
+ 88            if (
+ 89                args.data_source == 'ob-csv'
+ 90                or args.data_source == 'ob-vw'
+ 91                or args.data_source == 'csv-raw'
+ 92                or args.data_source == 'ob-raw-dump'
+ 93            ):
+ 94                (
+ 95                    checkpoint_timings,
+ 96                    mutual_information_estimates,
+ 97                    cardinality_object,
+ 98                    bounds_object_storage,
+ 99                    memory_object_storage,
+100                    coverage_object,
+101                    RARE_VALUE_STORAGE,
+102                ) = estimate_importances_minibatches(**cmd_arguments)
+103
+104            global_bounds_storage += bounds_object_storage
+105            global_memory_storage += memory_object_storage
+106            all_timings += checkpoint_timings
+107
+108            if cardinality_object is None:
+109                continue
+110
+111            if coverage_object is None:
+112                continue
+113
+114            if mutual_information_estimates is not None:
+115                global_mutual_information_estimates.append(
+116                    mutual_information_estimates,
+117                )
+118
+119    if args.task == 'identify_rare_values':
+120        logging.info('Summarizing rare values ..')
+121        summarize_rare_counts(
+122            RARE_VALUE_STORAGE, args, cardinality_object, dataset_info,
+123        )
+124        exit()
+125
+126    if args.task == 'feature_summary_transformers':
+127        summarize_feature_bounds_for_transformers(
+128            bounds_object_storage,
+129            dataset_info.column_types,
+130            args.task,
+131            args.label_column,
+132        )
+133        exit()
+134    else:
+135        summary_of_numeric_features = summarize_feature_bounds_for_transformers(
+136            bounds_object_storage,
+137            dataset_info.column_types,
+138            args.task,
+139            args.label_column,
+140            output_summary_table_only=True,
+141        )
+142        if summary_of_numeric_features is not None:
+143            num_out = os.path.join(
+144                args.output_folder, 'numeric_feature_statistics.tsv',
+145            )
+146            summary_of_numeric_features.to_csv(num_out, sep='\t', index=False)
+147            logging.info(
+148                f'Stored statistics of numeric features to {num_out} ..',
+149            )
+150
+151    # Just in case.
+152    GLOBAL_CPU_POOL.close()
+153    GLOBAL_CPU_POOL.join()
+154
+155    if len(global_mutual_information_estimates) == 0:
+156        logging.info('No rankings were obtained, exiting ..')
+157        exit()
+158
+159    # Compute median imps across batches
+160    triplets = pd.concat(global_mutual_information_estimates, axis=0)
+161    triplets.columns = ['FeatureA', 'FeatureB', 'Score']
+162
+163    if '3mr' in args.heuristic:
+164        # relevance include MI-scores of features w.r.t. labels
+165        relevance_df = triplets[triplets.FeatureB == args.label_column].copy()
+166        relevance_df = relevance_df[
+167            relevance_df.FeatureA.map(lambda x: ' AND_REL ' not in x)
+168        ][['FeatureA', 'Score']]
+169        relevance_df = relevance_df[relevance_df.FeatureA != args.label_column]
+170
+171        # relations include MI-scores of combinations w.r.t. label
+172        relations_df = triplets[triplets.FeatureB == args.label_column][
+173            ['FeatureA', 'Score']
+174        ].copy()
+175        relations_df = relations_df[
+176            relations_df.FeatureA.map(lambda x: ' AND_REL ' in x)
+177        ]
+178        relations_df['FeatureB'] = relations_df.FeatureA.map(
+179            lambda x: x.split(' AND_REL ')[1],
+180        )
+181        relations_df['FeatureA'] = relations_df.FeatureA.map(
+182            lambda x: x.split(' AND_REL ')[0],
+183        )
+184
+185        # redundancies include MI-scores of features w.r.t. non-label features
+186        redundancies_df = triplets[(
+187            triplets.FeatureB != args.label_column
+188        )].copy()
+189        redundancies_df = redundancies_df[
+190            redundancies_df.FeatureA !=
+191            args.label_column
+192        ]
+193        redundancies_df = redundancies_df[
+194            redundancies_df.apply(
+195                lambda x: (' AND_REL ' not in x.FeatureA)
+196                and (' AND_REL ' not in x.FeatureB),
+197                axis=1,
+198            )
+199        ]
+200
+201        # normalize
+202        relevance_df['score'] = (relevance_df.Score - relevance_df.Score.min()) / (
+203            relevance_df.Score.max() - relevance_df.Score.min()
+204        )
+205        relations_df['score'] = (relations_df.Score - relations_df.Score.min()) / (
+206            relations_df.Score.max() - relations_df.Score.min()
+207        )
+208        redundancies_df['score'] = (
+209            redundancies_df.Score - redundancies_df.Score.min()
+210        ) / (redundancies_df.Score.max() - redundancies_df.Score.min())
+211
+212        # create dicts
+213        relevance_dict = {
+214            row.FeatureA: row.score for _,
+215            row in relevance_df.iterrows()
+216        }
+217        relations_dict = {
+218            (row.FeatureA, row.FeatureB): row.score
+219            for _, row in relations_df.iterrows()
+220        }
+221        relations_dict.update(
+222            {
+223                (row.FeatureB, row.FeatureA): row.score
+224                for _, row in relations_df.iterrows()
+225            },
+226        )
+227        redundancy_dict = {
+228            (row.FeatureA, row.FeatureB): row.score
+229            for _, row in redundancies_df.iterrows()
+230        }
+231
+232        # compute 3mr ranks
+233        mrmrmr_ranking = rank_features_3MR(
+234            relevance_dict, redundancy_dict, relations_dict,
+235        )
+236        mrmrmr_ranking.to_csv(
+237            os.path.join(args.output_folder, '3mr_ranks.tsv'), sep='\t', index=False,
+238        )
+239
+240    feature_first_modified = []
+241    feature_second_modified = []
+242
+243    if args.include_cardinality_in_feature_names == 'True':
+244        for enx in range(triplets.shape[0]):
+245            feature_first = triplets.iloc[enx]['FeatureA']
+246            feature_second = triplets.iloc[enx]['FeatureB']
+247            card_first = str(len(cardinality_object[feature_first]))
+248            card_second = str(len(cardinality_object[feature_second]))
+249            cov_first = int(
+250                round((np.mean(np.array(coverage_object[feature_first]))), 1),
+251            )
+252            cov_second = int(
+253                round(np.mean(np.array(coverage_object[feature_second])), 1),
+254            )
+255
+256            feature_first_modified.append(
+257                feature_first + f'-({card_first}; {cov_first})',
+258            )
+259            feature_second_modified.append(
+260                feature_second + f'-({card_second}; {cov_second})',
+261            )
+262
+263        triplets['FeatureA'] = feature_first_modified
+264        triplets['FeatureB'] = feature_second_modified
+265
+266    feature_memory_df = pd.DataFrame(global_memory_storage).mean()
+267    feature_memory_df.columns = ['NormalizedSize']
+268    feature_memory_df.to_csv(
+269        f'{args.output_folder}/memory.tsv', sep='\t', index=True,
+270    )
+271
+272    triplets = triplets.sort_values(by=['Score'])
+273
+274    triplets.to_csv(
+275        os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t', index=False,
+276    )
+277
+278    dfx = pd.DataFrame(all_timings)
+279    dfx.to_json(f'{args.output_folder}/timings.json')
+280
+281    logging.info(
+282        f'Finished with ranking! Result stored as: {args.output_folder}/pairwise_ranks.tsv.',
+283    )
+284
+285    os.remove('ranking_checkpoint_tmp.tsv')
+
+ + + + +
+
+ + diff --git a/docs/outrank/task_selftest.html b/docs/outrank/task_selftest.html new file mode 100644 index 0000000..f4ab0cf --- /dev/null +++ b/docs/outrank/task_selftest.html @@ -0,0 +1,342 @@ + + + + + + + outrank.task_selftest API documentation + + + + + + + + + +
+
+

+outrank.task_selftest

+ + + + + + +
 1# helper set of methods that enable anywhere verification of core functions
+ 2from __future__ import annotations
+ 3
+ 4import logging
+ 5import os
+ 6import shutil
+ 7import subprocess
+ 8
+ 9import pandas as pd
+10
+11logging.basicConfig(
+12    format='%(asctime)s - %(message)s',
+13    datefmt='%d-%b-%y %H:%M:%S',
+14)
+15logger = logging.getLogger('syn-logger')
+16logger.setLevel(logging.DEBUG)
+17
+18
+19def conduct_self_test():
+20    # Simulate full flow, ranking only
+21    subprocess.run(
+22        'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
+23    )
+24    subprocess.run(
+25        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
+26        shell=True,
+27    )
+28
+29    dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t')
+30
+31    logger.info("Verifying output's properties ..")
+32    assert dfx.shape[0] == 201
+33    assert dfx.shape[1] == 3
+34    assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)'
+35
+36    to_remove = ['ranking_outputs', 'test_data_synthetic']
+37    for path in to_remove:
+38        if os.path.exists(path) and os.path.isdir(path):
+39            logger.info(f'Removing {path} as part of cleanup ..')
+40            shutil.rmtree(path)
+41
+42    logger.info('All tests passed, OutRank seems in shape \N{winking face}')
+
+ + +
+
+
+ logger = +<Logger syn-logger (DEBUG)> + + +
+ + + + +
+
+ +
+ + def + conduct_self_test(): + + + +
+ +
20def conduct_self_test():
+21    # Simulate full flow, ranking only
+22    subprocess.run(
+23        'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
+24    )
+25    subprocess.run(
+26        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
+27        shell=True,
+28    )
+29
+30    dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t')
+31
+32    logger.info("Verifying output's properties ..")
+33    assert dfx.shape[0] == 201
+34    assert dfx.shape[1] == 3
+35    assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)'
+36
+37    to_remove = ['ranking_outputs', 'test_data_synthetic']
+38    for path in to_remove:
+39        if os.path.exists(path) and os.path.isdir(path):
+40            logger.info(f'Removing {path} as part of cleanup ..')
+41            shutil.rmtree(path)
+42
+43    logger.info('All tests passed, OutRank seems in shape \N{winking face}')
+
+ + + + +
+
+ + diff --git a/docs/outrank/task_summary.html b/docs/outrank/task_summary.html new file mode 100644 index 0000000..4c65d91 --- /dev/null +++ b/docs/outrank/task_summary.html @@ -0,0 +1,401 @@ + + + + + + + outrank.task_summary API documentation + + + + + + + + + +
+
+

+outrank.task_summary

+ + + + + + +
 1from __future__ import annotations
+ 2
+ 3import logging
+ 4import os
+ 5from collections import defaultdict
+ 6
+ 7import numpy as np
+ 8import pandas as pd
+ 9
+10logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
+11
+12
+13def outrank_task_result_summary(args):
+14    triplets = pd.read_csv(
+15        os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t',
+16    )
+17    triplets = triplets.sort_values(by=['Score'], ascending=False)
+18    final_ranking = []
+19    for enx, row in triplets.iterrows():
+20        final_row = None
+21        if args.label_column == row['FeatureA'].split('-')[0]:
+22            final_row = [row['FeatureB'], row['Score']]
+23        if args.label_column == row['FeatureB'].split('-')[0]:
+24            final_row = [row['FeatureA'], row['Score']]
+25        if final_row and args.label_column != final_row[0]:
+26            final_ranking.append(final_row)
+27
+28    final_df = pd.DataFrame(final_ranking)
+29    final_df.columns = ['Feature', f'Score {args.heuristic}']
+30    final_df.index = np.arange(1, final_df.shape[0] + 1, 1)
+31    final_df = (
+32        final_df.groupby(by=['Feature'])
+33        .median()
+34        .reset_index()
+35        .sort_values(by=[f'Score {args.heuristic}'], ascending=False)
+36    )
+37
+38    min_score = np.min(final_df[f'Score {args.heuristic}'].values)
+39    max_score = np.max(final_df[f'Score {args.heuristic}'].values)
+40    final_df[f'Score {args.heuristic}'] = (
+41        final_df[f'Score {args.heuristic}'] - min_score
+42    ) / (max_score - min_score)
+43    logging.info(f'Storing summary files to {args.output_folder}')
+44    pd.set_option('display.max_rows', None, 'display.max_columns', None)
+45    singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')
+46    final_df = final_df.reset_index(drop=True)
+47    final_df.to_csv(singles_path, sep='\t')
+48
+49    if args.interaction_order > 1:
+50        feature_store = defaultdict(list)
+51        for enx, row in final_df.iterrows():
+52            fname = row['Feature']
+53            score = row[f'Score {args.heuristic}']
+54            if 'AND' in fname:
+55                for el in fname.split('-')[0].split(' AND '):
+56                    feature_store[el].append(score)
+57
+58        final_aggregate_df = []
+59        for k, v in feature_store.items():
+60            final_aggregate_df.append(
+61                {
+62                    'Feature': k,
+63                    f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(
+64                        v,
+65                    ),
+66                },
+67            )
+68        final_aggregate_df = pd.DataFrame(final_aggregate_df)
+69        final_aggregate_df.to_csv(
+70            os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t',
+71        )
+72
+73    final_df = final_df[final_df['Feature'].str.contains('_tr_')]
+74    final_df.to_csv(
+75        singles_path.replace('.tsv', '_transformers_only_imp.tsv'), sep='\t',
+76    )
+
+ + +
+
+ +
+ + def + outrank_task_result_summary(args): + + + +
+ +
14def outrank_task_result_summary(args):
+15    triplets = pd.read_csv(
+16        os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t',
+17    )
+18    triplets = triplets.sort_values(by=['Score'], ascending=False)
+19    final_ranking = []
+20    for enx, row in triplets.iterrows():
+21        final_row = None
+22        if args.label_column == row['FeatureA'].split('-')[0]:
+23            final_row = [row['FeatureB'], row['Score']]
+24        if args.label_column == row['FeatureB'].split('-')[0]:
+25            final_row = [row['FeatureA'], row['Score']]
+26        if final_row and args.label_column != final_row[0]:
+27            final_ranking.append(final_row)
+28
+29    final_df = pd.DataFrame(final_ranking)
+30    final_df.columns = ['Feature', f'Score {args.heuristic}']
+31    final_df.index = np.arange(1, final_df.shape[0] + 1, 1)
+32    final_df = (
+33        final_df.groupby(by=['Feature'])
+34        .median()
+35        .reset_index()
+36        .sort_values(by=[f'Score {args.heuristic}'], ascending=False)
+37    )
+38
+39    min_score = np.min(final_df[f'Score {args.heuristic}'].values)
+40    max_score = np.max(final_df[f'Score {args.heuristic}'].values)
+41    final_df[f'Score {args.heuristic}'] = (
+42        final_df[f'Score {args.heuristic}'] - min_score
+43    ) / (max_score - min_score)
+44    logging.info(f'Storing summary files to {args.output_folder}')
+45    pd.set_option('display.max_rows', None, 'display.max_columns', None)
+46    singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')
+47    final_df = final_df.reset_index(drop=True)
+48    final_df.to_csv(singles_path, sep='\t')
+49
+50    if args.interaction_order > 1:
+51        feature_store = defaultdict(list)
+52        for enx, row in final_df.iterrows():
+53            fname = row['Feature']
+54            score = row[f'Score {args.heuristic}']
+55            if 'AND' in fname:
+56                for el in fname.split('-')[0].split(' AND '):
+57                    feature_store[el].append(score)
+58
+59        final_aggregate_df = []
+60        for k, v in feature_store.items():
+61            final_aggregate_df.append(
+62                {
+63                    'Feature': k,
+64                    f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(
+65                        v,
+66                    ),
+67                },
+68            )
+69        final_aggregate_df = pd.DataFrame(final_aggregate_df)
+70        final_aggregate_df.to_csv(
+71            os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t',
+72        )
+73
+74    final_df = final_df[final_df['Feature'].str.contains('_tr_')]
+75    final_df.to_csv(
+76        singles_path.replace('.tsv', '_transformers_only_imp.tsv'), sep='\t',
+77    )
+
+ + + + +
+
+ + diff --git a/docs/outrank/task_visualization.html b/docs/outrank/task_visualization.html new file mode 100644 index 0000000..157be8c --- /dev/null +++ b/docs/outrank/task_visualization.html @@ -0,0 +1,301 @@ + + + + + + + outrank.task_visualization API documentation + + + + + + + + + +
+
+

+outrank.task_visualization

+ + + + + + +
 1from __future__ import annotations
+ 2
+ 3import logging
+ 4import os
+ 5
+ 6import pandas as pd
+ 7
+ 8from outrank.visualizations.ranking_visualization import visualize_all
+ 9
+10logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
+11
+12
+13def outrank_task_visualize_results(args):
+14    logging.info(f'Beginning visualization based on: {args.output_folder}.')
+15
+16    triplets = pd.read_csv(
+17        os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t',
+18    )
+19    visualize_all(
+20        triplets,
+21        args.output_folder,
+22        args.label_column,
+23        args.reference_model_JSON,
+24        image_format=args.image_format,
+25        heuristic=args.heuristic,
+26    )
+
+ + +
+
+ +
+ + def + outrank_task_visualize_results(args): + + + +
+ +
14def outrank_task_visualize_results(args):
+15    logging.info(f'Beginning visualization based on: {args.output_folder}.')
+16
+17    triplets = pd.read_csv(
+18        os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t',
+19    )
+20    visualize_all(
+21        triplets,
+22        args.output_folder,
+23        args.label_column,
+24        args.reference_model_JSON,
+25        image_format=args.image_format,
+26        heuristic=args.heuristic,
+27    )
+
+ + + + +
+
+ + diff --git a/docs/outrank/visualizations.html b/docs/outrank/visualizations.html new file mode 100644 index 0000000..0d8575a --- /dev/null +++ b/docs/outrank/visualizations.html @@ -0,0 +1,237 @@ + + + + + + + outrank.visualizations API documentation + + + + + + + + + +
+
+

+outrank.visualizations

+ + + + + +
+
+ + diff --git a/docs/outrank/visualizations/ranking_visualization.html b/docs/outrank/visualizations/ranking_visualization.html new file mode 100644 index 0000000..8c8d06d --- /dev/null +++ b/docs/outrank/visualizations/ranking_visualization.html @@ -0,0 +1,980 @@ + + + + + + + outrank.visualizations.ranking_visualization API documentation + + + + + + + + + +
+
+

+outrank.visualizations.ranking_visualization

+ + + + + + +
  1from __future__ import annotations
+  2
+  3import logging
+  4import os
+  5
+  6import matplotlib.pyplot as plt
+  7import numpy as np
+  8import pandas as pd
+  9import seaborn as sns
+ 10from scipy.cluster import hierarchy
+ 11from sklearn.manifold import TSNE
+ 12from sklearn.metrics import silhouette_score
+ 13
+ 14from outrank.core_utils import read_reference_json
+ 15
+ 16logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
+ 17plt.rcParams['figure.figsize'] = (50, 30)
+ 18
+ 19
+ 20def visualize_hierarchical_clusters(
+ 21    triplet_dataframe: pd.DataFrame,
+ 22    output_folder: str,
+ 23    image_format: str = 'png',
+ 24    max_num_clusters: int = 100,
+ 25) -> None:
+ 26    """A method for visualization of hierarchical clusters w.r.t. different linkage functions"""
+ 27
+ 28    # Prepare the canvas
+ 29    plt.rcParams['figure.figsize'] = (10, 5)
+ 30    unique_features = triplet_dataframe.FeatureA.unique()
+ 31
+ 32    if len(unique_features) > 1000:
+ 33        logging.info('Trying to visualize too many features, exiting ..')
+ 34        exit()
+ 35
+ 36    dmat = np.zeros((len(unique_features), len(unique_features)))
+ 37    logging.info('Preparing the data for clustering ..')
+ 38
+ 39    if triplet_dataframe.shape[0] > 10**5:
+ 40        logging.info(
+ 41            'Trying to visualize more than 10 ** 5 triplets, exiting ..',
+ 42        )
+ 43        exit()
+ 44
+ 45    pivot_table = pd.pivot_table(
+ 46        triplet_dataframe,
+ 47        values='Score',
+ 48        index='FeatureA',
+ 49        columns='FeatureB',
+ 50        aggfunc=np.mean,
+ 51    )
+ 52
+ 53    # We need distances
+ 54    pivot_table.fillna(0, inplace=True)
+ 55    dmat = 1 - pivot_table.values
+ 56
+ 57    # Visualize different dendrograms
+ 58    logging.info('Clustering ..')
+ 59
+ 60    for linkage_heuristic in [
+ 61        # 'single', 'complete', 'average', 'weighted', 'centroid'
+ 62        'complete',
+ 63    ]:
+ 64        # Compute the linkage structure
+ 65        Z = hierarchy.linkage(dmat, linkage_heuristic)
+ 66
+ 67        # Visualize
+ 68        hierarchy.dendrogram(
+ 69            Z, above_threshold_color='y', orientation='top', labels=unique_features,
+ 70        )
+ 71        # Store
+ 72        plt.title(f'Linkage function: {linkage_heuristic}')
+ 73        plt.tight_layout()
+ 74        out_path = f'{output_folder}/dendrogram_{linkage_heuristic}.{image_format}'
+ 75        plt.savefig(out_path, dpi=300)
+ 76
+ 77        # Clean for subsequent plots
+ 78        plt.clf()
+ 79        plt.cla()
+ 80        logging.info(
+ 81            f'Visualized hierarchical clustering with linkage {linkage_heuristic} to {out_path}',
+ 82        )
+ 83
+ 84        # Step 1: Identify relevant distance threshold bounds
+ 85        range_min, range_max = np.min(
+ 86            pivot_table.values,
+ 87        ), np.max(pivot_table.values)
+ 88        spectrum = np.arange(
+ 89            range_min, range_max,
+ 90            (range_max - range_min) / 1000,
+ 91        )
+ 92        max_silhouette = 0
+ 93        top_clustering = []
+ 94        full_silhouette_space = []
+ 95
+ 96        # Step 2: Compute Silhouette for each threshold and store the results
+ 97        for possible_threshold in spectrum:
+ 98            cluster_assignments = hierarchy.fcluster(Z, possible_threshold)
+ 99            num_clusters = len(np.unique(cluster_assignments))
+100            if num_clusters > 2 and num_clusters < max_num_clusters:
+101                try:
+102                    sil_score = silhouette_score(
+103                        pivot_table, cluster_assignments,
+104                    )
+105
+106                except Exception:
+107                    continue
+108
+109                full_silhouette_space.append(
+110                    [
+111                        sil_score, possible_threshold, len(
+112                            np.unique(cluster_assignments),
+113                        ),
+114                    ],
+115                )
+116                if sil_score >= max_silhouette:
+117                    top_clustering = cluster_assignments
+118                    max_silhouette = sil_score
+119
+120        # Step 3: We are interested in the best clustering w.r.t. Silhouette
+121        dfx = pd.DataFrame(full_silhouette_space)
+122        if len(dfx) == 0:
+123            logging.info('Silhouette space empty, exiting')
+124            exit()
+125
+126        dfx.columns = ['Silhouette', 'threshold', 'numClusters']
+127        sns.lineplot(x=dfx.numClusters, y=dfx.Silhouette, color='black')
+128        plt.tight_layout()
+129        out_path = f'{output_folder}/SilhouetteProfile.{image_format}'
+130        plt.savefig(out_path, dpi=300)
+131        plt.clf()
+132        plt.cla()
+133        logging.info('Stored the Silhouette profile.')
+134
+135        final_feature_cluster_df = pd.DataFrame(
+136            list(zip(top_clustering, pivot_table.index)),
+137        )
+138        final_feature_cluster_df.columns = ['ClusterID', 'Feature']
+139        final_feature_cluster_df.to_csv(
+140            f'{output_folder}/TopClustering.tsv', sep='\t',
+141        )
+142
+143        # Get 2D embeddings of features and visualize them
+144        try:
+145            projected_data = TSNE().fit_transform(pivot_table.values)
+146            projected_data = pd.DataFrame(projected_data)
+147            projected_data.columns = ['Dim1', 'Dim2']
+148            projected_data['ClusterID'] = top_clustering
+149            projected_data['ClusterID'] = projected_data['ClusterID'].astype(
+150                str,
+151            )
+152            sns.scatterplot(
+153                x=projected_data.Dim1,
+154                y=projected_data.Dim2,
+155                hue=projected_data.ClusterID,
+156                palette='Set2',
+157            )
+158            plt.savefig(
+159                f'{output_folder}/clustersEmbeddingVisualization.pdf', dpi=300,
+160            )
+161            plt.clf()
+162            plt.cla()
+163        except:
+164            pass
+165
+166        # Step 4: We are interested in the best clustering w.r.t. Silhouette
+167        # Not here yet
+168
+169    plt.rcParams['figure.figsize'] = (50, 30)
+170
+171
+172def visualize_heatmap(
+173    triplets: pd.DataFrame, output_folder: str, image_format: str,
+174) -> None:
+175    # Compute the interaction pivot table
+176    sns.set(font_scale=2)
+177    fig, ax = plt.subplots()
+178    pivot_table = pd.pivot_table(
+179        triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc=np.mean,
+180    )
+181    mask = np.zeros_like(pivot_table.values)
+182    mask[np.triu_indices_from(mask)] = True
+183    fsize_heatmap = 20
+184    if pivot_table.shape[0] > 100:
+185        sns.set(font_scale=1)
+186        fsize_heatmap = 3
+187
+188    logging.info('Visualizing the heatmap ..')
+189
+190    if pivot_table.shape[0] > 500:
+191        logging.info(
+192            'Skipping heatmap visualization due to too many elements ..',
+193        )
+194        return
+195
+196    # Visualize the table
+197    plt.figure(figsize=(50, 50))
+198    plt.rcParams.update({'font.size': 1})
+199    sns.heatmap(
+200        pivot_table,
+201        annot=True,
+202        mask=mask,
+203        annot_kws={'size': fsize_heatmap},
+204        square=False,
+205        cmap='coolwarm',
+206        linecolor='black',
+207        linewidths=0.05,
+208    )
+209    plt.xlabel('')
+210    plt.ylabel('')
+211    plt.tight_layout()
+212    plt.savefig(f'{output_folder}/heatmap.{image_format}', dpi=500)
+213    plt.clf()
+214    plt.cla()
+215    logging.info(f'Stored heatmap to: {output_folder}/heatmap.{image_format}')
+216
+217
+218def visualize_barplots(
+219    triplets: pd.DataFrame,
+220    output_folder: str,
+221    reference_json: str,
+222    image_format: str,
+223    label: str,
+224    heuristic: str,
+225) -> None:
+226    # Extract only the interactions related to the target attribute
+227    sns.set(font_scale=8)
+228    feature_ranks_rows = []
+229    for enx, row in triplets.iterrows():
+230        feature_A = row['FeatureA']
+231        feature_B = row['FeatureB']
+232        if label in feature_A:
+233            feature_ranks_rows.append([feature_B, row.Score])
+234        elif label in feature_B:
+235            feature_ranks_rows.append([feature_A, row.Score])
+236
+237    # Align with an existing model
+238    feature_ranks: pd.DataFrame = pd.DataFrame(feature_ranks_rows)
+239    feature_ranks.columns = ['Feature', 'Value']
+240    feature_ranks = feature_ranks[
+241        ~feature_ranks['Feature'].str.contains(
+242            label,
+243        )
+244    ]
+245    if not os.path.exists(reference_json):
+246        reference_json = ''
+247
+248    if reference_json:
+249        ref_json = read_reference_json(reference_json)
+250        used_features = []
+251        if 'features' in ref_json['desc']:
+252            for feature in ref_json['desc']['features']:
+253                used_features.append(feature)
+254
+255        if 'fields' in ref_json['desc']:
+256            for field in ref_json['desc']['fields']:
+257                used_features.append(field)
+258    else:
+259        used_features = feature_ranks.keys()
+260
+261    feature_ranks['Feature'] = feature_ranks['Feature'].astype(str)
+262    feature_ranks['Value'] = feature_ranks['Value'].astype(float)
+263    feature_ranks = feature_ranks.groupby(
+264        by=['Feature'],
+265    ).median().reset_index()
+266    feature_ranks = feature_ranks.sort_values(by=['Value'], ascending=False)
+267
+268    subset_ranges = [10, 25, 50, 100, feature_ranks.shape[0]]
+269    sns.set_style('whitegrid')
+270
+271    for subset_range in subset_ranges:
+272        feature_ranks_reduced = feature_ranks.copy().iloc[:subset_range]
+273        plt.figure(figsize=(18, 12))
+274        fig, ax = plt.subplots()
+275
+276        if (
+277            feature_ranks_reduced.shape[0] > 45
+278            and feature_ranks_reduced.shape[0] <= 100
+279        ):
+280            ax.yaxis.set_tick_params(labelsize=8)
+281        elif feature_ranks_reduced.shape[0] > 100:
+282            ax.yaxis.set_tick_params(labelsize=2)
+283        else:
+284            ax.yaxis.set_tick_params(labelsize=25)
+285
+286        # Visualize the barplot
+287        plt.title(f'Ranking w.r.t "{label}"\n')
+288        sns.barplot(
+289            x='Value',
+290            y='Feature',
+291            errwidth=0.7,
+292            data=feature_ranks_reduced,
+293            palette='coolwarm_r',
+294        )
+295
+296        # Modify the ticks if needed
+297        for item in ax.get_yticklabels():
+298            for prod_feature in used_features:
+299                if item.get_text() in prod_feature:
+300                    item.set_fontweight('bold')
+301                    item.set_color('red')
+302                    break
+303
+304        plt.xlabel(f'Feature importance (based on heuristic {heuristic})')
+305        plt.ylabel('')
+306        plt.tight_layout()
+307        plt.savefig(
+308            f'{output_folder}/barplot_top_{subset_range}.{image_format}', dpi=300,
+309        )
+310        plt.clf()
+311        plt.cla()
+312
+313        logging.info(
+314            f'Stored barplot to: {output_folder}/barplot_top_{subset_range}_.{image_format}',
+315        )
+316
+317
+318def visualize_all(
+319    triplets: pd.DataFrame,
+320    output_folder: str,
+321    label: str = '',
+322    reference_json: str = '',
+323    image_format: str = 'png',
+324    heuristic: str = 'MI',
+325) -> None:
+326    """A method for visualization of the obtained feature interaction maps."""
+327
+328    if not os.path.exists(output_folder):
+329        os.makedirs(output_folder)
+330
+331    # Visualize feature clusters
+332    visualize_hierarchical_clusters(triplets, output_folder, image_format)
+333
+334    # Visualize heatmap
+335    visualize_heatmap(triplets, output_folder, image_format)
+336
+337    # visualize barplot
+338    visualize_barplots(
+339        triplets, output_folder, reference_json, image_format, label, heuristic,
+340    )
+
+ + +
+
+ +
+ + def + visualize_hierarchical_clusters( triplet_dataframe: pandas.core.frame.DataFrame, output_folder: str, image_format: str = 'png', max_num_clusters: int = 100) -> None: + + + +
+ +
 21def visualize_hierarchical_clusters(
+ 22    triplet_dataframe: pd.DataFrame,
+ 23    output_folder: str,
+ 24    image_format: str = 'png',
+ 25    max_num_clusters: int = 100,
+ 26) -> None:
+ 27    """A method for visualization of hierarchical clusters w.r.t. different linkage functions"""
+ 28
+ 29    # Prepare the canvas
+ 30    plt.rcParams['figure.figsize'] = (10, 5)
+ 31    unique_features = triplet_dataframe.FeatureA.unique()
+ 32
+ 33    if len(unique_features) > 1000:
+ 34        logging.info('Trying to visualize too many features, exiting ..')
+ 35        exit()
+ 36
+ 37    dmat = np.zeros((len(unique_features), len(unique_features)))
+ 38    logging.info('Preparing the data for clustering ..')
+ 39
+ 40    if triplet_dataframe.shape[0] > 10**5:
+ 41        logging.info(
+ 42            'Trying to visualize more than 10 ** 5 triplets, exiting ..',
+ 43        )
+ 44        exit()
+ 45
+ 46    pivot_table = pd.pivot_table(
+ 47        triplet_dataframe,
+ 48        values='Score',
+ 49        index='FeatureA',
+ 50        columns='FeatureB',
+ 51        aggfunc=np.mean,
+ 52    )
+ 53
+ 54    # We need distances
+ 55    pivot_table.fillna(0, inplace=True)
+ 56    dmat = 1 - pivot_table.values
+ 57
+ 58    # Visualize different dendrograms
+ 59    logging.info('Clustering ..')
+ 60
+ 61    for linkage_heuristic in [
+ 62        # 'single', 'complete', 'average', 'weighted', 'centroid'
+ 63        'complete',
+ 64    ]:
+ 65        # Compute the linkage structure
+ 66        Z = hierarchy.linkage(dmat, linkage_heuristic)
+ 67
+ 68        # Visualize
+ 69        hierarchy.dendrogram(
+ 70            Z, above_threshold_color='y', orientation='top', labels=unique_features,
+ 71        )
+ 72        # Store
+ 73        plt.title(f'Linkage function: {linkage_heuristic}')
+ 74        plt.tight_layout()
+ 75        out_path = f'{output_folder}/dendrogram_{linkage_heuristic}.{image_format}'
+ 76        plt.savefig(out_path, dpi=300)
+ 77
+ 78        # Clean for subsequent plots
+ 79        plt.clf()
+ 80        plt.cla()
+ 81        logging.info(
+ 82            f'Visualized hierarchical clustering with linkage {linkage_heuristic} to {out_path}',
+ 83        )
+ 84
+ 85        # Step 1: Identify relevant distance threshold bounds
+ 86        range_min, range_max = np.min(
+ 87            pivot_table.values,
+ 88        ), np.max(pivot_table.values)
+ 89        spectrum = np.arange(
+ 90            range_min, range_max,
+ 91            (range_max - range_min) / 1000,
+ 92        )
+ 93        max_silhouette = 0
+ 94        top_clustering = []
+ 95        full_silhouette_space = []
+ 96
+ 97        # Step 2: Compute Silhouette for each threshold and store the results
+ 98        for possible_threshold in spectrum:
+ 99            cluster_assignments = hierarchy.fcluster(Z, possible_threshold)
+100            num_clusters = len(np.unique(cluster_assignments))
+101            if num_clusters > 2 and num_clusters < max_num_clusters:
+102                try:
+103                    sil_score = silhouette_score(
+104                        pivot_table, cluster_assignments,
+105                    )
+106
+107                except Exception:
+108                    continue
+109
+110                full_silhouette_space.append(
+111                    [
+112                        sil_score, possible_threshold, len(
+113                            np.unique(cluster_assignments),
+114                        ),
+115                    ],
+116                )
+117                if sil_score >= max_silhouette:
+118                    top_clustering = cluster_assignments
+119                    max_silhouette = sil_score
+120
+121        # Step 3: We are interested in the best clustering w.r.t. Silhouette
+122        dfx = pd.DataFrame(full_silhouette_space)
+123        if len(dfx) == 0:
+124            logging.info('Silhouette space empty, exiting')
+125            exit()
+126
+127        dfx.columns = ['Silhouette', 'threshold', 'numClusters']
+128        sns.lineplot(x=dfx.numClusters, y=dfx.Silhouette, color='black')
+129        plt.tight_layout()
+130        out_path = f'{output_folder}/SilhouetteProfile.{image_format}'
+131        plt.savefig(out_path, dpi=300)
+132        plt.clf()
+133        plt.cla()
+134        logging.info('Stored the Silhouette profile.')
+135
+136        final_feature_cluster_df = pd.DataFrame(
+137            list(zip(top_clustering, pivot_table.index)),
+138        )
+139        final_feature_cluster_df.columns = ['ClusterID', 'Feature']
+140        final_feature_cluster_df.to_csv(
+141            f'{output_folder}/TopClustering.tsv', sep='\t',
+142        )
+143
+144        # Get 2D embeddings of features and visualize them
+145        try:
+146            projected_data = TSNE().fit_transform(pivot_table.values)
+147            projected_data = pd.DataFrame(projected_data)
+148            projected_data.columns = ['Dim1', 'Dim2']
+149            projected_data['ClusterID'] = top_clustering
+150            projected_data['ClusterID'] = projected_data['ClusterID'].astype(
+151                str,
+152            )
+153            sns.scatterplot(
+154                x=projected_data.Dim1,
+155                y=projected_data.Dim2,
+156                hue=projected_data.ClusterID,
+157                palette='Set2',
+158            )
+159            plt.savefig(
+160                f'{output_folder}/clustersEmbeddingVisualization.pdf', dpi=300,
+161            )
+162            plt.clf()
+163            plt.cla()
+164        except:
+165            pass
+166
+167        # Step 4: We are interested in the best clustering w.r.t. Silhouette
+168        # Not here yet
+169
+170    plt.rcParams['figure.figsize'] = (50, 30)
+
+ + +

A method for visualization of hierarchical clusters w.r.t. different linkage functions

+
+ + +
+
+ +
+ + def + visualize_heatmap( triplets: pandas.core.frame.DataFrame, output_folder: str, image_format: str) -> None: + + + +
+ +
def visualize_heatmap(
    triplets: pd.DataFrame, output_folder: str, image_format: str,
) -> None:
    """Render the pairwise feature-score heatmap and store it to disk.

    The triplets are pivoted into a FeatureA x FeatureB table holding the
    mean ``Score`` of every pair. The upper triangle (diagonal included) is
    masked out before the table is drawn with seaborn.

    Args:
        triplets: Frame with ``FeatureA``, ``FeatureB`` and ``Score`` columns.
        output_folder: Folder that receives ``heatmap.<image_format>``.
        image_format: File extension handed to matplotlib (e.g. ``png``).

    Returns:
        None. Nothing is written when the pivot table has more than 500 rows.
    """
    # Compute the interaction pivot table
    sns.set(font_scale=2)
    pivot_table = pd.pivot_table(
        # The string alias keeps the groupby mean without the FutureWarning
        # that newer pandas versions emit for ``aggfunc=np.mean``.
        triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc='mean',
    )

    # Hide the upper triangle (including the diagonal) of the table.
    mask = np.zeros_like(pivot_table.values, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Shrink fonts and cell annotations once the table gets large.
    fsize_heatmap = 20
    if pivot_table.shape[0] > 100:
        sns.set(font_scale=1)
        fsize_heatmap = 3

    logging.info('Visualizing the heatmap ..')

    if pivot_table.shape[0] > 500:
        logging.info(
            'Skipping heatmap visualization due to too many elements ..',
        )
        return

    # Visualize the table
    fig = plt.figure(figsize=(50, 50))
    try:
        plt.rcParams.update({'font.size': 1})
        sns.heatmap(
            pivot_table,
            annot=True,
            mask=mask,
            annot_kws={'size': fsize_heatmap},
            square=False,
            cmap='coolwarm',
            linecolor='black',
            linewidths=0.05,
        )
        plt.xlabel('')
        plt.ylabel('')
        plt.tight_layout()
        plt.savefig(f'{output_folder}/heatmap.{image_format}', dpi=500)
    finally:
        # clf()/cla() only clear a figure; close() actually releases it so
        # repeated calls do not accumulate open 50x50 inch figures.
        plt.close(fig)
    logging.info(f'Stored heatmap to: {output_folder}/heatmap.{image_format}')
+
+ + + + +
+
+ +
+ + def + visualize_barplots( triplets: pandas.core.frame.DataFrame, output_folder: str, reference_json: str, image_format: str, label: str, heuristic: str) -> None: + + + +
+ +
def visualize_barplots(
    triplets: pd.DataFrame,
    output_folder: str,
    reference_json: str,
    image_format: str,
    label: str,
    heuristic: str,
) -> None:
    """Plot the features ranked by their score against ``label``.

    Every triplet whose ``FeatureA`` or ``FeatureB`` contains ``label``
    contributes the score of the *other* feature. Scores are aggregated per
    feature with the median, sorted in descending order, and one barplot is
    stored for each of the top-10/25/50/100/all subsets.

    Args:
        triplets: Frame with ``FeatureA``, ``FeatureB`` and ``Score`` columns.
        output_folder: Folder that receives ``barplot_top_<n>.<image_format>``.
        reference_json: Path to a reference model description; ignored when
            the path does not exist.
        image_format: File extension handed to matplotlib (e.g. ``png``).
        label: Name (substring) of the target feature.
        heuristic: Name of the ranking heuristic, shown in the x-axis label.

    Returns:
        None. Nothing is plotted when no feature is paired with ``label``.
    """
    # Extract only the interactions related to the target attribute
    feature_ranks_rows = []
    for _, row in triplets.iterrows():
        feature_A = row['FeatureA']
        feature_B = row['FeatureB']
        if label in feature_A:
            feature_ranks_rows.append([feature_B, row.Score])
        elif label in feature_B:
            feature_ranks_rows.append([feature_A, row.Score])

    # Naming the columns in the constructor keeps an empty selection from
    # raising a length-mismatch error on column assignment.
    feature_ranks: pd.DataFrame = pd.DataFrame(
        feature_ranks_rows, columns=['Feature', 'Value'],
    )
    feature_ranks['Feature'] = feature_ranks['Feature'].astype(str)
    # Drop label-vs-label pairs. The match is literal (regex=False) so that it
    # agrees with the plain substring tests used for the row selection above.
    feature_ranks = feature_ranks[
        ~feature_ranks['Feature'].str.contains(label, regex=False)
    ]
    if feature_ranks.empty:
        logging.info(
            f'No interactions with label "{label}" found, skipping barplots ..',
        )
        return

    # Align with an existing model
    if not os.path.exists(reference_json):
        reference_json = ''

    if reference_json:
        ref_desc = read_reference_json(reference_json)['desc']
        used_features = []
        for key in ('features', 'fields'):
            if key in ref_desc:
                used_features.extend(ref_desc[key])
    else:
        # NOTE(review): this yields the column names ('Feature', 'Value'),
        # not the ranked features themselves - confirm that is intended.
        used_features = feature_ranks.keys()

    feature_ranks['Value'] = feature_ranks['Value'].astype(float)
    feature_ranks = feature_ranks.groupby(
        by=['Feature'],
    ).median().reset_index()
    feature_ranks = feature_ranks.sort_values(by=['Value'], ascending=False)

    subset_ranges = [10, 25, 50, 100, feature_ranks.shape[0]]
    sns.set(font_scale=8)
    sns.set_style('whitegrid')

    for subset_range in subset_ranges:
        feature_ranks_reduced = feature_ranks.iloc[:subset_range].copy()
        num_shown = feature_ranks_reduced.shape[0]
        out_path = f'{output_folder}/barplot_top_{subset_range}.{image_format}'

        fig, ax = plt.subplots()
        try:
            # The more bars are shown, the smaller the tick labels.
            if num_shown > 100:
                ax.yaxis.set_tick_params(labelsize=2)
            elif num_shown > 45:
                ax.yaxis.set_tick_params(labelsize=8)
            else:
                ax.yaxis.set_tick_params(labelsize=25)

            # Visualize the barplot
            plt.title(f'Ranking w.r.t "{label}"\n')
            sns.barplot(
                x='Value',
                y='Feature',
                errwidth=0.7,
                data=feature_ranks_reduced,
                palette='coolwarm_r',
            )

            # Highlight the ticks that match a reference feature
            for item in ax.get_yticklabels():
                for prod_feature in used_features:
                    if item.get_text() in prod_feature:
                        item.set_fontweight('bold')
                        item.set_color('red')
                        break

            plt.xlabel(f'Feature importance (based on heuristic {heuristic})')
            plt.ylabel('')
            plt.tight_layout()
            plt.savefig(out_path, dpi=300)
        finally:
            # Release the figure; clf()/cla() would leave it open and leak
            # one figure per subset.
            plt.close(fig)

        logging.info(f'Stored barplot to: {out_path}')
+
+ + + + +
+
+ +
+ + def + visualize_all( triplets: pandas.core.frame.DataFrame, output_folder: str, label: str = '', reference_json: str = '', image_format: str = 'png', heuristic: str = 'MI') -> None: + + + +
+ +
def visualize_all(
    triplets: pd.DataFrame,
    output_folder: str,
    label: str = '',
    reference_json: str = '',
    image_format: str = 'png',
    heuristic: str = 'MI',
) -> None:
    """A method for visualization of the obtained feature interaction maps.

    Creates ``output_folder`` when needed and then runs the hierarchical
    cluster, heatmap and barplot visualizations on ``triplets`` in that order.

    Args:
        triplets: Frame of pairwise feature scores passed to every plot.
        output_folder: Destination folder for all generated files.
        label: Target feature name forwarded to the barplot visualization.
        reference_json: Optional reference model path forwarded to the barplots.
        image_format: File extension used for the stored images.
        heuristic: Name of the ranking heuristic, forwarded to the barplots.
    """

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Visualize feature clusters
    visualize_hierarchical_clusters(triplets, output_folder, image_format)

    # Visualize heatmap
    visualize_heatmap(triplets, output_folder, image_format)

    # Visualize barplot
    visualize_barplots(
        triplets, output_folder, reference_json, image_format, label, heuristic,
    )
+
+ + +

A method for visualization of the obtained feature interaction maps.

+
+ + +
+
+ + diff --git a/docs/search.js b/docs/search.js new file mode 100644 index 0000000..66d1dc9 --- /dev/null +++ b/docs/search.js @@ -0,0 +1,46 @@ +window.pdocSearch = (function(){ +/** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return 
e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var 
r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var 
u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var 
i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var 
o=0,r=n.toArray();o\u2591\u2588\u2588\u2588\u2588\u2588\u2557\u2591\u2588\u2588\u2557\u2591\u2591\u2591\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2591\u2591\u2588\u2588\u2588\u2588\u2588\u2557\u2591\u2588\u2588\u2588\u2557\u2591\u2591\u2588\u2588\u2557\u2588\u2588\u2557\u2591\u2591\u2588\u2588\u2557\n\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2551\u2591\u2591\u2591\u2588\u2588\u2551\u255a\u2550\u2550\u2588\u2588\u2554\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2557\u2591\u2588\u2588\u2551\u2588\u2588\u2551\u2591\u2588\u2588\u2554\u255d\n\u2588\u2588\u2551\u2591\u2591\u2588\u2588\u2551\u2588\u2588\u2551\u2591\u2591\u2591\u2588\u2588\u2551\u2591\u2591\u2591\u2588\u2588\u2551\u2591\u2591\u2591\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2554\u2588\u2588\u2557\u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2550\u255d\u2591\n\u2588\u2588\u2551\u2591\u2591\u2588\u2588\u2551\u2588\u2588\u2551\u2591\u2591\u2591\u2588\u2588\u2551\u2591\u2591\u2591\u2588\u2588\u2551\u2591\u2591\u2591\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2551\u2588\u2588\u2551\u255a\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2588\u2588\u2557\u2591\n\u255a\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u255a\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2591\u2591\u2591\u2588\u2588\u2551\u2591\u2591\u2591\u2588\u2588\u2551\u2591\u2591\u2588\u2588\u2551\u2588\u2588\u2551\u2591\u2591\u2588\u2588\u2551\u2588\u2588\u2551\u2591\u255a\u2588\u2588\u2588\u2551\u2588\u2588\u2551\u2591\u255a\u2588\u2588\u2557\n\u2591\u255a\u2550\u2550\u2550\u2550\u255d\u2591\u2591\u255a\u2550\u2550\u2550\u2550\u2550\u255d\u2591\u2591\u2591\u2591\u255a\u2550\u255d\u2591\u2591\u2591\u255a\u2550\u255d\u2591\u2591\u255a\u2550\u2
55d\u255a\u2550\u255d\u2591\u2591\u255a\u2550\u255d\u255a\u2550\u255d\u2591\u2591\u255a\u2550\u2550\u255d\u255a\u2550\u255d\u2591\u2591\u255a\u2550\u255d\n\n\n

Welcome to OutRank's documentation!

\n\n

All functions/methods can be searched for (search bar on the left).

\n"}, "outrank.algorithms": {"fullname": "outrank.algorithms", "modulename": "outrank.algorithms", "kind": "module", "doc": "

\n"}, "outrank.algorithms.feature_ranking": {"fullname": "outrank.algorithms.feature_ranking", "modulename": "outrank.algorithms.feature_ranking", "kind": "module", "doc": "

\n"}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "kind": "module", "doc": "

\n"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "numba_unique", "kind": "function", "doc": "

Identify unique elements in an array, fast

\n", "signature": "(a):", "funcdef": "def"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_conditional_entropy", "kind": "function", "doc": "

\n", "signature": "(Y_classes, class_values, class_var_shape, initial_prob):", "funcdef": "def"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_entropies", "kind": "function", "doc": "

Core entropy computation function

\n", "signature": "(X, Y, all_events, f_values, f_value_counts, cardinality_correction):", "funcdef": "def"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "mutual_info_estimator_numba", "kind": "function", "doc": "

Core estimator logic. Compute unique elements, subset if required

\n", "signature": "(Y, X, approximation_factor=1, cardinality_correction=False):", "funcdef": "def"}, "outrank.algorithms.importance_estimator": {"fullname": "outrank.algorithms.importance_estimator", "modulename": "outrank.algorithms.importance_estimator", "kind": "module", "doc": "

\n"}, "outrank.algorithms.importance_estimator.sklearn_MI": {"fullname": "outrank.algorithms.importance_estimator.sklearn_MI", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_MI", "kind": "function", "doc": "

\n", "signature": "(vector_first: Any, vector_second: Any) -> float:", "funcdef": "def"}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"fullname": "outrank.algorithms.importance_estimator.sklearn_surrogate", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_surrogate", "kind": "function", "doc": "

\n", "signature": "(vector_first: Any, vector_second: Any, surrogate_model: str) -> float:", "funcdef": "def"}, "outrank.algorithms.importance_estimator.numba_mi": {"fullname": "outrank.algorithms.importance_estimator.numba_mi", "modulename": "outrank.algorithms.importance_estimator", "qualname": "numba_mi", "kind": "function", "doc": "

\n", "signature": "(vector_first, vector_second, heuristic):", "funcdef": "def"}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"fullname": "outrank.algorithms.importance_estimator.sklearn_mi_adj", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_mi_adj", "kind": "function", "doc": "

\n", "signature": "(vector_first, vector_second):", "funcdef": "def"}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_pairwise", "kind": "function", "doc": "

A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel.

\n", "signature": "(combination, args, tmp_df):", "funcdef": "def"}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"fullname": "outrank.algorithms.importance_estimator.rank_features_3MR", "modulename": "outrank.algorithms.importance_estimator", "qualname": "rank_features_3MR", "kind": "function", "doc": "

\n", "signature": "(\trelevance_dict: dict[str, float],\tredundancy_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\trelational_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\tstrategy: str = 'median',\talpha: float = 1,\tbeta: float = 1) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_nonmyopic", "kind": "function", "doc": "

\n", "signature": "(args: Any, tmp_df: pandas.core.frame.DataFrame):", "funcdef": "def"}, "outrank.algorithms.sketches": {"fullname": "outrank.algorithms.sketches", "modulename": "outrank.algorithms.sketches", "kind": "module", "doc": "

\n"}, "outrank.algorithms.sketches.counting_ultiloglog": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "kind": "module", "doc": "

This module implements a probabilistic data structure which is able to calculate the cardinality of large multisets in a single pass using little auxiliary memory

\n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache", "kind": "class", "doc": "

\n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.__init__", "kind": "function", "doc": "

\n", "signature": "(error_rate=0.005)"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.p", "kind": "variable", "doc": "

\n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.m", "kind": "variable", "doc": "

\n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_set", "kind": "variable", "doc": "

\n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_size", "kind": "variable", "doc": "

\n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.width", "kind": "variable", "doc": "

\n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.hll_flag", "kind": "variable", "doc": "

\n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.add", "kind": "function", "doc": "

\n", "signature": "(self, value):", "funcdef": "def"}, "outrank.algorithms.synthetic_data_generators": {"fullname": "outrank.algorithms.synthetic_data_generators", "modulename": "outrank.algorithms.synthetic_data_generators", "kind": "module", "doc": "

\n"}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "kind": "module", "doc": "

\n"}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "qualname": "generate_random_matrix", "kind": "function", "doc": "

\n", "signature": "(num_features=100, size=20000):", "funcdef": "def"}, "outrank.core_ranking": {"fullname": "outrank.core_ranking", "modulename": "outrank.core_ranking", "kind": "module", "doc": "

\n"}, "outrank.core_ranking.logger": {"fullname": "outrank.core_ranking.logger", "modulename": "outrank.core_ranking", "qualname": "logger", "kind": "variable", "doc": "

\n", "default_value": "<Logger syn-logger (DEBUG)>"}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"fullname": "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_CARDINALITY_STORAGE", "kind": "variable", "doc": "

\n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"fullname": "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_RARE_VALUE_STORAGE", "kind": "variable", "doc": "

\n", "annotation": ": dict[str, typing.Any]", "default_value": "Counter()"}, "outrank.core_ranking.IGNORED_VALUES": {"fullname": "outrank.core_ranking.IGNORED_VALUES", "modulename": "outrank.core_ranking", "qualname": "IGNORED_VALUES", "kind": "variable", "doc": "

\n", "default_value": "set()"}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"fullname": "outrank.core_ranking.HYPERLL_ERROR_BOUND", "modulename": "outrank.core_ranking", "qualname": "HYPERLL_ERROR_BOUND", "kind": "variable", "doc": "

\n", "default_value": "0.02"}, "outrank.core_ranking.encode_int_column": {"fullname": "outrank.core_ranking.encode_int_column", "modulename": "outrank.core_ranking", "qualname": "encode_int_column", "kind": "function", "doc": "

Encode column values as categoric (at a batch level!)

\n", "signature": "(input_tuple: tuple[str, typing.Any]) -> tuple[typing.Any, list[int]]:", "funcdef": "def"}, "outrank.core_ranking.mixed_rank_graph": {"fullname": "outrank.core_ranking.mixed_rank_graph", "modulename": "outrank.core_ranking", "qualname": "mixed_rank_graph", "kind": "function", "doc": "

Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any,\tcpu_pool: Any,\tpbar: Any) -> outrank.core_utils.BatchRankingSummary:", "funcdef": "def"}, "outrank.core_ranking.enrich_with_transformations": {"fullname": "outrank.core_ranking.enrich_with_transformations", "modulename": "outrank.core_ranking", "qualname": "enrich_with_transformations", "kind": "function", "doc": "

Construct a collection of new features based on pre-defined transformations/rules

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnum_col_types: set[str],\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.compute_combined_features": {"fullname": "outrank.core_ranking.compute_combined_features", "modulename": "outrank.core_ranking", "qualname": "compute_combined_features", "kind": "function", "doc": "

Compute higher order features via xxhash-based trick.

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any,\tis_3mr: bool = False) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.compute_expanded_multivalue_features": {"fullname": "outrank.core_ranking.compute_expanded_multivalue_features", "modulename": "outrank.core_ranking", "qualname": "compute_expanded_multivalue_features", "kind": "function", "doc": "

Compute one-hot encoded feature space based on each designated multivalue feature. E.g., feature with value \"a,b,c\" becomes three features, values of which are presence of a given value in a mutlivalue feature of choice.

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.compute_subfeatures": {"fullname": "outrank.core_ranking.compute_subfeatures", "modulename": "outrank.core_ranking", "qualname": "compute_subfeatures", "kind": "function", "doc": "

Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.\n->: One sided construction - every value from left side is fine, separate ones from the right side feature will be considered.\n<->: Two sided construction - two-sided values present. This means that each value from a is combined with each from b, forming |A|*|B| new features (one-hot encoded)

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.include_noisy_features": {"fullname": "outrank.core_ranking.include_noisy_features", "modulename": "outrank.core_ranking", "qualname": "include_noisy_features", "kind": "function", "doc": "

Add randomized features that serve as a sanity check

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.compute_coverage": {"fullname": "outrank.core_ranking.compute_coverage", "modulename": "outrank.core_ranking", "qualname": "compute_coverage", "kind": "function", "doc": "

Compute coverage of features, incrementally

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, "outrank.core_ranking.compute_feature_memory_consumption": {"fullname": "outrank.core_ranking.compute_feature_memory_consumption", "modulename": "outrank.core_ranking", "qualname": "compute_feature_memory_consumption", "kind": "function", "doc": "

An approximation of how much feature take up

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, "outrank.core_ranking.compute_value_counts": {"fullname": "outrank.core_ranking.compute_value_counts", "modulename": "outrank.core_ranking", "qualname": "compute_value_counts", "kind": "function", "doc": "

Update the count structure

\n", "signature": "(input_dataframe: pandas.core.frame.DataFrame, args: Any):", "funcdef": "def"}, "outrank.core_ranking.compute_cardinalities": {"fullname": "outrank.core_ranking.compute_cardinalities", "modulename": "outrank.core_ranking", "qualname": "compute_cardinalities", "kind": "function", "doc": "

Compute cardinalities of features, incrementally

\n", "signature": "(input_dataframe: pandas.core.frame.DataFrame, pbar: Any) -> None:", "funcdef": "def"}, "outrank.core_ranking.compute_bounds_increment": {"fullname": "outrank.core_ranking.compute_bounds_increment", "modulename": "outrank.core_ranking", "qualname": "compute_bounds_increment", "kind": "function", "doc": "

\n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnumeric_column_types: set[str]) -> dict[str, typing.Any]:", "funcdef": "def"}, "outrank.core_ranking.compute_batch_ranking": {"fullname": "outrank.core_ranking.compute_batch_ranking", "modulename": "outrank.core_ranking", "qualname": "compute_batch_ranking", "kind": "function", "doc": "

Enrich the feature space and compute the batch importances

\n", "signature": "(\tline_tmp_storage: list[list[typing.Any]],\tnumeric_column_types: set[str],\targs: Any,\tcpu_pool: Any,\tcolumn_descriptions: list[str],\tlogger: Any,\tpbar: Any) -> tuple[outrank.core_utils.BatchRankingSummary, dict[str, typing.Any], dict[str, set[str]], dict[str, set[str]]]:", "funcdef": "def"}, "outrank.core_ranking.get_num_of_instances": {"fullname": "outrank.core_ranking.get_num_of_instances", "modulename": "outrank.core_ranking", "qualname": "get_num_of_instances", "kind": "function", "doc": "

Count the number of lines in a file, fast - useful for progress logging

\n", "signature": "(fname: str) -> int:", "funcdef": "def"}, "outrank.core_ranking.get_grouped_df": {"fullname": "outrank.core_ranking.get_grouped_df", "modulename": "outrank.core_ranking", "qualname": "get_grouped_df", "kind": "function", "doc": "

A helper method that enables median-based aggregation after processing

\n", "signature": "(\timportances_df_list: list[tuple[str, str, float]]) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.checkpoint_importances_df": {"fullname": "outrank.core_ranking.checkpoint_importances_df", "modulename": "outrank.core_ranking", "qualname": "checkpoint_importances_df", "kind": "function", "doc": "

A helper which stores intermediary state - useful for longer runs

\n", "signature": "(importances_batch: list[tuple[str, str, float]]) -> None:", "funcdef": "def"}, "outrank.core_ranking.estimate_importances_minibatches": {"fullname": "outrank.core_ranking.estimate_importances_minibatches", "modulename": "outrank.core_ranking", "qualname": "estimate_importances_minibatches", "kind": "function", "doc": "

Interaction score estimator - suitable for example for csv-like input data types.\nThis type of data is normally a single large csv, meaning that minibatch processing needs to\nhappen during incremental handling of the file (that\"s not the case for pre-separated ob data)

\n", "signature": "(\tinput_file: str,\tcolumn_descriptions: list,\tfw_col_mapping: dict[str, str],\tnumeric_column_types: set,\tbatch_size: int = 100000,\targs: Any = None,\tdata_encoding: str = 'utf-8',\tcpu_pool: Any = None,\tdelimiter: str = '\\t',\tfeature_construction_mode: bool = False,\tlogger: Any = None) -> tuple[list[dict[str, typing.Any]], typing.Any, dict[typing.Any, typing.Any], list[dict[str, typing.Any]], list[dict[str, set[str]]], collections.defaultdict[str, list[set[str]]], dict[str, typing.Any]]:", "funcdef": "def"}, "outrank.core_selftest": {"fullname": "outrank.core_selftest", "modulename": "outrank.core_selftest", "kind": "module", "doc": "

\n"}, "outrank.core_utils": {"fullname": "outrank.core_utils", "modulename": "outrank.core_utils", "kind": "module", "doc": "

\n"}, "outrank.core_utils.pro_tips": {"fullname": "outrank.core_utils.pro_tips", "modulename": "outrank.core_utils", "qualname": "pro_tips", "kind": "variable", "doc": "

\n", "default_value": "['OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"', 'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).', 'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!', 'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).', 'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)', 'Visualization part also includes clustering - this might be very insightful!', 'By default OutRank includes feature cardinality and coverage in feature names (card; cov)', 'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.', 'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).', 'Give it as many threads as physically possible (--num_threads).', 'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.', 'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).', 'Your target can be any feature! 
(explaining one feature with others)', 'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).', 'Each feature is named as featureName(cardinality, coverage in percents) in the final files.', 'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.']"}, "outrank.core_utils.internal_hash": {"fullname": "outrank.core_utils.internal_hash", "modulename": "outrank.core_utils", "qualname": "internal_hash", "kind": "function", "doc": "

A generic internal hash used throughout ranking procedure - let's hardcode seed here for sure

\n", "signature": "(input_obj: str) -> str:", "funcdef": "def"}, "outrank.core_utils.DatasetInformationStorage": {"fullname": "outrank.core_utils.DatasetInformationStorage", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage", "kind": "class", "doc": "

A generic class for holding properties of a given type of dataset

\n"}, "outrank.core_utils.DatasetInformationStorage.__init__": {"fullname": "outrank.core_utils.DatasetInformationStorage.__init__", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.__init__", "kind": "function", "doc": "

\n", "signature": "(\tdata_path: str,\tcolumn_names: list[str],\tcolumn_types: set[str],\tcol_delimiter: str | None,\tencoding: str,\tfw_map: dict[str, str] | None)"}, "outrank.core_utils.DatasetInformationStorage.data_path": {"fullname": "outrank.core_utils.DatasetInformationStorage.data_path", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.data_path", "kind": "variable", "doc": "

\n", "annotation": ": str"}, "outrank.core_utils.DatasetInformationStorage.column_names": {"fullname": "outrank.core_utils.DatasetInformationStorage.column_names", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_names", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, "outrank.core_utils.DatasetInformationStorage.column_types": {"fullname": "outrank.core_utils.DatasetInformationStorage.column_types", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_types", "kind": "variable", "doc": "

\n", "annotation": ": set[str]"}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"fullname": "outrank.core_utils.DatasetInformationStorage.col_delimiter", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.col_delimiter", "kind": "variable", "doc": "

\n", "annotation": ": str | None"}, "outrank.core_utils.DatasetInformationStorage.encoding": {"fullname": "outrank.core_utils.DatasetInformationStorage.encoding", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.encoding", "kind": "variable", "doc": "

\n", "annotation": ": str"}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"fullname": "outrank.core_utils.DatasetInformationStorage.fw_map", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.fw_map", "kind": "variable", "doc": "

\n", "annotation": ": dict[str, str] | None"}, "outrank.core_utils.NumericFeatureSummary": {"fullname": "outrank.core_utils.NumericFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary", "kind": "class", "doc": "

A generic class storing numeric feature statistics

\n"}, "outrank.core_utils.NumericFeatureSummary.__init__": {"fullname": "outrank.core_utils.NumericFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.__init__", "kind": "function", "doc": "

\n", "signature": "(\tfeature_name: str,\tminimum: float,\tmaximum: float,\tmedian: float,\tnum_unique: int)"}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"fullname": "outrank.core_utils.NumericFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.feature_name", "kind": "variable", "doc": "

\n", "annotation": ": str"}, "outrank.core_utils.NumericFeatureSummary.minimum": {"fullname": "outrank.core_utils.NumericFeatureSummary.minimum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.minimum", "kind": "variable", "doc": "

\n", "annotation": ": float"}, "outrank.core_utils.NumericFeatureSummary.maximum": {"fullname": "outrank.core_utils.NumericFeatureSummary.maximum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.maximum", "kind": "variable", "doc": "

\n", "annotation": ": float"}, "outrank.core_utils.NumericFeatureSummary.median": {"fullname": "outrank.core_utils.NumericFeatureSummary.median", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.median", "kind": "variable", "doc": "

\n", "annotation": ": float"}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"fullname": "outrank.core_utils.NumericFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.num_unique", "kind": "variable", "doc": "

\n", "annotation": ": int"}, "outrank.core_utils.NominalFeatureSummary": {"fullname": "outrank.core_utils.NominalFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary", "kind": "class", "doc": "

A generic class storing numeric feature statistics

\n"}, "outrank.core_utils.NominalFeatureSummary.__init__": {"fullname": "outrank.core_utils.NominalFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.__init__", "kind": "function", "doc": "

\n", "signature": "(feature_name: str, num_unique: int)"}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"fullname": "outrank.core_utils.NominalFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.feature_name", "kind": "variable", "doc": "

\n", "annotation": ": str"}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"fullname": "outrank.core_utils.NominalFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.num_unique", "kind": "variable", "doc": "

\n", "annotation": ": int"}, "outrank.core_utils.BatchRankingSummary": {"fullname": "outrank.core_utils.BatchRankingSummary", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary", "kind": "class", "doc": "

A generic class representing batched ranking results

\n"}, "outrank.core_utils.BatchRankingSummary.__init__": {"fullname": "outrank.core_utils.BatchRankingSummary.__init__", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.__init__", "kind": "function", "doc": "

\n", "signature": "(\ttriplet_scores: list[tuple[str, str, float]],\tstep_times: dict[str, typing.Any])"}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"fullname": "outrank.core_utils.BatchRankingSummary.triplet_scores", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.triplet_scores", "kind": "variable", "doc": "

\n", "annotation": ": list[tuple[str, str, float]]"}, "outrank.core_utils.BatchRankingSummary.step_times": {"fullname": "outrank.core_utils.BatchRankingSummary.step_times", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.step_times", "kind": "variable", "doc": "

\n", "annotation": ": dict[str, typing.Any]"}, "outrank.core_utils.display_random_tip": {"fullname": "outrank.core_utils.display_random_tip", "modulename": "outrank.core_utils", "qualname": "display_random_tip", "kind": "function", "doc": "

\n", "signature": "() -> None:", "funcdef": "def"}, "outrank.core_utils.get_dataset_info": {"fullname": "outrank.core_utils.get_dataset_info", "modulename": "outrank.core_utils", "qualname": "get_dataset_info", "kind": "function", "doc": "

\n", "signature": "(args: Any):", "funcdef": "def"}, "outrank.core_utils.display_tool_name": {"fullname": "outrank.core_utils.display_tool_name", "modulename": "outrank.core_utils", "qualname": "display_tool_name", "kind": "function", "doc": "

\n", "signature": "() -> None:", "funcdef": "def"}, "outrank.core_utils.parse_ob_line": {"fullname": "outrank.core_utils.parse_ob_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_line", "kind": "function", "doc": "

Outbrain line parsing - generic TSVs

\n", "signature": "(line_string: str, delimiter: str = '\\t', args: Any = None) -> list[str]:", "funcdef": "def"}, "outrank.core_utils.parse_ob_line_vw": {"fullname": "outrank.core_utils.parse_ob_line_vw", "modulename": "outrank.core_utils", "qualname": "parse_ob_line_vw", "kind": "function", "doc": "

Parse a sparse vw line into a pandas df with pre-defined namespace

\n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping=None,\ttable_header=None,\tinclude_namespace_info=False) -> list[str | None]:", "funcdef": "def"}, "outrank.core_utils.parse_ob_csv_line": {"fullname": "outrank.core_utils.parse_ob_csv_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_csv_line", "kind": "function", "doc": "

Data can have commas within JSON field dumps

\n", "signature": "(line_string: str, delimiter: str = ',', args: Any = None) -> list[str]:", "funcdef": "def"}, "outrank.core_utils.generic_line_parser": {"fullname": "outrank.core_utils.generic_line_parser", "modulename": "outrank.core_utils", "qualname": "generic_line_parser", "kind": "function", "doc": "

A generic method aimed to parse data from different sources.

\n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping: Any = None,\ttable_header: Any = None) -> list[typing.Any]:", "funcdef": "def"}, "outrank.core_utils.read_reference_json": {"fullname": "outrank.core_utils.read_reference_json", "modulename": "outrank.core_utils", "qualname": "read_reference_json", "kind": "function", "doc": "

A helper method for reading a JSON

\n", "signature": "(json_path) -> dict[str, dict]:", "funcdef": "def"}, "outrank.core_utils.parse_namespace": {"fullname": "outrank.core_utils.parse_namespace", "modulename": "outrank.core_utils", "qualname": "parse_namespace", "kind": "function", "doc": "

Parse the feature namespace for type awareness

\n", "signature": "(namespace_path: str) -> tuple[set[str], dict[str, str]]:", "funcdef": "def"}, "outrank.core_utils.read_column_names": {"fullname": "outrank.core_utils.read_column_names", "modulename": "outrank.core_utils", "qualname": "read_column_names", "kind": "function", "doc": "

Read the col. header

\n", "signature": "(mapping_file: str) -> list[str]:", "funcdef": "def"}, "outrank.core_utils.parse_ob_vw_feature_information": {"fullname": "outrank.core_utils.parse_ob_vw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_vw_feature_information", "kind": "function", "doc": "

A generic parser of ob-based data

\n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.parse_ob_raw_feature_information": {"fullname": "outrank.core_utils.parse_ob_raw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_raw_feature_information", "kind": "function", "doc": "

A generic parser of ob-based data

\n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.parse_ob_feature_information": {"fullname": "outrank.core_utils.parse_ob_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_feature_information", "kind": "function", "doc": "

A generic parser of ob-based data

\n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.parse_csv_with_description_information": {"fullname": "outrank.core_utils.parse_csv_with_description_information", "modulename": "outrank.core_utils", "qualname": "parse_csv_with_description_information", "kind": "function", "doc": "

\n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.parse_csv_raw": {"fullname": "outrank.core_utils.parse_csv_raw", "modulename": "outrank.core_utils", "qualname": "parse_csv_raw", "kind": "function", "doc": "

\n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.extract_features_from_reference_JSON": {"fullname": "outrank.core_utils.extract_features_from_reference_JSON", "modulename": "outrank.core_utils", "qualname": "extract_features_from_reference_JSON", "kind": "function", "doc": "

Given a model's JSON, extract unique features

\n", "signature": "(json_path: str) -> set[typing.Any]:", "funcdef": "def"}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"fullname": "outrank.core_utils.summarize_feature_bounds_for_transformers", "modulename": "outrank.core_utils", "qualname": "summarize_feature_bounds_for_transformers", "kind": "function", "doc": "

summarization auxilliary method for generating JSON-based specs

\n", "signature": "(\tbounds_object_storage: Any,\tfeature_types: list[str],\ttask_name: str,\tlabel_name: str,\tgranularity: int = 15,\toutput_summary_table_only: bool = False):", "funcdef": "def"}, "outrank.core_utils.summarize_rare_counts": {"fullname": "outrank.core_utils.summarize_rare_counts", "modulename": "outrank.core_utils", "qualname": "summarize_rare_counts", "kind": "function", "doc": "

Write rare values

\n", "signature": "(\tterm_counter: Any,\targs: Any,\tcardinality_object: Any,\tobject_info: outrank.core_utils.DatasetInformationStorage) -> None:", "funcdef": "def"}, "outrank.feature_transformations": {"fullname": "outrank.feature_transformations", "modulename": "outrank.feature_transformations", "kind": "module", "doc": "

\n"}, "outrank.feature_transformations.feature_transformer_vault": {"fullname": "outrank.feature_transformations.feature_transformer_vault", "modulename": "outrank.feature_transformations.feature_transformer_vault", "kind": "module", "doc": "

\n"}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "kind": "module", "doc": "

\n"}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "MINIMAL_TRANSFORMERS", "kind": "variable", "doc": "

\n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)'}"}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "DEFAULT_TRANSFORMERS", "kind": "variable", "doc": "

\n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)'}"}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "kind": "module", "doc": "

\n"}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "FW_TRANSFORMERS", "kind": "variable", "doc": "

\n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)', '_tr_fw_sqrt_res_1_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*1,0), 0))', '_tr_fw_log_res_1_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*1,0), 0))', '_tr_fw_log_res_1_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*1,0), 0))', '_tr_fw_log_res_1_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*1,0), 0))', '_tr_fw_log_res_1_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*1,0), 0))', '_tr_fw_log_res_1_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*1,0), 0))', '_tr_fw_log_res_1_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*1,0), 0))', '_tr_fw_log_res_1_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*1,0), 0))', '_tr_fw_log_res_1_gt_96': 'np.where(X <96, X, 
np.where(X >96, np.round(np.log(X-96)*1,0), 0))', '_tr_fw_sqrt_res_10_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*10,0), 0))', '_tr_fw_log_res_10_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*10,0), 0))', '_tr_fw_log_res_10_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*10,0), 0))', '_tr_fw_log_res_10_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*10,0), 0))', '_tr_fw_log_res_10_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*10,0), 0))', '_tr_fw_log_res_10_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*10,0), 0))', '_tr_fw_log_res_10_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*10,0), 0))', '_tr_fw_log_res_10_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*10,0), 0))', '_tr_fw_log_res_10_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*10,0), 0))', '_tr_fw_sqrt_res_50_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*50,0), 0))', '_tr_fw_log_res_50_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*50,0), 0))', '_tr_fw_log_res_50_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*50,0), 0))', 
'_tr_fw_sqrt_res_50_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*50,0), 0))', '_tr_fw_log_res_50_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*50,0), 0))', '_tr_fw_log_res_50_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*50,0), 0))', '_tr_fw_log_res_50_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*50,0), 0))', '_tr_fw_log_res_50_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*50,0), 0))', '_tr_fw_log_res_50_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*50,0), 0))', '_tr_fw_log_res_50_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*50,0), 0))', '_tr_fw_sqrt_res_100_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*100,0), 0))', '_tr_fw_log_res_100_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*100,0), 0))', '_tr_fw_log_res_100_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*100,0), 0))', '_tr_fw_log_res_100_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*100,0), 0))', '_tr_fw_log_res_100_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_16': 'np.where(X < 16, X, 
np.where(X>16 ,np.round(np.sqrt(X-16)*100,0), 0))', '_tr_fw_log_res_100_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*100,0), 0))', '_tr_fw_log_res_100_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*100,0), 0))', '_tr_fw_log_res_100_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*100,0), 0))', '_tr_fw_log_res_100_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*100,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.32': 
'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*1,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.64': 
'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*10,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.96': 
'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*50,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*100,0), 0))'}"}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"fullname": 
"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "resolution_range", "kind": "variable", "doc": "

\n", "default_value": "[1, 10, 50, 100]"}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "greater_than_range", "kind": "variable", "doc": "

\n", "default_value": "[1, 2, 4, 8, 16, 32, 64, 96]"}, "outrank.feature_transformations.ranking_transformers": {"fullname": "outrank.feature_transformations.ranking_transformers", "modulename": "outrank.feature_transformations.ranking_transformers", "kind": "module", "doc": "

\n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise", "kind": "class", "doc": "

\n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise.noise_preset", "kind": "variable", "doc": "

\n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise.construct_new_features", "kind": "function", "doc": "

Generate a few standard noise distributions

\n", "signature": "(self, dataframe: pandas.core.frame.DataFrame, label_column=None):", "funcdef": "def"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric", "kind": "class", "doc": "

\n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.__init__", "kind": "function", "doc": "

\n", "signature": "(numeric_column_names: set[str], preset: str = 'default')"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.numeric_column_names", "kind": "variable", "doc": "

\n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.constructed_feature_names", "kind": "variable", "doc": "

\n", "annotation": ": set[str]"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.max_maj_support", "kind": "variable", "doc": "

\n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.nan_prop_support", "kind": "variable", "doc": "

\n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.get_vals", "kind": "function", "doc": "

\n", "signature": "(self, tmp_df: pandas.core.frame.DataFrame, col_name: str) -> Any:", "funcdef": "def"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.construct_baseline_features", "kind": "function", "doc": "

\n", "signature": "(self, dataframe: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.construct_new_features", "kind": "function", "doc": "

\n", "signature": "(self, dataframe: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.task_generators": {"fullname": "outrank.task_generators", "modulename": "outrank.task_generators", "kind": "module", "doc": "

\n"}, "outrank.task_generators.logger": {"fullname": "outrank.task_generators.logger", "modulename": "outrank.task_generators", "qualname": "logger", "kind": "variable", "doc": "

\n", "default_value": "<Logger syn-logger (DEBUG)>"}, "outrank.task_generators.outrank_task_generate_data_set": {"fullname": "outrank.task_generators.outrank_task_generate_data_set", "modulename": "outrank.task_generators", "qualname": "outrank_task_generate_data_set", "kind": "function", "doc": "

Core method for generating data sets

\n", "signature": "(args):", "funcdef": "def"}, "outrank.task_ranking": {"fullname": "outrank.task_ranking", "modulename": "outrank.task_ranking", "kind": "module", "doc": "

\n"}, "outrank.task_ranking.outrank_task_conduct_ranking": {"fullname": "outrank.task_ranking.outrank_task_conduct_ranking", "modulename": "outrank.task_ranking", "qualname": "outrank_task_conduct_ranking", "kind": "function", "doc": "

\n", "signature": "(args: Any):", "funcdef": "def"}, "outrank.task_selftest": {"fullname": "outrank.task_selftest", "modulename": "outrank.task_selftest", "kind": "module", "doc": "

\n"}, "outrank.task_selftest.logger": {"fullname": "outrank.task_selftest.logger", "modulename": "outrank.task_selftest", "qualname": "logger", "kind": "variable", "doc": "

\n", "default_value": "<Logger syn-logger (DEBUG)>"}, "outrank.task_selftest.conduct_self_test": {"fullname": "outrank.task_selftest.conduct_self_test", "modulename": "outrank.task_selftest", "qualname": "conduct_self_test", "kind": "function", "doc": "

\n", "signature": "():", "funcdef": "def"}, "outrank.task_summary": {"fullname": "outrank.task_summary", "modulename": "outrank.task_summary", "kind": "module", "doc": "

\n"}, "outrank.task_summary.outrank_task_result_summary": {"fullname": "outrank.task_summary.outrank_task_result_summary", "modulename": "outrank.task_summary", "qualname": "outrank_task_result_summary", "kind": "function", "doc": "

\n", "signature": "(args):", "funcdef": "def"}, "outrank.task_visualization": {"fullname": "outrank.task_visualization", "modulename": "outrank.task_visualization", "kind": "module", "doc": "

\n"}, "outrank.task_visualization.outrank_task_visualize_results": {"fullname": "outrank.task_visualization.outrank_task_visualize_results", "modulename": "outrank.task_visualization", "qualname": "outrank_task_visualize_results", "kind": "function", "doc": "

\n", "signature": "(args):", "funcdef": "def"}, "outrank.visualizations": {"fullname": "outrank.visualizations", "modulename": "outrank.visualizations", "kind": "module", "doc": "

\n"}, "outrank.visualizations.ranking_visualization": {"fullname": "outrank.visualizations.ranking_visualization", "modulename": "outrank.visualizations.ranking_visualization", "kind": "module", "doc": "

\n"}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"fullname": "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_hierarchical_clusters", "kind": "function", "doc": "

A method for visualization of hierarchical clusters w.r.t. different linkage functions

\n", "signature": "(\ttriplet_dataframe: pandas.core.frame.DataFrame,\toutput_folder: str,\timage_format: str = 'png',\tmax_num_clusters: int = 100) -> None:", "funcdef": "def"}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"fullname": "outrank.visualizations.ranking_visualization.visualize_heatmap", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_heatmap", "kind": "function", "doc": "

\n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\timage_format: str) -> None:", "funcdef": "def"}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"fullname": "outrank.visualizations.ranking_visualization.visualize_barplots", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_barplots", "kind": "function", "doc": "

\n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\treference_json: str,\timage_format: str,\tlabel: str,\theuristic: str) -> None:", "funcdef": "def"}, "outrank.visualizations.ranking_visualization.visualize_all": {"fullname": "outrank.visualizations.ranking_visualization.visualize_all", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_all", "kind": "function", "doc": "

A method for visualization of the obtained feature interaction maps.

\n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\tlabel: str = '',\treference_json: str = '',\timage_format: str = 'png',\theuristic: str = 'MI') -> None:", "funcdef": "def"}}, "docInfo": {"outrank": {"qualname": 0, "fullname": 1, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 32}, "outrank.algorithms": {"qualname": 0, "fullname": 2, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking": {"qualname": 0, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"qualname": 0, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"qualname": 2, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 9}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"qualname": 3, "fullname": 10, "annotation": 0, "default_value": 0, "signature": 31, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"qualname": 2, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 41, "bases": 0, "doc": 6}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"qualname": 4, "fullname": 11, "annotation": 0, "default_value": 0, "signature": 38, "bases": 0, "doc": 11}, "outrank.algorithms.importance_estimator": {"qualname": 0, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.sklearn_MI": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 31, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 3}, 
"outrank.algorithms.importance_estimator.numba_mi": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 23, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"qualname": 3, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 18, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"qualname": 4, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 22, "bases": 0, "doc": 21}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"qualname": 3, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 204, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"qualname": 4, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 3}, "outrank.algorithms.sketches": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog": {"qualname": 0, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 26}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"qualname": 1, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"qualname": 3, "fullname": 8, 
"annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.algorithms.synthetic_data_generators": {"qualname": 0, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"qualname": 0, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"qualname": 3, "fullname": 10, "annotation": 0, "default_value": 0, "signature": 27, "bases": 0, "doc": 3}, "outrank.core_ranking": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.logger": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"qualname": 3, "fullname": 6, "annotation": 5, "default_value": 1, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"qualname": 4, "fullname": 7, "annotation": 4, "default_value": 2, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.IGNORED_VALUES": {"qualname": 2, "fullname": 5, 
"annotation": 0, "default_value": 2, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 2, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.encode_int_column": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 58, "bases": 0, "doc": 12}, "outrank.core_ranking.mixed_rank_graph": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 81, "bases": 0, "doc": 19}, "outrank.core_ranking.enrich_with_transformations": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 92, "bases": 0, "doc": 13}, "outrank.core_ranking.compute_combined_features": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 103, "bases": 0, "doc": 11}, "outrank.core_ranking.compute_expanded_multivalue_features": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 84, "bases": 0, "doc": 41}, "outrank.core_ranking.compute_subfeatures": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 84, "bases": 0, "doc": 70}, "outrank.core_ranking.include_noisy_features": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 73, "bases": 0, "doc": 11}, "outrank.core_ranking.compute_coverage": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 63, "bases": 0, "doc": 7}, "outrank.core_ranking.compute_feature_memory_consumption": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 63, "bases": 0, "doc": 10}, "outrank.core_ranking.compute_value_counts": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 6}, "outrank.core_ranking.compute_cardinalities": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 45, "bases": 0, "doc": 7}, 
"outrank.core_ranking.compute_bounds_increment": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 72, "bases": 0, "doc": 3}, "outrank.core_ranking.compute_batch_ranking": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 197, "bases": 0, "doc": 11}, "outrank.core_ranking.get_num_of_instances": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 19, "bases": 0, "doc": 15}, "outrank.core_ranking.get_grouped_df": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 59, "bases": 0, "doc": 12}, "outrank.core_ranking.checkpoint_importances_df": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 12}, "outrank.core_ranking.estimate_importances_minibatches": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 376, "bases": 0, "doc": 48}, "outrank.core_selftest": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.pro_tips": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 303, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.internal_hash": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 20, "bases": 0, "doc": 17}, "outrank.core_utils.DatasetInformationStorage": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 14}, "outrank.core_utils.DatasetInformationStorage.__init__": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 111, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.data_path": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, 
"outrank.core_utils.DatasetInformationStorage.column_names": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.column_types": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"qualname": 3, "fullname": 6, "annotation": 4, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.encoding": {"qualname": 2, "fullname": 5, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"qualname": 3, "fullname": 6, "annotation": 5, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 9}, "outrank.core_utils.NumericFeatureSummary.__init__": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 61, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.minimum": {"qualname": 2, "fullname": 5, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.maximum": {"qualname": 2, "fullname": 5, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.median": {"qualname": 2, "fullname": 5, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NominalFeatureSummary": 
{"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 9}, "outrank.core_utils.NominalFeatureSummary.__init__": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 3}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.BatchRankingSummary": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 9}, "outrank.core_utils.BatchRankingSummary.__init__": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 67, "bases": 0, "doc": 3}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"qualname": 3, "fullname": 6, "annotation": 4, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.BatchRankingSummary.step_times": {"qualname": 3, "fullname": 6, "annotation": 4, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.display_random_tip": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 10, "bases": 0, "doc": 3}, "outrank.core_utils.get_dataset_info": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.core_utils.display_tool_name": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 10, "bases": 0, "doc": 3}, "outrank.core_utils.parse_ob_line": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 68, "bases": 0, "doc": 7}, "outrank.core_utils.parse_ob_line_vw": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 100, "bases": 0, "doc": 15}, 
"outrank.core_utils.parse_ob_csv_line": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 63, "bases": 0, "doc": 10}, "outrank.core_utils.generic_line_parser": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 100, "bases": 0, "doc": 13}, "outrank.core_utils.read_reference_json": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 27, "bases": 0, "doc": 9}, "outrank.core_utils.parse_namespace": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 46, "bases": 0, "doc": 9}, "outrank.core_utils.read_column_names": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 6}, "outrank.core_utils.parse_ob_vw_feature_information": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 9}, "outrank.core_utils.parse_ob_raw_feature_information": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 9}, "outrank.core_utils.parse_ob_feature_information": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 9}, "outrank.core_utils.parse_csv_with_description_information": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 3}, "outrank.core_utils.parse_csv_raw": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 3}, "outrank.core_utils.extract_features_from_reference_JSON": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 31, "bases": 0, "doc": 10}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 100, "bases": 0, "doc": 10}, "outrank.core_utils.summarize_rare_counts": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, 
"signature": 67, "bases": 0, "doc": 5}, "outrank.feature_transformations": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault": {"qualname": 0, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"qualname": 0, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"qualname": 2, "fullname": 10, "annotation": 0, "default_value": 56, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"qualname": 2, "fullname": 10, "annotation": 0, "default_value": 173, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"qualname": 0, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"qualname": 2, "fullname": 10, "annotation": 0, "default_value": 4589, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"qualname": 2, "fullname": 10, "annotation": 0, "default_value": 4, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"qualname": 3, "fullname": 11, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers": {"qualname": 0, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"qualname": 1, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 47, "bases": 0, "doc": 8}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"qualname": 1, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 43, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"qualname": 4, "fullname": 9, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 51, 
"bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 39, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 39, "bases": 0, "doc": 3}, "outrank.task_generators": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_generators.logger": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_generators.outrank_task_generate_data_set": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 8}, "outrank.task_ranking": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_ranking.outrank_task_conduct_ranking": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.task_selftest": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_selftest.logger": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_selftest.conduct_self_test": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 7, "bases": 0, "doc": 3}, "outrank.task_summary": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_summary.outrank_task_result_summary": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 3}, "outrank.task_visualization": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, 
"signature": 0, "bases": 0, "doc": 3}, "outrank.task_visualization.outrank_task_visualize_results": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 3}, "outrank.visualizations": {"qualname": 0, "fullname": 2, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.visualizations.ranking_visualization": {"qualname": 0, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"qualname": 3, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 91, "bases": 0, "doc": 15}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 59, "bases": 0, "doc": 3}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 93, "bases": 0, "doc": 3}, "outrank.visualizations.ranking_visualization.visualize_all": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 135, "bases": 0, "doc": 13}}, "length": 138, "save": true}, "index": {"qualname": {"root": {"3": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}, "docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 6, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": 
{"outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3, "b": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}}, "df": 3}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}}, "df": 1, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}}, "df": 7}}}}}}}}}}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}}, "df": 1}}}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}, "e": 
{"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}}, "df": 1}}}, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 4}}}}}}}}}}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 3, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 4, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "w": {"docs": 
{"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 2}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3}}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 11}}}}, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": 
{"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 2}}}}, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 3, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 1}}}}}}}}}, "l": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}}, "df": 1, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}}, "df": 5}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": 
{"outrank.core_ranking.compute_coverage": {"tf": 1}}, "df": 1}}}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}}}}}}, "s": {"docs": {}, "df": 0, "v": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 3}}, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": 
{"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}, "e": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1}}}}, "x": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}}, "df": 1, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, 
"df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}}, "i": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 3, "x": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "x": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 1}}}}, "p": {"docs": {"outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 1}, "x": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": 
{"outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}}, "df": 1}}}}}, "j": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}}, "df": 2, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 4}}}}}}}}}, "i": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 6}}, "t": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, 
"l": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 4}}}}}}}}}}, "g": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 3}}}}}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, 
"df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}}, "df": 1}}}}}}}, "b": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}}, "y": {"docs": {"outrank.task_summary.outrank_task_result_summary": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 2}}}}}}, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}, "l": {"docs": {}, "df": 0, "f": {"docs": {"outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}}, "df": 1}}}, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": 
{"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}}, "df": 2}}}}}, "e": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 1}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "j": {"docs": {"outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 1}, "d": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 1}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 6}, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}}}, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, 
"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}}, "df": 2}}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 1}}}}}}}, "p": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 9, "r": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "p": {"docs": 
{"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 2, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}}, "df": 2}}}}, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}}, "df": 2}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}, "w": {"docs": {"outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 2}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}}, "df": 2}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, 
"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}}}}}, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_summary.outrank_task_result_summary": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 1}}}}}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 8, "s": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 8}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 3}}}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 9}}}}}}}}}}}}}}}}}}}}}}}}, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}}, "df": 1}}}, "w": {"docs": {"outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, 
"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 9}}}}}}}}}}}, "l": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {"outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}}, "df": 1}}}}}}}, "w": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}}, "df": 2}}}}}, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}}, "df": 1}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 2}}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, 
"df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.logger": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 3}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 4}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1}}}, "s": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 1}}}, "w": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}}, "df": 2}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 5}}}}}}}}}, "b": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, 
"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 2}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 1, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 4}}}}}}}}}}}}}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}}, "df": 1}}}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": 
{"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 4}}}}}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.display_random_tip": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 1}}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 4}}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "f": {"docs": 
{"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}, "b": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 6}, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 4}}}}}}}, "d": {"docs": {}, "df": 0, "f": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 2}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.get_dataset_info": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, 
"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 8}}}}}}}}}}}}}}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 1}}}}}}}}}, "f": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 2}}}}}}}, "j": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}}}}, "fullname": {"root": {"3": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 
1}}}, "docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 6, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms": {"tf": 1}, "outrank.algorithms.feature_ranking": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.algorithms.sketches": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, 
"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}, "outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_ranking": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, 
"outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_selftest": {"tf": 1}, "outrank.core_utils": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, 
"outrank.core_utils.display_tool_name": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}, "outrank.feature_transformations": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.task_generators": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1.4142135623730951}, "outrank.task_ranking": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1.4142135623730951}, "outrank.task_selftest": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}, "outrank.task_summary": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1.4142135623730951}, "outrank.task_visualization": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1.4142135623730951}, "outrank.visualizations": {"tf": 1}, "outrank.visualizations.ranking_visualization": {"tf": 1}, 
"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 138}}}}}}, "f": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}, "b": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 6}}, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms": {"tf": 1}, "outrank.algorithms.feature_ranking": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, 
"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.algorithms.sketches": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}, "outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 29}}}}}}}}, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1}}, "d": {"docs": {}, "df": 0, "j": {"docs": {"outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 1}, "d": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 1}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, 
"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 35, "s": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 8}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, 
"o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 3}}}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 9}}}}}}}}}}}}}}}}}}}}}}}}, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}}, "df": 1}}}, "w": {"docs": 
{"outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 5}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 2, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.feature_ranking": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1.4142135623730951}, "outrank.core_ranking": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.core_ranking.encode_int_column": {"tf": 1}, 
"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.task_ranking": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 49}}}}, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}}, "df": 2}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}, "w": {"docs": {"outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 2}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}}, "df": 2}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, 
"df": 0, "e": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}}}}}, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_summary.outrank_task_result_summary": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}}, "df": 1, "i": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 8, "x": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": 
{"outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}}, "df": 1}}}}}}, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "x": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 1}}}}, "p": {"docs": {"outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 1}, "x": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}}, "df": 1}}}}}, "j": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": 
{"outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3, "b": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}}, "df": 6}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}}, "df": 1, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}}, "df": 7}}}}}}}}}}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": 
{}, "df": 0, "c": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}}, "df": 1}}}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}}, "df": 1}}}, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 4}}}}}}}}}}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 2}}}, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 3, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 4, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "w": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 2}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3}}}}}, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, 
"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 10}}}}}}}}}, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 
1}, "outrank.core_utils.display_tool_name": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 44}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 11}}}}, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": 
{"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 2}}}}, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 3, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 1}}}}}}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, 
"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 10}}}, "s": {"docs": {"outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, 
"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_selftest": {"tf": 1}, "outrank.core_utils": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, 
"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 68}}, "l": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}}, "df": 1, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}}, "df": 5}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_coverage": {"tf": 1}}, "df": 1}}}}}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": 
{"outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}}}}}}, "s": {"docs": {}, "df": 0, "v": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 3}}, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 
0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}}, "df": 9}}, "e": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1}}}}, "x": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}}, "df": 2, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, 
"i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 4}}}}}}}}}, "i": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 6}}, "t": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, 
"df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}}, "df": 8, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 4}}}}}}}}}}, "g": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 3}}}}}, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, 
"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 11}}}}}}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}}, "df": 1}}}}}}}, "b": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}}, "y": {"docs": {"outrank.task_summary": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1.4142135623730951}}, "df": 2}}}}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 2}}}}}}, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}, "l": {"docs": {}, "df": 0, "f": {"docs": {"outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 1, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_selftest": {"tf": 1}, "outrank.task_selftest": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 4}}}}}}}, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}}, "df": 1}}}, "y": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 3}}}}}}}}, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}}, "df": 2}}}}}, "e": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 1}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}, "g": {"docs": {}, "df": 0, 
"e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 6}, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.task_generators": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 6}}}, "e": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}}}, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}}, "df": 2}}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "o": 
{"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 1}}}}}}}, "p": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 9, "r": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "p": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": 
{"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}}, "df": 1}}}}}}, "h": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 9}}}}}}}}}}}, "l": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": 
{"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {"outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}}, "df": 1}}}}}}}, "w": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}}, "df": 2}}}}}, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}}, "df": 1}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 2}}}}, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 5, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.get_dataset_info": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 
0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 8}}}}}}}}}}}}}}}}}}}}}}}}, "f": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 2}, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 1}}}}}}}}}, "f": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}}, "df": 3}}}}}}, "i": 
{"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 2}}}}}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.logger": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 3}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 4}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1}}}, "s": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 1}}, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.feature_transformer_vault": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}, 
"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 8}}}}, "w": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}}, "df": 2}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.task_visualization": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}, "outrank.visualizations.ranking_visualization": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 7, "s": {"docs": {"outrank.visualizations": {"tf": 1}, "outrank.visualizations.ranking_visualization": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 6}}}}}}, "e": {"docs": {"outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, 
"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 5}}}}}}}}}, "b": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 2}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 1, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 4}}}}}}}}}}}}}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}}, "df": 1}}}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, 
"m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.feature_transformations": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 23}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 8, "s": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}, 
"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 21}}}}}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.display_random_tip": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "m": {"docs": {}, "df": 
0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 1}}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank.task_generators": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1.4142135623730951}, "outrank.task_ranking": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1.4142135623730951}, "outrank.task_selftest": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}, "outrank.task_summary": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1.4142135623730951}, "outrank.task_visualization": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1.4142135623730951}}, "df": 12}}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 1}}}}, "j": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}}}}, "annotation": {"root": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, 
"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1.4142135623730951}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 18, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "[": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}}, "df": 1}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 3}}}}}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1.4142135623730951}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, 
"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 3}}}, "t": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 3}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 7}}, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "[": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 2}}}}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "[": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}}, "df": 1}}}, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "[": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}}}}}}}}}, "n": {"docs": {}, "df": 0, "o": 
{"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 2}}}}, "f": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 4}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 2}}}}}, "default_value": {"root": {"0": {"1": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}}, "df": 1}, "2": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}}, "df": 2}, "4": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}}, "df": 1}, "8": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}}, "df": 1}, "docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 22.715633383201094}}, "df": 3}, "1": {"0": {"0": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, 
"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8.06225774829855}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 3}, "docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 2}, "6": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "docs": {"outrank.core_utils.pro_tips": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 2.8284271247461903}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 10.198039027185569}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 6}, "2": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.830951894845301}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 3}, "3": {"2": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "docs": {}, 
"df": 0}, "4": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "5": {"0": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "6": {"4": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "8": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "9": {"6": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "docs": {"outrank.core_ranking.logger": {"tf": 1.4142135623730951}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2.8284271247461903}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 4.795831523312719}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 12.288205727444508}, 
"outrank.task_generators.logger": {"tf": 1.4142135623730951}, "outrank.task_selftest.logger": {"tf": 1.4142135623730951}}, "df": 10, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.logger": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.313708498984761}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 5}, "o": {"docs": {}, "df": 0, "g": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 3.1622776601683795}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.74734012447073}}, "df": 3, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.logger": {"tf": 1.4142135623730951}, "outrank.task_generators.logger": {"tf": 1.4142135623730951}, "outrank.task_selftest.logger": {"tf": 1.4142135623730951}}, "df": 3}}}, "*": {"1": {"0": {"0": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2}}}}}}, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": 
{"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "s": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.logger": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 3}}, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, 
"df": 1}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "q": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 2.6457513110645907}, 
"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.61895003862225}}, "df": 3}}}}, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.logger": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 3}}}, "f": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "f": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "v": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, 
"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}}, "df": 2, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}}, "df": 2}}}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "g": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.logger": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 16}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 5}, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "c": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}}, "df": 1}}}}}, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "d": {"docs": 
{"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}, "v": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2.23606797749979}}, "df": 1, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}, 
"i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "o": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "x": {"2": {"7": {"docs": {"outrank.core_utils.pro_tips": {"tf": 5.656854249492381}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 4}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 6.324555320336759}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 23.49468024894146}}, "df": 4}, "docs": {}, "df": 0}, "docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2.449489742783178}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 4.795831523312719}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 23.130067012440755}}, "df": 3, "+": {"1": {"docs": 
{"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 3}, "docs": {}, "df": 0}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1}}}}}}, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "f": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 3.872983346207417}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, 
"df": 1}}}, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "w": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.313708498984761}}, "df": 2}}, "b": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}, "u": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "e": {"docs": 
{"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "x": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2.23606797749979}}, "df": 1, "r": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1}, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "l": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, 
"df": 1}}}}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1}, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "b": {"docs": {}, "df": 0, "s": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 2.8284271247461903}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 2.8284271247461903}}, "df": 3}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2.23606797749979}}, "df": 1, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {}, 
"df": 0, "t": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "n": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}, "x": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}}, "df": 2}}, "i": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}}}}}, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}, "o": {"docs": {}, "df": 0, "w": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, 
"l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}}, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "b": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "n": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2}}}}}}, "p": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2.449489742783178}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 4.58257569495584}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 23.08679276123039}}, "df": 3}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "k": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": 
{"outrank.core_utils.pro_tips": {"tf": 2.23606797749979}}, "df": 1}}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}}}, "s": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.313708498984761}}, "df": 1}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.445523142259598}}, "df": 2}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, 
"d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "w": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}}}}, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "s": {"docs": {}, "df": 0, "v": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 3.1622776601683795}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.74734012447073}}, "df": 3, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "e": 
{"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}}}}}, "w": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 16.0312195418814}}, "df": 2}}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": 
{"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "w": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2, "f": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}}, "df": 1}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": 
{"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "k": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "p": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": 
{"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "y": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "signature": {"root": {"0": {"0": {"5": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}}, "df": 1}, "1": {"0": {"0": {"0": {"0": {"0": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "5": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}, "docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.4142135623730951}}, "df": 2}, "2": {"0": {"0": {"0": {"0": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "3": {"9": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 2}, "outrank.core_utils.parse_ob_line": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 
1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 2.8284271247461903}}, "df": 7}, "docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}, "8": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}, "docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 3.1622776601683795}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 4.69041575982343}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 5.477225575051661}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 5.477225575051661}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 4.898979485566356}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 5.656854249492381}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 4.242640687119285}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 3.7416573867739413}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 4.242640687119285}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 12.806248474865697}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 5.830951894845301}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 3.4641016151377544}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 3.7416573867739413}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 4.69041575982343}, "outrank.core_ranking.encode_int_column": {"tf": 6.855654600401044}, "outrank.core_ranking.mixed_rank_graph": {"tf": 8}, "outrank.core_ranking.enrich_with_transformations": {"tf": 8.54400374531753}, "outrank.core_ranking.compute_combined_features": {"tf": 
9.1104335791443}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 8.246211251235321}, "outrank.core_ranking.compute_subfeatures": {"tf": 8.246211251235321}, "outrank.core_ranking.include_noisy_features": {"tf": 7.681145747868608}, "outrank.core_ranking.compute_coverage": {"tf": 7.14142842854285}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 7.14142842854285}, "outrank.core_ranking.compute_value_counts": {"tf": 5.830951894845301}, "outrank.core_ranking.compute_cardinalities": {"tf": 6}, "outrank.core_ranking.compute_bounds_increment": {"tf": 7.54983443527075}, "outrank.core_ranking.compute_batch_ranking": {"tf": 12.449899597988733}, "outrank.core_ranking.get_num_of_instances": {"tf": 4}, "outrank.core_ranking.get_grouped_df": {"tf": 6.855654600401044}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 5.830951894845301}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 17.204650534085253}, "outrank.core_utils.internal_hash": {"tf": 4}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 9.38083151964686}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 7}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 4.47213595499958}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 7.3484692283495345}, "outrank.core_utils.display_random_tip": {"tf": 3}, "outrank.core_utils.get_dataset_info": {"tf": 3.7416573867739413}, "outrank.core_utils.display_tool_name": {"tf": 3}, "outrank.core_utils.parse_ob_line": {"tf": 7.416198487095663}, "outrank.core_utils.parse_ob_line_vw": {"tf": 8.831760866327848}, "outrank.core_utils.parse_ob_csv_line": {"tf": 7.14142842854285}, "outrank.core_utils.generic_line_parser": {"tf": 8.94427190999916}, "outrank.core_utils.read_reference_json": {"tf": 4.69041575982343}, "outrank.core_utils.parse_namespace": {"tf": 6.082762530298219}, "outrank.core_utils.read_column_names": {"tf": 4.58257569495584}, 
"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 4.47213595499958}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 4.47213595499958}, "outrank.core_utils.parse_ob_feature_information": {"tf": 4.47213595499958}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 4.47213595499958}, "outrank.core_utils.parse_csv_raw": {"tf": 4.47213595499958}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 5}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 8.774964387392123}, "outrank.core_utils.summarize_rare_counts": {"tf": 7.211102550927978}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 6.164414002968976}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 5.744562646538029}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 6.324555320336759}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 5.656854249492381}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 5.656854249492381}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 3.1622776601683795}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 3.7416573867739413}, "outrank.task_selftest.conduct_self_test": {"tf": 2.6457513110645907}, "outrank.task_summary.outrank_task_result_summary": {"tf": 3.1622776601683795}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 3.1622776601683795}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 8.366600265340756}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 6.855654600401044}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 8.602325267042627}, 
"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 10.295630140987}}, "df": 68, "a": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}}, "df": 1, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}, "p": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "n": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 2.449489742783178}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.encode_int_column": {"tf": 1.4142135623730951}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.7320508075688772}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, 
"outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 2.449489742783178}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 3}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 2}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}}, "df": 31}}, "r": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, 
"outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 23}}}}, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 3}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1.4142135623730951}}, "df": 1, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}}, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}, "e": {"docs": {}, "df": 0, "r": {"docs": 
{"outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 2}}}}}}}, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.4142135623730951}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 29}}, "m": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}}}}, "l": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 6, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 6}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}}}, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 3}}}}}}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 
2}}}}, "r": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1.4142135623730951}}, "df": 4}}}}}}, "s": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 4}}}}, "l": {"docs": {}, "df": 0, "f": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 5}}, "t": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, 
"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.7320508075688772}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 10}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.4142135623730951}, "outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_batch_ranking": {"tf": 2.6457513110645907}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 
3.4641016151377544}, "outrank.core_utils.internal_hash": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 2.6457513110645907}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_line": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1.7320508075688772}, "outrank.core_utils.generic_line_parser": {"tf": 1.4142135623730951}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 2}, "outrank.core_utils.read_column_names": {"tf": 1.4142135623730951}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 2.23606797749979}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 2.23606797749979}}, "df": 32, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, 
"outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 4}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 2}}}}}, "e": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, 
"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}}, "df": 14}}}, "t": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 7}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}}}}, "f": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}}, "s": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 2}}}}}}}}}, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}}, "p": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": 
{"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, 
"outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 22}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 9}}}, "o": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}}, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 6}}}, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 2}}}, "x": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 2}, "e": {"docs": {}, "df": 0, "v": 
{"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}}, "df": 2}}}}}}}}, "f": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1.4142135623730951}}, "df": 1, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 5}}}}, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 4}}}, "l": {"docs": {}, "df": 0, "e": {"docs": 
{"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}}, "df": 2}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.7320508075688772}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}}, "df": 7}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 22}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 4, "s": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}, "w": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 4}, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": 
{"tf": 1}}, "df": 4}}}}, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}}}, "m": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1, "l": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}}, "df": 2}}}}}, "a": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}}, "df": 1, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}}, "df": 4}}}}}, "x": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": 
{"outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}}, "df": 1}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 3}}}}}}}, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 2}}}}}}, "t": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}}, "df": 2, "m": {"docs": {}, "df": 0, "p": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 4}}, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.4142135623730951}, "outrank.core_ranking.encode_int_column": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 8}}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": 
{}, "df": 0, "g": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 2.449489742783178}, "outrank.core_ranking.encode_int_column": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 2.449489742783178}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 8}}}, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 6}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 3}}}}}}}, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, 
"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 3}}}, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 1}}}}, "d": {"docs": {}, "df": 0, "f": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 4}, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 2.449489742783178}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.7320508075688772}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 2.449489742783178}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 10}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, 
"outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 7, "f": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.4142135623730951}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_combined_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}, "outrank.core_ranking.include_noisy_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_coverage": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_value_counts": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_cardinalities": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 
1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 22}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 6}}}}}}}}}}}}}}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}}}}}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 
6}}}}}}}, "f": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 1, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}}}}}, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}}}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 2}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}}, "df": 1}}}}, "b": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, 
"df": 1}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 2}}}}}}}}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 3}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 5, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 4}}}}}}, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1.4142135623730951}, "outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 2}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1.7320508075688772}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 16}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 4, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 2, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": 
{"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 2}}}}}}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.7320508075688772}, "outrank.core_ranking.get_grouped_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 2.23606797749979}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 13}}, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 5}}}, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 7}}}}}, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": 
{"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 8}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 5}}}}}, "b": {"docs": {}, "df": 0, "j": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1.4142135623730951}}, "df": 2}}}}}, "n": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}, 
"u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 8}}}, "f": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}}, "df": 2}}}}}}, "j": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}, "g": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}}}}}}}}, "bases": {"root": {"docs": {}, "df": 0}}, "doc": {"root": {"docs": {"outrank": {"tf": 4}, "outrank.algorithms": {"tf": 1.7320508075688772}, "outrank.algorithms.feature_ranking": {"tf": 1.7320508075688772}, 
"outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1.7320508075688772}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1.7320508075688772}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1.4142135623730951}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1.7320508075688772}, 
"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1.7320508075688772}, "outrank.algorithms.synthetic_data_generators": {"tf": 1.7320508075688772}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1.7320508075688772}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1.7320508075688772}, "outrank.core_ranking": {"tf": 1.7320508075688772}, "outrank.core_ranking.logger": {"tf": 1.7320508075688772}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1.7320508075688772}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1.7320508075688772}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1.7320508075688772}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1.7320508075688772}, "outrank.core_ranking.encode_int_column": {"tf": 1.7320508075688772}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.4142135623730951}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_subfeatures": {"tf": 2.23606797749979}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_value_counts": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_cardinalities": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, 
"outrank.core_ranking.get_num_of_instances": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_grouped_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_selftest": {"tf": 1.7320508075688772}, "outrank.core_utils": {"tf": 1.7320508075688772}, "outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}, "outrank.core_utils.internal_hash": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1.4142135623730951}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1.7320508075688772}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1.4142135623730951}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 
1.7320508075688772}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1.7320508075688772}, "outrank.core_utils.BatchRankingSummary": {"tf": 1.4142135623730951}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1.7320508075688772}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1.7320508075688772}, "outrank.core_utils.display_random_tip": {"tf": 1.7320508075688772}, "outrank.core_utils.get_dataset_info": {"tf": 1.7320508075688772}, "outrank.core_utils.display_tool_name": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_line": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1.4142135623730951}, "outrank.core_utils.generic_line_parser": {"tf": 1.7320508075688772}, "outrank.core_utils.read_reference_json": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_namespace": {"tf": 1.4142135623730951}, "outrank.core_utils.read_column_names": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_csv_raw": {"tf": 1.7320508075688772}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1.4142135623730951}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1.4142135623730951}, "outrank.core_utils.summarize_rare_counts": {"tf": 1.4142135623730951}, "outrank.feature_transformations": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 
1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1.7320508075688772}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1.7320508075688772}, "outrank.task_generators": {"tf": 1.7320508075688772}, "outrank.task_generators.logger": {"tf": 1.7320508075688772}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1.4142135623730951}, "outrank.task_ranking": {"tf": 1.7320508075688772}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1.7320508075688772}, "outrank.task_selftest": {"tf": 1.7320508075688772}, "outrank.task_selftest.logger": {"tf": 1.7320508075688772}, "outrank.task_selftest.conduct_self_test": {"tf": 1.7320508075688772}, "outrank.task_summary": {"tf": 1.7320508075688772}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1.7320508075688772}, "outrank.task_visualization": {"tf": 1.7320508075688772}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1.7320508075688772}, "outrank.visualizations": {"tf": 1.7320508075688772}, "outrank.visualizations.ranking_visualization": {"tf": 1.7320508075688772}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1.7320508075688772}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1.7320508075688772}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1.7320508075688772}}, "df": 138, "w": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank": 
{"tf": 1}}, "df": 1}}}}}}, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 3}}}}, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 3, "i": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 1}}}}}, "t": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "o": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 5}, "h": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, 
"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 11}, "i": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}}, "df": 4}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "/": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}}}}}}}}}}}}}}}}, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}, "w": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}}, "df": 1}}, "a": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "e": {"docs": 
{"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 3, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}, "b": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 4, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 2, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}, "f": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 
1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 14}, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}}, "b": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 4, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1}}}}}}}}, "s": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 4, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": 
{"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}}}}, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1, "d": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 1}}}, "u": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, 
"df": 1}}}, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}}, "df": 2}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}}, "df": 2}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}}, "df": 2}}}}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "d": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}}, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 1, "d": 
{"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 2}}, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}}}, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": 
{"tf": 1}}, "df": 8, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 2}}}}}, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}, "f": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 2}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": 
{"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}}}}}}}}}}}}, "a": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1.4142135623730951}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1.4142135623730951}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 26, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 
1}}, "df": 2}}, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 2, "d": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}}, "df": 1}}}, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 3}, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}, "u": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}}}}, "t": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1}, "|": {"docs": {}, "df": 0, "*": {"docs": {}, "df": 0, "|": {"docs": {}, "df": 0, "b": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}, "d": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 
1}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "g": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}}}}}}}, "f": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}, "w": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 1}}}}}}}}}, "f": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "/": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}}}}}}}}, 
"l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 13, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 2}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 2}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 
1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 9, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 8}}}}}}, "w": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 1}}, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 2}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 2}}}}, "c": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 3}, "l": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}}}}, "r": 
{"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 1}}}}}}}}}}}, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 3}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}}}}}}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, 
"outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 8, "d": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {"outrank.core_utils.read_column_names": {"tf": 1}}, "df": 1, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}}}}}, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}}, "df": 1}}}}}}}}, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": 
{"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_coverage": {"tf": 1}}, "df": 1}}}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 2}}}}, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {}, "df": 0, "v": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}}, "df": 1}}, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 4}}}, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}, "b": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2, "e": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 3, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "r": {"docs": 
{"outrank": {"tf": 1}}, "df": 1}, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 2, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 1}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 9}}}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1}}}, "t": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}, "g": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}}, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, 
"r": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}}, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}, "k": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}, "k": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}, "t": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 5, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": 
{"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 3, "s": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}, "o": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}}}, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}}}, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 2}}}}}}}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": 
{"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "f": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 2}}}}}}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}}}}}}}}, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 4}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 3}}}}}, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}, 
"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 2}}}, "d": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}, "p": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_value_counts": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 2}}}}}}}, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}}, "df": 1, "d": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": 
{"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}}}}, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 2}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}, "x": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}}}, "p": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 1}}}}}}}}}}, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, 
"t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.read_column_names": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 2}}}}, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, 
"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 8}}}}, "m": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}}, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}, "t": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}, "c": {"docs": {}, "df": 0, "h": {"docs": 
{"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}}, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1.4142135623730951}}, "df": 1}}}}}, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}}, "df": 1}}}, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 3, "r": {"docs": {"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 3}}}}, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": 
{"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}}}}}}}, "g": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}}}, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}}, "df": 1}}}}}}}}, "e": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 3, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}, "t": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": 
{"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 2, "s": {"docs": {"outrank.core_ranking.encode_int_column": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 4}}}}}, "i": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 2}}}}}}}}}}}}, "w": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}, "g": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 3}}}}, "o": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": 
{"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 10}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}}}, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}}}}}, "l": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}}, "df": 3}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": 
{"outrank.core_utils.read_column_names": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}, "w": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}, "l": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}, "s": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, 
"e": {"docs": {}, "df": 0, "w": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}}, "df": 2}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 2}}}}}}}}}, "x": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}}}, "j": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}, 
"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 4}}}}}}}, "pipeline": ["trimmer"], "_isPrebuiltIndex": true}; + + // mirrored in build-search-index.js (part 1) + // Also split on html tags. this is a cheap heuristic, but good enough. + elasticlunr.tokenizer.setSeperator(/[\s\-.;&_'"=,()]+|<[^>]*>/); + + let searchIndex; + if (docs._isPrebuiltIndex) { + console.info("using precompiled search index"); + searchIndex = elasticlunr.Index.load(docs); + } else { + console.time("building search index"); + // mirrored in build-search-index.js (part 2) + searchIndex = elasticlunr(function () { + this.pipeline.remove(elasticlunr.stemmer); + this.pipeline.remove(elasticlunr.stopWordFilter); + this.addField("qualname"); + this.addField("fullname"); + this.addField("annotation"); + this.addField("default_value"); + this.addField("signature"); + this.addField("bases"); + this.addField("doc"); + this.setRef("fullname"); + }); + for (let doc of docs) { + searchIndex.addDoc(doc); + } + console.timeEnd("building search index"); + } + + return (term) => searchIndex.search(term, { + fields: { + qualname: {boost: 4}, + fullname: {boost: 2}, + annotation: {boost: 2}, + default_value: {boost: 2}, + signature: {boost: 2}, + bases: {boost: 2}, + doc: {boost: 1}, + }, + expand: true + }); +})(); diff --git a/outrank/__init__.py b/outrank/__init__.py index ca9802c..31be32d 100644 --- a/outrank/__init__.py +++ b/outrank/__init__.py @@ -1,3 +1,6 @@ +""" +.. 
include:: ../DOCS.md +""" from __future__ import annotations import logging diff --git a/run_build_docs.sh b/run_build_docs.sh new file mode 100644 index 0000000..1e911ad --- /dev/null +++ b/run_build_docs.sh @@ -0,0 +1 @@ +rm -rf docs; pdoc ./outrank -o docs; diff --git a/setup.py b/setup.py index 13df678..65f088c 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import os import setuptools -# test2 + def _parse_requirements(file): required_packages = []