Skip to content

Commit

Permalink
algo improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
SkBlaz committed Mar 25, 2024
1 parent e9f8cf6 commit b8de664
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 94 deletions.
190 changes: 97 additions & 93 deletions outrank/algorithms/sketches/counting_ultiloglog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
"""
from __future__ import annotations

import sys

import numpy as np
import xxhash


class HyperLogLogWCache:
def __init__(self, error_rate=0.005):
def __init__(self, max_prehash_size=1000000):
# int(np.ceil(np.log2((1.04 / error_rate) ** 2)))
self.p = 19
self.m = 1 << self.p
self.warmup_set = set()
self.warmup_size = int(self.m / 2)
self.warmup_size = max_prehash_size
self.width = 64 - self.p
self.hll_flag = False

Expand All @@ -33,7 +35,7 @@ def _hasher_update(self, value):
self.M[j] = max(self.M[j], rho)

def add(self, value):
if len(self.warmup_set) < self.warmup_size and not self.hll_flag:
if sys.getsizeof(self.warmup_set) < self.warmup_size and not self.hll_flag:
self.warmup_set.add(value)
elif not self.hll_flag:
if not self.hll_flag:
Expand All @@ -59,6 +61,38 @@ def __len__(self):
return len(self.warmup_set)


def cardinality_kernel(algo='cache', values=None):
    """Time one cardinality counter over a batch of values and report its error.

    Args:
        algo: Name of the backend to benchmark. Supported names are
            'Hhll (10)', 'Hhll (10000)', 'hll+ (0.005)', 'hll+ (0.01)'
            and 'set'. The default 'cache' is kept only so the signature
            stays unchanged; it is not a supported name and is rejected.
        values: Iterable of hashable items to count. When omitted, the
            module-level ``ground`` is used, which is what this function
            read before the parameter existed.

    Returns:
        A ``(elapsed_seconds, error_percent)`` tuple, where
        ``error_percent`` is ``100 * (1 - estimate / exact)`` and ``exact``
        is the true number of distinct items. For empty input the error
        is reported as ``0.0``.

    Raises:
        ValueError: If ``algo`` is not one of the supported names.
    """
    # Imported locally so the function does not depend on imports made
    # elsewhere in the module.
    import time

    supported = (
        'Hhll (10)', 'Hhll (10000)', 'hll+ (0.005)', 'hll+ (0.01)', 'set',
    )
    if algo not in supported:
        # Previously an unknown name fell through every branch and failed
        # later with an unrelated UnboundLocalError.
        raise ValueError(
            f'Unknown algo {algo!r}; expected one of {supported}',
        )

    if values is None:
        values = ground  # backward-compatible fallback to the global batch
    # Materialise once: the batch is walked twice (inserts + ground truth).
    items = list(values)

    start_time = time.perf_counter()

    if algo == 'Hhll (10)':
        counter = HyperLogLogWCache(10)
    elif algo == 'Hhll (10000)':
        counter = HyperLogLogWCache(10000)
    elif algo == 'hll+ (0.005)':
        import hyperloglog
        counter = hyperloglog.HyperLogLog(0.005)
    elif algo == 'hll+ (0.01)':
        import hyperloglog
        counter = hyperloglog.HyperLogLog(0.01)
    else:  # 'set' -- the exact baseline
        counter = set()

    for item in items:
        counter.add(item)
    estimate = len(counter)

    # Stop the clock before computing the ground truth so that building the
    # reference set is not charged to the algorithm being measured.
    elapsed = time.perf_counter() - start_time

    exact = len(set(items))
    if exact == 0:
        # Nothing to count: avoid dividing by zero.
        return elapsed, 0.0

    error = 100 * (1 - estimate / exact)
    return elapsed, error


if __name__ == '__main__':
import random
import string
Expand All @@ -69,7 +103,11 @@ def __len__(self):
import seaborn as sns
import tqdm
from pympler import asizeof

# sns.set_style("whitegrid")
plt.rcParams.update({
'text.usetex': True,
'font.family': 'Helvetica',
})
def get_random_string(length):
# choose from all lowercase letters
letters = string.ascii_lowercase
Expand All @@ -78,100 +116,66 @@ def get_random_string(length):

# results_df = []
# num_vals = 100000
# nbits = 16
# for _ in range(3):
# for j in tqdm.tqdm(range(1000000, 10000000, 1000)):
# for _ in range(10):
# for j in tqdm.tqdm(range(1000, 100000, 1000)):
# ground = list(set(np.random.randint(0, j, num_vals).tolist()))
# ground = ground + [
# get_random_string(random.randint(1, 15)) for k in range(j)
# ]

# start_time = time.time()
# GLOBAL_CARDINALITY_STORAGE = {}
# GLOBAL_CARDINALITY_STORAGE[1] = HyperLogLogWCache(0.005)

# for j in ground:
# GLOBAL_CARDINALITY_STORAGE[1].add(j)

# size1 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE)
# error1 = 100 * \
# (1 - len(GLOBAL_CARDINALITY_STORAGE[1]) / len(set(ground)))
# end_time = time.time()
# tp1 = end_time - start_time

# import hyperloglog

# start_time = time.time()
# GLOBAL_CARDINALITY_STORAGE = {}
# GLOBAL_CARDINALITY_STORAGE[1] = hyperloglog.HyperLogLog(0.005)

# for j in ground:
# GLOBAL_CARDINALITY_STORAGE[1].add(j)
# size2 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE)
# error2 = 100 * \
# (1 - len(GLOBAL_CARDINALITY_STORAGE[1]) / len(set(ground)))
# end_time = time.time()
# tp2 = end_time - start_time

# start_time = time.time()
# GLOBAL_CARDINALITY_STORAGE = set()

# for j in ground:
# GLOBAL_CARDINALITY_STORAGE.add(j)

# size3 = asizeof.asizeof(GLOBAL_CARDINALITY_STORAGE)
# error3 = 100 * \
# (1 - len(GLOBAL_CARDINALITY_STORAGE) / len(set(ground)))
# end_time = time.time()
# tp3 = end_time - start_time

# results_df.append(
# {
# 'num_samples': len(ground),
# 'time': tp3,
# 'algo': 'set',
# 'error': error3,
# },
# )
# results_df.append(
# {
# 'num_samples': len(ground),
# 'time': tp2,
# 'algo': 'default',
# 'error': error2,
# },
# )
# results_df.append(
# {
# 'num_samples': len(ground),
# 'time': tp1,
# 'algo': f'hllc ({nbits}, mixed)',
# 'error': error1,
# },
# )

# for algo in ['Hhll (10)', 'Hhll (10000)', 'hll+ (0.005)', 'hll+ (0.01)', 'set']:
# tp, error = cardinality_kernel(algo)
# results_df.append(
# {
# 'num_samples': len(ground),
# 'time': tp,
# 'algo': algo,
# 'error': error,
# }
# )


# out_df = pd.DataFrame(results_df)
# out_df.to_csv('backup.csv')
# print(out_df)
# print(out_df.groupby('algo').mean())
# sns.lineplot(
# x=out_df.num_samples, y=out_df.error,
# hue=out_df.algo, alpha=0.5,
# )
# plt.tight_layout()
# plt.ylabel('Num. of unique values in data')
# plt.ylabel('Abs error')
# plt.savefig('linep.pdf')
# plt.clf()
# plt.cla()

# sns.lineplot(
# x=out_df.num_samples.astype(
# float,
# ), y=out_df.time, hue=out_df.algo,
# )
# plt.tight_layout()
# plt.ylabel('Time (s)')
# plt.savefig('barp.pdf')
# plt.clf()
# plt.cla()
# Rebuild both figures from the cached benchmark results instead of
# re-running the benchmark itself.
pals = 'coolwarm'
out_df = pd.read_csv('backup.csv')
print(out_df)
print(out_df.groupby('algo').mean())

# Figure 1: estimation error (x) against number of samples (y), per algo.
g = sns.jointplot(
    y=out_df.num_samples, x=out_df.error,
    hue=out_df.algo, alpha=0.6, palette=pals,
)
plt.tight_layout()
g.ax_marg_y.remove()
plt.ylim(0, max(out_df.num_samples.astype(float)))

plt.ylabel('Num. of unique values in data')
plt.xlabel('Abs error')
plt.savefig('hllErr.pdf')
plt.clf()
plt.cla()


# NOTE(review): this histogram is drawn before the jointplot below is
# created; it does not appear to end up in hllTime.pdf -- confirm whether
# it is still needed.
sns.histplot(
    y=out_df.num_samples.astype(
        float,
    ), x=out_df.time, hue=out_df.algo,
    alpha=0.3,
    palette=pals,

)
# Figure 2: wall-clock time (x) against number of samples (y), per algo.
g = sns.jointplot(
    y=out_df.num_samples.astype(
        float,
    ), x=out_df.time, hue=out_df.algo, alpha=.6, style=out_df.algo,
    palette=pals,
)
g.ax_marg_y.remove()
plt.ylim(0, max(out_df.num_samples.astype(float)))
plt.tight_layout()
# The x axis carries the time and the y axis the sample count; the labels
# were previously attached to the opposite axes.
plt.xlabel('Time (s)')
plt.ylabel('Num. of unique values in data')
plt.savefig('hllTime.pdf')
plt.clf()
plt.cla()
2 changes: 2 additions & 0 deletions outrank/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def write_json_dump_to_file(args: Any, config_name: str) -> None:

def internal_hash(input_obj: str) -> int:
    """Return the 32-bit xxHash digest of ``input_obj`` under a fixed seed.

    Integers are converted to their decimal string form before hashing;
    any other input is handed to ``xxhash.xxh32`` unchanged. The seed is
    hard-coded so that the same input always produces the same digest.
    """
    payload = str(input_obj) if isinstance(input_obj, int) else input_obj
    hasher = xxhash.xxh32(payload, seed=20141025)
    return hasher.intdigest()


Expand Down
2 changes: 1 addition & 1 deletion tests/hll_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_hll_update(self):
def test_stress_multi_feature(self):
GLOBAL_CARDINALITY_STORAGE = dict()
for j in range(10):
GLOBAL_CARDINALITY_STORAGE[j] = HyperLogLog(0.01)
GLOBAL_CARDINALITY_STORAGE[j] = HyperLogLog(100000)
for j in range(1000):
for k in range(len(GLOBAL_CARDINALITY_STORAGE)):
GLOBAL_CARDINALITY_STORAGE[k].add(1232 + j)
Expand Down

0 comments on commit b8de664

Please sign in to comment.