Skip to content

Commit 01df4d1

Browse files
committed
ccc: rename pvalue-related parameters and add pvalue_chunksize
1 parent 2ded154 commit 01df4d1

File tree

2 files changed

+42
-44
lines changed

2 files changed

+42
-44
lines changed

libs/ccc/coef/impl.py

+20-20
Original file line number | Diff line number | Diff line change
@@ -310,10 +310,11 @@ def ccc(
310310
internal_n_clusters: Union[int, Iterable[int]] = None,
311311
return_parts: bool = False,
312312
n_chunks_threads_ratio: int = 1,
313-
pvalue_n_permutations: int = None,
313+
pvalue_n_perms: int = None,
314314
random_state: int = None,
315315
n_jobs: int = 1,
316-
n_jobs_permutations: int = 1,
316+
pvalue_n_jobs: int = 1,
317+
pvalue_chunksize: int = 100,
317318
) -> tuple[NDArray[float], NDArray[float], NDArray[np.uint64], NDArray[np.int16]]:
318319
"""
319320
This is the main function that computes the Clustermatch Correlation
@@ -334,15 +335,18 @@ def ccc(
334335
n_chunks_threads_ratio: allows modifying how pairwise comparisons are
335336
split across different threads. It's given as the ratio parameter of
336337
function get_chunks.
337-
pvalue_n_permutations: if given, it computes the p-value of the
338+
pvalue_n_perms: if given, it computes the p-value of the
338339
coefficient using the given number of permutations.
339340
random_state: seed for the random number generator. This is used to compute
340341
the p-value of the coefficient using permutations.
341342
n_jobs: number of CPU cores to use for parallelization. The value
342343
None will use all available cores (`os.cpu_count()`), and negative
343344
values will use `os.cpu_count() - n_jobs`. Default is 1.
344-
n_jobs_permutations: number of CPU cores to use for parallelization when
345+
pvalue_n_jobs: number of CPU cores to use for parallelization when
345346
computing the p-value of the coefficient using permutations.
347+
pvalue_chunksize: number of permutations to compute in each chunk when
348+
computing the p-value of the coefficient using permutations. It's the
349+
chunksize parameter of the executor.map function. Note that chunksize only affects ProcessPoolExecutor; it has no effect with ThreadPoolExecutor.
346350
347351
348352
Returns:
@@ -545,9 +549,9 @@ def compute_coef(idx_list):
545549
max_ari_list[idx] = np.max((comp_values[max_idx], 0.0))
546550

547551
# compute p-value if requested
548-
if pvalue_n_permutations is not None and pvalue_n_permutations > 0:
552+
if pvalue_n_perms is not None and pvalue_n_perms > 0:
549553
with ThreadPoolExecutor(
550-
max_workers=n_jobs_permutations
554+
max_workers=pvalue_n_jobs
551555
) as executor_perms:
552556
# select the variable that generated more partitions as the one
553557
# to permute
@@ -560,7 +564,7 @@ def compute_coef(idx_list):
560564
obj_parts_sel_j = obji_parts
561565

562566
cdist_here = cdist_parts_basic
563-
if n_jobs_permutations == 1:
567+
if pvalue_n_jobs == 1:
564568
cdist_here = cdist_func
565569

566570
def compute_permutations(_):
@@ -586,24 +590,20 @@ def compute_permutations(_):
586590
)
587591
return np.max((p_comp_values[p_max_idx], 0.0))
588592

589-
p_ccc_values = np.full(
590-
pvalue_n_permutations, np.nan, dtype=float
591-
)
593+
p_ccc_values = np.full(pvalue_n_perms, np.nan, dtype=float)
592594
for p_idx, p_ccc_val in zip(
593-
np.arange(pvalue_n_permutations),
595+
np.arange(pvalue_n_perms),
594596
executor_perms.map(
595597
compute_permutations,
596-
np.arange(pvalue_n_permutations),
597-
chunksize=100,
598+
np.arange(pvalue_n_perms),
599+
chunksize=pvalue_chunksize,
598600
),
599601
):
600-
# for i in range(pvalue_n_permutations):
601602
p_ccc_values[p_idx] = p_ccc_val
602-
# p_ccc_values[i] = compute_permutations()
603603

604604
# compute p-value
605605
pvalues[idx] = (np.sum(p_ccc_values >= max_ari_list[idx]) + 1) / (
606-
pvalue_n_permutations + 1
606+
pvalue_n_perms + 1
607607
)
608608

609609
return max_ari_list, max_part_idx_list, pvalues
@@ -621,23 +621,23 @@ def compute_permutations(_):
621621
# return an array of values or a single scalar, depending on the input data
622622
if cm_values.shape[0] == 1:
623623
if return_parts:
624-
if pvalue_n_permutations is not None and pvalue_n_permutations > 0:
624+
if pvalue_n_perms is not None and pvalue_n_perms > 0:
625625
return (cm_values[0], cm_pvalues[0]), max_parts[0], parts
626626
else:
627627
return cm_values[0], max_parts[0], parts
628628
else:
629-
if pvalue_n_permutations is not None and pvalue_n_permutations > 0:
629+
if pvalue_n_perms is not None and pvalue_n_perms > 0:
630630
return cm_values[0], cm_pvalues[0]
631631
else:
632632
return cm_values[0]
633633

634634
if return_parts:
635-
if pvalue_n_permutations is not None and pvalue_n_permutations > 0:
635+
if pvalue_n_perms is not None and pvalue_n_perms > 0:
636636
return (cm_values, cm_pvalues), max_parts, parts
637637
else:
638638
return cm_values, max_parts, parts
639639
else:
640-
if pvalue_n_permutations is not None and pvalue_n_permutations > 0:
640+
if pvalue_n_perms is not None and pvalue_n_perms > 0:
641641
return cm_values, cm_pvalues
642642
else:
643643
return cm_values

tests/test_coef_pval.py

+22-24
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ def test_cm_basic_pvalue_n_permutations_not_given():
2020
feature1 = rs.rand(100)
2121

2222
# Run
23-
cm_value = ccc(feature0, feature1, pvalue_n_permutations=None)
23+
cm_value = ccc(feature0, feature1, pvalue_n_perms=None)
2424

2525
# Validate
2626
assert cm_value is not None
@@ -37,7 +37,7 @@ def test_cm_basic_pvalue_n_permutations_is_zero():
3737
feature1 = rs.rand(100)
3838

3939
# Run
40-
cm_value = ccc(feature0, feature1, pvalue_n_permutations=0)
40+
cm_value = ccc(feature0, feature1, pvalue_n_perms=0)
4141

4242
# Validate
4343
assert cm_value is not None
@@ -54,7 +54,7 @@ def test_cm_basic_pvalue_n_permutations_is_1():
5454
feature1 = rs.rand(100)
5555

5656
# Run
57-
res = ccc(feature0, feature1, pvalue_n_permutations=1)
57+
res = ccc(feature0, feature1, pvalue_n_perms=1)
5858

5959
# Validate
6060
assert len(res) == 2
@@ -78,7 +78,7 @@ def test_cm_basic_pvalue_n_permutations_is_10():
7878
feature1 = rs.rand(100)
7979

8080
# Run
81-
res = ccc(feature0, feature1, pvalue_n_permutations=10)
81+
res = ccc(feature0, feature1, pvalue_n_perms=10)
8282

8383
# Validate
8484
assert len(res) == 2
@@ -101,7 +101,7 @@ def test_cm_linear_pvalue_n_permutations_10():
101101
feature1 = feature0 * 5.0
102102

103103
# Run
104-
res = ccc(feature0, feature1, pvalue_n_permutations=10)
104+
res = ccc(feature0, feature1, pvalue_n_perms=10)
105105

106106
# Validate
107107
assert len(res) == 2
@@ -124,7 +124,7 @@ def test_cm_linear_pvalue_n_permutations_100():
124124
feature1 = feature0 * 5.0
125125

126126
# Run
127-
res = ccc(feature0, feature1, pvalue_n_permutations=100)
127+
res = ccc(feature0, feature1, pvalue_n_perms=100)
128128

129129
# Validate
130130
assert len(res) == 2
@@ -147,7 +147,7 @@ def test_cm_quadratic_pvalue():
147147
feature1 = np.power(feature0, 2.0)
148148

149149
# Run
150-
res = ccc(feature0, feature1, pvalue_n_permutations=100)
150+
res = ccc(feature0, feature1, pvalue_n_perms=100)
151151

152152
# Validate
153153
assert len(res) == 2
@@ -170,7 +170,7 @@ def test_cm_quadratic_noisy_pvalue_with_random_state():
170170
feature1 = np.power(feature0, 2.0) + (2.0 * rs.rand(feature0.shape[0]))
171171

172172
# Run
173-
res = ccc(feature0, feature1, pvalue_n_permutations=100, random_state=2)
173+
res = ccc(feature0, feature1, pvalue_n_perms=100, random_state=2)
174174

175175
# Validate
176176
assert len(res) == 2
@@ -196,7 +196,7 @@ def test_cm_one_feature_with_all_same_values_pvalue():
196196
feature1 = np.array([5] * feature0.shape[0])
197197

198198
# Run
199-
res = ccc(feature0, feature1, pvalue_n_permutations=100)
199+
res = ccc(feature0, feature1, pvalue_n_perms=100)
200200

201201
# Validate
202202
assert len(res) == 2
@@ -222,7 +222,7 @@ def test_cm_single_argument_is_matrix():
222222
input_data = np.array([feature0, feature1, feature2])
223223

224224
# Run
225-
res = ccc(input_data, pvalue_n_permutations=100, random_state=1)
225+
res = ccc(input_data, pvalue_n_perms=100, random_state=1)
226226

227227
# Validate
228228
assert len(res) == 2
@@ -252,11 +252,11 @@ def test_cm_large_n_objects_pvalue_computation_is_parallelized():
252252

253253
# Run
254254
start_time = time.time()
255-
res = ccc(feature0, feature1, pvalue_n_permutations=50, n_jobs=1)
255+
res = ccc(feature0, feature1, pvalue_n_perms=50, n_jobs=1)
256256
elapsed_time_single_thread = time.time() - start_time
257257

258258
start_time = time.time()
259-
res = ccc(feature0, feature1, pvalue_n_permutations=50, n_jobs=2)
259+
res = ccc(feature0, feature1, pvalue_n_perms=50, n_jobs=2)
260260
elapsed_time_multi_thread = time.time() - start_time
261261

262262
# Validate
@@ -273,11 +273,11 @@ def test_cm_medium_n_objects_with_many_pvalue_computation_is_parallelized():
273273

274274
# Run
275275
start_time = time.time()
276-
res = ccc(feature0, feature1, pvalue_n_permutations=1000, n_jobs=1)
276+
res = ccc(feature0, feature1, pvalue_n_perms=1000, n_jobs=1)
277277
elapsed_time_single_thread = time.time() - start_time
278278

279279
start_time = time.time()
280-
res = ccc(feature0, feature1, pvalue_n_permutations=1000, n_jobs_permutations=2)
280+
res = ccc(feature0, feature1, pvalue_n_perms=1000, pvalue_n_jobs=2)
281281
elapsed_time_multi_thread = time.time() - start_time
282282

283283
# Validate
@@ -298,7 +298,7 @@ def test_cm_return_parts_quadratic_pvalue():
298298
feature1,
299299
internal_n_clusters=[2, 3],
300300
return_parts=True,
301-
pvalue_n_permutations=10,
301+
pvalue_n_perms=10,
302302
)
303303

304304
# Validate
@@ -350,7 +350,7 @@ def test_cm_numerical_and_categorical_features_perfect_relationship_pvalue():
350350
res = ccc(
351351
numerical_feature0,
352352
categorical_feature1,
353-
pvalue_n_permutations=100,
353+
pvalue_n_perms=100,
354354
)
355355

356356
# Validate
@@ -366,9 +366,7 @@ def test_cm_numerical_and_categorical_features_perfect_relationship_pvalue():
366366
assert pvalue == (0 + 1) / (100 + 1)
367367

368368
# Run with flipped variables (symmetry)
369-
assert (
370-
ccc(categorical_feature1, numerical_feature0, pvalue_n_permutations=100) == res
371-
)
369+
assert ccc(categorical_feature1, numerical_feature0, pvalue_n_perms=100) == res
372370

373371

374372
def test_cm_numerical_and_categorical_features_weakly_relationship_pvalue():
@@ -397,7 +395,7 @@ def test_cm_numerical_and_categorical_features_weakly_relationship_pvalue():
397395
res = ccc(
398396
categorical_feature1,
399397
numerical_feature0,
400-
pvalue_n_permutations=100,
398+
pvalue_n_perms=100,
401399
random_state=1,
402400
)
403401

@@ -418,7 +416,7 @@ def test_cm_numerical_and_categorical_features_weakly_relationship_pvalue():
418416
ccc(
419417
numerical_feature0,
420418
categorical_feature1,
421-
pvalue_n_permutations=100,
419+
pvalue_n_perms=100,
422420
random_state=1,
423421
)
424422
== res
@@ -442,7 +440,7 @@ def test_cm_numerical_and_categorical_features_a_single_categorical_value():
442440
res = ccc(
443441
numerical_feature0,
444442
categorical_feature1,
445-
pvalue_n_permutations=100,
443+
pvalue_n_perms=100,
446444
random_state=1,
447445
)
448446

@@ -463,7 +461,7 @@ def test_cm_numerical_and_categorical_features_a_single_categorical_value():
463461
ccc(
464462
categorical_feature1,
465463
numerical_feature0,
466-
pvalue_n_permutations=100,
464+
pvalue_n_perms=100,
467465
random_state=1,
468466
)
469467
== res
@@ -478,7 +476,7 @@ def test_cm_with_pandas_dataframe_several_features():
478476
data = pd.DataFrame(rs.rand(20, 50))
479477

480478
# Run
481-
res = ccc(data, internal_n_clusters=3, pvalue_n_permutations=10, random_state=1)
479+
res = ccc(data, internal_n_clusters=3, pvalue_n_perms=10, random_state=1)
482480

483481
# Validate
484482
assert len(res) == 2

0 commit comments

Comments
 (0)