From 51716244d919572e8a6f39003d5088d157fee415 Mon Sep 17 00:00:00 2001 From: nunofachada <3018963+nunofachada@users.noreply.github.com> Date: Tue, 23 Jan 2024 12:33:50 +0000 Subject: [PATCH] =?UTF-8?q?Deploying=20to=20gh-pages=20from=20@=20clugen/p?= =?UTF-8?q?yclugen@49fcc577c353a920630755cd795cae641e88379e=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- generated/gallery/gallery_jupyter.zip | Bin 73057 -> 73057 bytes generated/gallery/gallery_python.zip | Bin 35575 -> 35575 bytes .../gallery/mg_execution_times/index.html | 14 +++++++------- .../gallery/plot_1_1d_examples/index.html | 2 +- .../gallery/plot_1_1d_examples_codeobj.pickle | Bin 671 -> 671 bytes .../gallery/plot_2_2d_examples/index.html | 2 +- .../gallery/plot_2_2d_examples_codeobj.pickle | Bin 5657 -> 5642 bytes .../gallery/plot_3_3d_examples/index.html | 2 +- .../gallery/plot_3_3d_examples_codeobj.pickle | Bin 5807 -> 5789 bytes .../gallery/plot_4_nd_examples/index.html | 2 +- .../gallery/plot_4_nd_examples_codeobj.pickle | Bin 863 -> 863 bytes .../gallery/plot_5_mrg_examples/index.html | 2 +- .../plot_5_mrg_examples_codeobj.pickle | Bin 2335 -> 2335 bytes generated/gallery/plot_functions/index.html | 2 +- .../gallery/plot_functions_codeobj.pickle | Bin 3437 -> 3434 bytes search/search_index.json | 2 +- sitemap.xml.gz | Bin 307 -> 307 bytes 17 files changed, 14 insertions(+), 14 deletions(-) diff --git a/generated/gallery/gallery_jupyter.zip b/generated/gallery/gallery_jupyter.zip index aceda97c9a773f56d225cb44c9bdd5453082e42a..500ee36cdedda439ac50bf59def345878b210445 100644 GIT binary patch delta 111 zcmaF3i{;@i7M=iaW)=|!5YSKA$m1QuWS+7)GUkj7h*qx@RA(|x*=)67z9^Gs%I4c! 
z!+4mCQZ^eOYK~;mPno_$n$ZQQZu&oIMpqE+D#PdiR6o641}LtXGW{%wHcFW;EX(Kz E0BPMJ0ssI2 delta 111 zcmaF3i{;@i7M=iaW)=|!5LlbMk;glRX>0Q4$e1%SAX>drP@QRW@@A_A^F^6fByYaG zHH?R8L-Jv9CDotz1z^iw9U>GB2u DZ0R90 delta 105 zcmex9mFfFbCY}IqW)=|!5Ll7Ck!PJY)8^#Od$g;0n6@TwwzOcEX4;Uv*(unYjcG;l z=Ke@^7N)hyn|H)6(+2V$|g`FW=;Q1N87E=Q1DCXia2Jb6u*HvoK- BClvqy diff --git a/generated/gallery/mg_execution_times/index.html b/generated/gallery/mg_execution_times/index.html index 23fbd3c..c6e94b9 100644 --- a/generated/gallery/mg_execution_times/index.html +++ b/generated/gallery/mg_execution_times/index.html @@ -629,19 +629,19 @@

Computation times

-

00:26.400 total execution time for generated_gallery files:

+

00:29.296 total execution time for generated_gallery files:

+----------------------------------------------------------------------------------------+-----------+--------+ -| plot_2_2d_examples (docs/examples/plot_2_2d_examples.py) | 00:10.292 | 0.0 MB | +| plot_2_2d_examples (docs/examples/plot_2_2d_examples.py) | 00:11.459 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ -| plot_4_nd_examples (docs/examples/plot_4_nd_examples.py) | 00:06.376 | 0.0 MB | +| plot_4_nd_examples (docs/examples/plot_4_nd_examples.py) | 00:07.035 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ -| plot_3_3d_examples (docs/examples/plot_3_3d_examples.py) | 00:05.237 | 0.0 MB | +| plot_3_3d_examples (docs/examples/plot_3_3d_examples.py) | 00:05.724 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ -| plot_5_mrg_examples (docs/examples/plot_5_mrg_examples.py) | 00:03.156 | 0.0 MB | +| plot_5_mrg_examples (docs/examples/plot_5_mrg_examples.py) | 00:03.588 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ -| plot_1_1d_examples (docs/examples/plot_1_1d_examples.py) | 00:01.333 | 0.0 MB | +| plot_1_1d_examples (docs/examples/plot_1_1d_examples.py) | 00:01.484 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ -| plot_functions (docs/examples/plot_functions.py) | 00:00.005 | 0.0 MB | +| plot_functions (docs/examples/plot_functions.py) | 00:00.006 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+

diff --git a/generated/gallery/plot_1_1d_examples/index.html b/generated/gallery/plot_1_1d_examples/index.html index 307d43e..25032f3 100644 --- a/generated/gallery/plot_1_1d_examples/index.html +++ b/generated/gallery/plot_1_1d_examples/index.html @@ -749,7 +749,7 @@

Basic 1D example with density plot e084, "e084: custom proj_dist_fn (Weibull)")

e082: proj_dist_fn = 'norm' (default), e083: proj_dist_fn = 'unif', e084: custom proj_dist_fn (Weibull)

-

Total running time of the script: ( 0 minutes 1.333 seconds)

+

Total running time of the script: ( 0 minutes 1.484 seconds)

Download Python source code: plot_1_1d_examples.py

diff --git a/generated/gallery/plot_1_1d_examples_codeobj.pickle b/generated/gallery/plot_1_1d_examples_codeobj.pickle index c9fdce14d51a3d4e4d1e0bc5d27411275ad08cd2..9599db03eeb6fe59b3e3fd4b970ca89839381ee3 100644 GIT binary patch delta 229 zcmXv{Jqp4w7$v6GiWXd4oJ5y`P%DCn=;Sd(N^OG)q)|*WD0l!NgVJyBncTgHt7(hV zd%WMGSu~&19SN3m*n@WmPg^2N320}_mCVRRg0|CBqPWrA^EKw`NQ9q8On~mzMkwj& zl9|fiHk8RQ=8=?;-_R)NQTL&Uwj?2m`7t-Nr7;s3(6)9Cit&sI3A97|WiraYXl4&8 s`uePpTaMr{b@KvEv1jm&hW4Zy&Hcs#2Sz$J0z5akMw`0etgC?Xe^;MR1ONa4 delta 188 zcmbQwI-hlden4?*YRZ(@DYa8HdRX!jb5p1EFlXkKOzC0E%}*)K0dhH#N;7jxGV_Y3 z^zcB$;)^r#i%O=`2m`VM#TxFrMtdWWvZZ*^g0)M>K<{vosYf zY=RJGmdoIoY|dm0QN*kcPcxb^TLL9nm<%{< Iz>=kU07`s30ssI2 diff --git a/generated/gallery/plot_2_2d_examples/index.html b/generated/gallery/plot_2_2d_examples/index.html index 6f76309..e4bc257 100644 --- a/generated/gallery/plot_2_2d_examples/index.html +++ b/generated/gallery/plot_2_2d_examples/index.html @@ -1175,7 +1175,7 @@

Direct specification of opt e042, "e042: direct params 3")

e040: direct params 1, e041: direct params 2, e042: direct params 3

-

Total running time of the script: ( 0 minutes 10.292 seconds)

+

Total running time of the script: ( 0 minutes 11.459 seconds)

Download Python source code: plot_2_2d_examples.py

diff --git a/generated/gallery/plot_2_2d_examples_codeobj.pickle b/generated/gallery/plot_2_2d_examples_codeobj.pickle index 973cbac0e325619bb4898ceb00136295c15989e0..0641ffd6a7fcae278e1d7fb1f1513423854e7bbd 100644 GIT binary patch delta 816 zcmY*XOK1~O6zzMN$>cMW$)|CpB3)>esF|d;R;^emLd7n+Ng7BkQyIMuro=_WidE5t zA1WlAxQIKeSnz``3gX6H5nQO?LPQsS7Gf8*MFe-6Hhhug3VtcPo?5Q-}LQ+_X zWjS3&nDwex1#F5KDJO7JCn#1-$ADhSBaAG4p66{R1o0M5zJXt0G ztUOgJ6sM~cq$fC}8=Mw!USH4Y8SeNdG2m0M7}>1QqH`2SBYA~hyXSsKk{Z2pa1o{_ z>2~r7AHDtRmlmGvaq%T6!9JCKwg>~GY)t**m^q~ybzzn<9E=Zc^|Fo*!MIsH1@r8G zVQqhdw_pc$1?*5`G%=&(5l}-=!#RCZQzY!u{lXsaRSiEvISgnT49^(1_u^a3(udYc zQ;t*`sbd(G%z&L~ve+3N*C(2j^9D|8iRxJ8vB~YzI3DizpJ_-;d{*+@z6e#d*freK zhS^OlYh8MItw&n?C|Op+Ow3|)xUE~#!#a-zsW)IhZCk11>^X)z6Z(rL=JY0t&q^=F z#_`L8?#@R9IxU8=J&~B&{d$CXyBsVOLhh(bPf+R14S2BNN$@0ywH{Ls7Ck7Uh#tJUGfmdheDizXyzjlaV=kHa8eY(M zwy-HV`Id7Ny2%O9Q4G4+O6;NJcEfGTzLkaHbXlYOIq*3c;XtBR>#2tdFqW|-dK4~Y zh9z189ee0F)QBt5lVBhZRlpbwE0hsE6j^ ze9|R!G0d}UD9L5?5Ej&e_DDPxV~ldFJ^Sx-1Xv4!lM~=o2yF2Ly#XcWp%1XCxeDDB zRhW)fwJ)JDyB+xhvPv%cn`0O5B@Pq%S7;t@=iz#)EJsiz0weJpW|*sW)s>hI%NZYX z8i7y7B&-DIm_Df<4C>oiL+L7^o4(sp>f zI^Soo602ag$jvNf#@YQ|!M&?mNw(U*@o^m3GA86F+*(IRFzayDLBT}NxmuEon5}WO zDccfz2_%^9HM}Q2dJ9+5W^yB_g+FQK#2noo^P!WjqR(I_O8dU{*lha%Rhm0$?o6Ai hO+3}8OManipulating cluster sizes e084, "e084: equal size (custom)")

e082: normal dist. (default), e083: unif. dist. (custom), e084: equal size (custom)

-

Total running time of the script: ( 0 minutes 5.237 seconds)

+

Total running time of the script: ( 0 minutes 5.724 seconds)

Download Python source code: plot_3_3d_examples.py

diff --git a/generated/gallery/plot_3_3d_examples_codeobj.pickle b/generated/gallery/plot_3_3d_examples_codeobj.pickle index 79799d1f45a56b025718a19594d08f0c9ee9984d..f8b286529d8ac89b35e94108bd8d9472868ca918 100644 GIT binary patch literal 5789 zcmcJTJ!lj`6oAjSl?GlR^llu!+;$f$PNIDtDySeNRd%K(c(Ig@Stb}dU zwXm_Uv9PfT*eHUPAT}b1kT#8=r4|alx!IeYO*lO<@5*D|`{w=3``OLH!~4^@-Y+V( z9Lr*@z;tXsj!KukHuG4!X|{s+K|FgUo{5UKJGa3+$B(bYGq>T7yiL0-j*edsO)D^M z(h2RJJaIhIWr5+e<7lkwv_lI{6_Zn-ye$7CzTtR5Y#fS;rcYWH_5Jv62#SsIs31Yc z!Kh$|U3Wb;O2(8?z8kVA&*rQ3ths;TRkcHk46oFM`0x)u$X=DL2(7u7dYKJEz(vfd8TB1F>iTrzqk znnh)5ue6y(mgY&v22@hC2a+BM5*2-+j+L5EQjga(4s`v?R9(I&lTr9u zDijSH>nXgIh*8%HQeRzyt6W`o*&gfPi_~H|R6jIrldcSx znxCZPBs?SE++aRot0A?9KyFJxk<(ZSw<8cN8iR0m=^ETg%yba#W8|C|8ZIQmT+{Sj z8ng`NWwYgr5SJ38LOv7uuTri`+YWVnf{)^ zgx^x95BAnpm;tph^B?wBzL{u}y=jbkJym*p;&+r$Vm84FP^44pSOI#dQ>s|c^RZ5; zU^UIBI;Doy0-h_%gqq7j+w7!>p6LWiaDWgFlg@Gv8DC_g?-R&-WyPN&MBq%6W zk?XSQ#76Bzu+c`aQ3MgJ1Q9H3ENm=nEEJq~Z*TT(j*#a`OUnU`M=q_ncI9Y z*46xJ43=Hn=H<|Iogi)uF8USj^UAzg4&#mZ*it;v=;2evnfP)%aYOv!usV+${ip3H z2)Pd?y>+(|+3>$%Z?$YkHSSE-nd!uhA^CwG7_J|JN$X?y)eQ8q&4M6akHD$Xr}#4l zlqZcrW7HU4kGKdTUr2)3`urBKF{pX~>zxn>5K)>ULL8BZA?DO7+}3BO^r{1>B(iGc zGz$=mhw7xic|=8jNAy<)OjXXAYfgUh!B_J3>J9T^cy(L!Ydt&$#Y@X9Fqc-)V-3S?S%>UT+;~y-O;CYzT$Nd-WwU- z2u?p=j+otn;XBMLPj%!UWL_{~1uF6ToFb@1>}!gk8shJWD4ir~|DT8`5G^xSZdX@V zN+D7sYa~MtwV_;c^l`XCs>bi zPE_|+VUx8EBWeIp%!fso{~haShSi(6%D~1E)(0@x;YwQlt#sOwIG%Cb(=^MtJ%Mh; zuoLZTR!ODgsVH+QrARc7)&f#SJ=+aI?OENkVSAeqmy&p)6#LG$6 zzby~qrO8@C=n%ZwI(p4_uXVJKSr-$_=0&IxNv(ASid4e+O{Em5B<%-M#KOo#5z?ys$-?+6aU{DmCp~3~)3rO
plot_examples_nd(e087, "e087: 4D with custom proj_dist_fn (Beta)")
 

e087: 4D with custom proj_dist_fn (Beta)

-

Total running time of the script: ( 0 minutes 6.376 seconds)

+

Total running time of the script: ( 0 minutes 7.035 seconds)

Download Python source code: plot_4_nd_examples.py

diff --git a/generated/gallery/plot_4_nd_examples_codeobj.pickle b/generated/gallery/plot_4_nd_examples_codeobj.pickle index 88be522b49e038fa2176fa9860efb3402182baee..6b6684b87cb2c3e13f30bb67a93b92fe8d25106e 100644 GIT binary patch delta 170 zcmcc5cAss6-sDY;5`sM}d5O8HQ+hbkO7oISGV}8$IvY=Z!>Gr|Kk=$2n@9#vXKCW( z8%(_SSdzeL z4APL=!<0AizC51=*q|P^ L5=S6dxKs}SU{N;? delta 171 zcmcc5cAss6o+wjZ%9PkCwNo^DSn?8cQ>XMWXXcem3^$&9o-vM5bmC4;HmMAr&eFum z8<;d0wHfR^Ectn<#Zxjkdsy>Ia|HrOrFkY%n=Au IlUS+;0E|mHQUCw| diff --git a/generated/gallery/plot_5_mrg_examples/index.html b/generated/gallery/plot_5_mrg_examples/index.html index fc03666..64f16fa 100644 --- a/generated/gallery/plot_5_mrg_examples/index.html +++ b/generated/gallery/plot_5_mrg_examples/index.html @@ -844,7 +844,7 @@

Merging with data not gener clusters_field="hclusters")

e097: generated w/ make_moons(), e098: generated w/ clugen(), e099: merged data

-

Total running time of the script: ( 0 minutes 3.156 seconds)

+

Total running time of the script: ( 0 minutes 3.588 seconds)

Download Python source code: plot_5_mrg_examples.py

diff --git a/generated/gallery/plot_5_mrg_examples_codeobj.pickle b/generated/gallery/plot_5_mrg_examples_codeobj.pickle index e06040347f6d9e44017d7ac085592dc6679b95e8..85d487ac7f8af6b8a0bda3ec2180070c68c5b87b 100644 GIT binary patch delta 597 zcmYLG&ubGw7-jd{%x*Ocp=vL&X&Vx{Pzk|ENGVc(Tr5~Y3Zf7Ru`b!b?lxw-V3msE zMPUlo84rpOkV1`Xmw9i#@4b05-wcKa!%r$r!>u+N z^15_ZrNfP|9k`NP=lwWZcKea!ELgf3`>5Hyl|?`9xM3w|cSFguX`>Z+OZ{b7{Onp$~n+WO1Zw}(1K^Le<{%w!0!TGgCVbtQWH3%6{rPlYRBmYaMW^GNBP8MvqU~h zfAkvN0-mH>>!sOV)q>8N2&RvR%UA{jp+nEMR$Y7=^1c8 z1*=K};9VZ;uDqG=ak-K=h$9KUmElD6{mhT8Z3#6 GD}Mp_6Tz(j delta 588 zcmbO)G+$_fUJyrePHApxQF`i>*eM#dQ#5*5@)C1Xr}S{7mF6XvWaj5h>0!&wPbtj- zi6xb0=9FaS6;J8mfr!NyXXF=^Ov&Kt;m9nGPtHj!E}k;^G`kWTe+ExyY3k%kW({VM z44%nLm{pj?f%Iu+Ic6y!{f1eVSr$n1u*fkh0BKzo8D?c5eUeRENdu%6=#mWf9;SlK zDH)tSta+um1(j1WxHGgN#$I4mWi|jR&tcbKGX)7HPM*P}%4P*&1~Mx#+XH1^Gublo zOs-{B;_(1$>tRVXur!`Lk4c`{7bv}u#h5t=NMC2MX8}9*H>(bFG*Cc`O@S>AVv!t& zDqAwd)FwtxplN3q&D7z}<8)8WOD#$)$uB~6WdTI1?Ca%|DtAm&% z26S~ZgsI77#@qo^lEh@d+zq6C*)(`2fTci=wXm4%!D<5W1KUi9p`IL0Z1W(RrC61i z7XvlfvqJsJwg#f6ibaKaV+PN}TfWRdw@!R7Ex=Jw35@6T)I3li?gHuwWmIS03#5Hm zG?)(o=?XRj=HozmHJdcgDX=Nv&@u;w7SDOGeP96-aPVCLs&r)6;J68vDAfZ1#|69V diff --git a/generated/gallery/plot_functions/index.html b/generated/gallery/plot_functions/index.html index f25db20..44b1a5d 100644 --- a/generated/gallery/plot_functions/index.html +++ b/generated/gallery/plot_functions/index.html @@ -1015,7 +1015,7 @@

plot_examples_nd g.axes[i, j].set_ylim([xmins.iloc[i], xmaxs.iloc[i]]) g.axes[i, j].set_aspect(1) -

Total running time of the script: ( 0 minutes 0.005 seconds)

+

Total running time of the script: ( 0 minutes 0.006 seconds)

Download Python source code: plot_functions.py

diff --git a/generated/gallery/plot_functions_codeobj.pickle b/generated/gallery/plot_functions_codeobj.pickle index 56c8375d9615a7dc65e8a83e754e628c077fe22e..01117e602048b27d37a63e6e5e366672d80be493 100644 GIT binary patch delta 556 zcmY*VJ7^OD819#QCAq5wEfm_6v`r$ZO2A^ULL?m|D2Np6Ad2Xa}XRH6a*av2VV$+9Sa@YL-<50ivNIFzCLMET&a`arWQMQf1L&gMnU5IV9P1oZj2=U~ z=}dD~%_Bic+m0lk$(NDk;dlx=d0a75BZhXvS@g4p;wncDlx9OJBj(Bt-74$0Z>UC| zUzIuvJQf{7kuRcsY(|fw6n4#+ibK?5^B50mg7<=Xf{%mwL{JkPi>o-bL3W1AI0uuT z#cg~kme3%iDa`X*B!vYQ)IGSu^XdSuu_f)pb*AMaN3`9z#j;lNR&%)H{Z8O6mxKx) z@VhXGhg=cTc*LdP{|T3+44$$NyYQR`(T`WWukBA7Z^8gJ-3c}QV(3h=oLb{jXzVRV zMGGtcG`$NoN$tV!X7n^HC9kew%U-QZ7yu2)y7=+nD6=#zUZ!M zTXt^NIm-`f7C-6BL-@s}x_8T;&AV(ha{8prVf1IJB04(S$U?CHPi$2Q?%XOArK%Qo K6G9>46 delta 498 zcmY*VPe>GT81?PU{AR|PrNTdvtmNvhZnbM3TsA>kq7WjIphLz|FzVK{Iit7@s402z z5Ddlerb9u8BIp)%6Y5gZEeN|5U7{cgAxwz6ba2Lm@b_ot8n8!VCpbPhTBhku3iL-bVp)?-zL1Ge5xJT;5Gxkdk zmibWX#&aH%Q~WAb@G3$*c+Jnzyd+ofCPF=U8>fkP@tf74(7_K{yY${RgWEZLWQ%bG zpEzt}@P*e!Kfdv)=)f8+vn1C4mA4l~{NibJ;}1`$CH&>QIz&e=Worr=8vfTOIo8aM Y_G((R50;BcEg6ILv{~D|?WlUao5IzhQvd(} diff --git a/search/search_index.json b/search/search_index.json index d036dc0..3750f22 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"pyclugen","text":"

pyclugen is Python package for generating multidimensional clusters. Each cluster is supported by a line segment, the position, orientation and length of which guide where the respective points are placed. The clugen() function is provided for this purpose, as well as a number of auxiliary functions, used internally and modularly by clugen(). Users can swap these auxiliary functions by their own customized versions, fine-tuning their cluster generation strategies, or even use them as the basis for their own generation algorithms.

"},{"location":"#installation","title":"Installation","text":"

Install from PyPI:

pip install --upgrade pip\npip install pyclugen\n

Or directly from GitHub:

pip install --upgrade pip\npip install git+https://github.com/clugen/pyclugen.git#egg=pyclugen\n
"},{"location":"#quick-start","title":"Quick start","text":"
from pyclugen import clugen\nimport matplotlib.pyplot as plt\n
out2 = clugen(2, 4, 400, [1, 0], 0.4, [50, 10], 20, 1, 2)\nplt.scatter(out2.points[:, 0], out2.points[:, 1], c=out2.clusters)\nplt.show()\n
out3 = clugen(3, 5, 10000, [0.5, 0.5, 0.5], 0.2, [10, 10, 10], 10, 1, 2)\nfig = plt.figure()\nax = fig.add_subplot(projection=\"3d\")\nax.scatter(out3.points[:, 0], out3.points[:, 1], out3.points[:, 2], c=out3.clusters)\nplt.show()\n
"},{"location":"#further-reading","title":"Further reading","text":"

The clugen algorithm and its several implementations are detailed in the following reference (please cite it if you use this software):

  • Fachada, N. & de Andrade, D. (2023). Generating multidimensional clusters with support lines. Knowledge-Based Systems, 277, 110836. https://doi.org/10.1016/j.knosys.2023.110836 (arXiv preprint)
"},{"location":"#also-in-this-documentation","title":"Also in this documentation","text":"
  • Theory: the clugen algorithm in detail
  • Detailed usage examples
  • Reference
  • Developing this package
"},{"location":"dev/","title":"Development","text":""},{"location":"dev/#installing-for-development-andor-improving-the-package","title":"Installing for development and/or improving the package","text":"
$ git clone https://github.com/clugen/pyclugen.git\n$ cd pyclugen\n$ python -m venv env\n$ source env/bin/activate\n$ pip install -e .[dev]\n$ pre-commit install\n

On Windows replace source env/bin/activate with . env\\Scripts\\activate.

"},{"location":"dev/#run-tests","title":"Run tests","text":"

Tests can be executed with the following command:

$ pytest\n

The previous command runs the tests at normal level by default. This test level can also be specified explicitly:

$ pytest --test-level=normal\n

There are four test levels, from fastest to slowest (i.e., from less thorough to more exhaustive): fast, ci, normal and full. The fast level tests all functions using typical parameters, just to check if everything is working. The ci level performs the minimal amount of testing that yields complete test coverage. Beyond complete coverage, the normal and full levels also test increasing combinations of parameters and PRNG seeds, which may be important to root out rare corner cases. Note that the full level can be extremely slow.

To generate a test coverage report, run pytest as follows:

$ pytest --cov=pyclugen --cov-report=html --test-level=ci\n
"},{"location":"dev/#build-docs","title":"Build docs","text":"

Considering we're in the pyclugen folder, run the following commands:

$ cd docs\n$ mkdocs build\n

The generated documentation will be placed in docs/site. Alternatively, the documentation can be generated and served locally with:

$ mkdocs serve\n
"},{"location":"dev/#code-style","title":"Code style","text":"

Code style is enforced with flake8 (and a number of plugins), black, and isort. Some highlights include, but are not limited to:

  • Encoding: UTF-8
  • Indentation: 4 spaces (no tabs)
  • Line size limit: 88 chars
  • Newlines: Unix style, i.e. LF or \\n
"},{"location":"reference/","title":"Reference","text":"

Various functions for multidimensional cluster generation in Python.

Note that:

  1. clugen() is the main function of the pyclugen package, and possibly the only function most users will need.
  2. Functions which accept rng as the last parameter are stochastic. Thus, in order to obtain the same result on separate invocations of these functions, pass them an instance of same pseudo-random number Generator initialized with the same seed.
"},{"location":"reference/#pyclugen.Clusters","title":"Clusters","text":"

Bases: NamedTuple

Read-only container for results returned by clugen().

The symbols presented in the instances variable below have the following meanings:

  • \\(n\\) : Number of dimensions.
  • \\(p\\) : Number of points.
  • \\(c\\) : Number of clusters.
Source code in pyclugen/main.py
class Clusters(NamedTuple):\n    r\"\"\"Read-only container for results returned by [`clugen()`][pyclugen.main.clugen].\n\n    The symbols presented in the instances variable below have the following\n    meanings:\n\n    - $n$ : Number of dimensions.\n    - $p$ : Number of points.\n    - $c$ : Number of clusters.\n    \"\"\"\n\n    points: NDArray\n    r\"\"\"$p \\times n$ matrix containing the generated points for all clusters.\"\"\"\n\n    clusters: NDArray\n    r\"\"\"Vector of size $p$ indicating the cluster each point in `points`\n    belongs to.\"\"\"\n\n    projections: NDArray\n    r\"\"\"$p \\times n$ matrix with the point projections on the cluster-supporting\n    lines.\"\"\"\n\n    sizes: NDArray\n    r\"\"\"Vector of size $c$ with the number of points in each cluster.\"\"\"\n\n    centers: NDArray\n    r\"\"\"$c \\times n$ matrix with the coordinates of the cluster centers.\"\"\"\n\n    directions: NDArray\n    r\"\"\"$c \\times n$ matrix with the direction of each cluster-supporting line.\"\"\"\n\n    angles: NDArray\n    r\"\"\"Vector of size $c$ with the angles between the cluster-supporting lines and\n    the main direction.\"\"\"\n\n    lengths: NDArray\n    r\"\"\"Vector of size $c$ with the lengths of the cluster-supporting lines.\"\"\"\n
"},{"location":"reference/#pyclugen.Clusters.angles","title":"angles instance-attribute","text":"
angles: NDArray\n

Vector of size \\(c\\) with the angles between the cluster-supporting lines and the main direction.

"},{"location":"reference/#pyclugen.Clusters.centers","title":"centers instance-attribute","text":"
centers: NDArray\n

\\(c \\times n\\) matrix with the coordinates of the cluster centers.

"},{"location":"reference/#pyclugen.Clusters.clusters","title":"clusters instance-attribute","text":"
clusters: NDArray\n

Vector of size \\(p\\) indicating the cluster each point in points belongs to.

"},{"location":"reference/#pyclugen.Clusters.directions","title":"directions instance-attribute","text":"
directions: NDArray\n

\\(c \\times n\\) matrix with the direction of each cluster-supporting line.

"},{"location":"reference/#pyclugen.Clusters.lengths","title":"lengths instance-attribute","text":"
lengths: NDArray\n

Vector of size \\(c\\) with the lengths of the cluster-supporting lines.

"},{"location":"reference/#pyclugen.Clusters.points","title":"points instance-attribute","text":"
points: NDArray\n

\\(p \\times n\\) matrix containing the generated points for all clusters.

"},{"location":"reference/#pyclugen.Clusters.projections","title":"projections instance-attribute","text":"
projections: NDArray\n

\\(p \\times n\\) matrix with the point projections on the cluster-supporting lines.

"},{"location":"reference/#pyclugen.Clusters.sizes","title":"sizes instance-attribute","text":"
sizes: NDArray\n

Vector of size \\(c\\) with the number of points in each cluster.

"},{"location":"reference/#pyclugen.angle_btw","title":"angle_btw","text":"
angle_btw(v1: NDArray, v2: NDArray) -> float\n

Angle between two \\(n\\)-dimensional vectors.

Typically, the angle between two vectors v1 and v2 can be obtained with:

arccos(dot(u, v) / (norm(u) * norm(v)))\n

However, this approach is numerically unstable. The version provided here is numerically stable and based on the AngleBetweenVectors Julia package by Jeffrey Sarnoff (MIT license), implementing an algorithm provided by Prof. W. Kahan in these notes (see page 15).

Examples:

>>> from numpy import array, degrees\n>>> from pyclugen import angle_btw\n>>> v1 = array([1.0, 1.0, 1.0, 1.0])\n>>> v2 = array([1.0, 0.0, 0.0, 0.0])\n>>> degrees(angle_btw(v1, v2))\n60.00000000000001\n

Parameters:

Name Type Description Default v1 NDArray

First vector.

required v2 NDArray

Second vector.

required

Returns:

Type Description float

Angle between v1 and v2 in radians.

Source code in pyclugen/helper.py
def angle_btw(v1: NDArray, v2: NDArray) -> float:\n    r\"\"\"Angle between two $n$-dimensional vectors.\n\n    Typically, the angle between two vectors `v1` and `v2` can be obtained with:\n\n    ```python\n    arccos(dot(u, v) / (norm(u) * norm(v)))\n    ```\n\n    However, this approach is numerically unstable. The version provided here is\n    numerically stable and based on the\n    [AngleBetweenVectors](https://github.com/JeffreySarnoff/AngleBetweenVectors.jl)\n    Julia package by Jeffrey Sarnoff (MIT license), implementing an algorithm\n    provided by Prof. W. Kahan in\n    [these notes](https://people.eecs.berkeley.edu/~wkahan/MathH110/Cross.pdf)\n    (see page 15).\n\n    Examples:\n        >>> from numpy import array, degrees\n        >>> from pyclugen import angle_btw\n        >>> v1 = array([1.0, 1.0, 1.0, 1.0])\n        >>> v2 = array([1.0, 0.0, 0.0, 0.0])\n        >>> degrees(angle_btw(v1, v2))\n        60.00000000000001\n\n    Args:\n      v1: First vector.\n      v2: Second vector.\n\n    Returns:\n      Angle between `v1` and `v2` in radians.\n    \"\"\"\n    u1 = v1 / norm(v1)\n    u2 = v2 / norm(v2)\n\n    y = u1 - u2\n    x = u1 + u2\n\n    return 2 * arctan(norm(y) / norm(x))\n
"},{"location":"reference/#pyclugen.angle_deltas","title":"angle_deltas","text":"
angle_deltas(\n    num_clusters: int, angle_disp: float, rng: Generator = _default_rng\n) -> NDArray\n

Get angles between average cluster direction and cluster-supporting lines.

Determine the angles between the average cluster direction and the cluster-supporting lines. These angles are obtained from a wrapped normal distribution ( \\(\\mu=0\\), \\(\\sigma=\\)angle_disp) with support in the interval \\(\\left[-\\pi/2,\\pi/2\\right]\\). Note this is different from the standard wrapped normal distribution, the support of which is given by the interval \\(\\left[-\\pi,\\pi\\right]\\).

Examples:

>>> from pyclugen import angle_deltas\n>>> from numpy import degrees, pi\n>>> from numpy.random import Generator, PCG64\n>>> prng = Generator(PCG64(123))\n>>> a_rad = angle_deltas(4, pi/8, rng=prng) # Angle dispersion of 22.5 degrees\n>>> a_rad\narray([-0.38842705, -0.14442948,  0.50576707,  0.07617358])\n>>> degrees(a_rad) # Show angle deltas in degrees\narray([-22.25523038,  -8.27519966,  28.97831838,   4.36442443])\n

Parameters:

Name Type Description Default num_clusters int

Number of clusters.

required angle_disp float

Angle dispersion, in radians.

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Angles between the average cluster direction and the cluster-supporting lines, given in radians in the interval \\(\\left[-\\pi/2,\\pi/2\\right]\\).

Source code in pyclugen/module.py
def angle_deltas(\n    num_clusters: int, angle_disp: float, rng: Generator = _default_rng\n) -> NDArray:\n    r\"\"\"Get angles between average cluster direction and cluster-supporting lines.\n\n    Determine the angles between the average cluster direction and the\n    cluster-supporting lines. These angles are obtained from a wrapped normal\n    distribution ( $\\mu=0$, $\\sigma=$`angle_disp`) with support in the interval\n    $\\left[-\\pi/2,\\pi/2\\right]$. Note this is different from the standard\n    wrapped normal distribution, the support of which is given by the interval\n    $\\left[-\\pi,\\pi\\right]$.\n\n    Examples:\n        >>> from pyclugen import angle_deltas\n        >>> from numpy import degrees, pi\n        >>> from numpy.random import Generator, PCG64\n        >>> prng = Generator(PCG64(123))\n        >>> a_rad = angle_deltas(4, pi/8, rng=prng) # Angle dispersion of 22.5 degrees\n        >>> a_rad\n        array([-0.38842705, -0.14442948,  0.50576707,  0.07617358])\n        >>> degrees(a_rad) # Show angle deltas in degrees\n        array([-22.25523038,  -8.27519966,  28.97831838,   4.36442443])\n\n    Args:\n      num_clusters: Number of clusters.\n      angle_disp: Angle dispersion, in radians.\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Angles between the average cluster direction and the cluster-supporting\n        lines, given in radians in the interval $\\left[-\\pi/2,\\pi/2\\right]$.\n    \"\"\"\n    # Get random angle differences using the normal distribution\n    angles = angle_disp * rng.normal(size=num_clusters)\n\n    # Reduce angle differences to the interval [-\u03c0, \u03c0]\n    angles = arctan2(sin(angles), cos(angles))\n\n    # Make sure angle differences are within interval [-\u03c0/2, \u03c0/2]\n    return where(abs(angles) > pi / 2, angles - sign(angles) * pi / 2, angles)\n
"},{"location":"reference/#pyclugen.clucenters","title":"clucenters","text":"
clucenters(\n    num_clusters: int,\n    clu_sep: NDArray,\n    clu_offset: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Determine cluster centers using the uniform distribution.

The number of clusters (num_clusters) and the average cluster separation (clu_sep) are taken into account.

More specifically, let \\(c=\\)num_clusters, \\(\\mathbf{s}=\\)clu_sep.reshape(-1,1), \\(\\mathbf{o}=\\)clu_offset.reshape(-1,1), \\(n=\\)clu_sep.size (i.e., number of dimensions). Cluster centers are obtained according to the following equation:

\\[ \\mathbf{C}=c\\mathbf{U} \\cdot \\operatorname{diag}(\\mathbf{s}) + \\mathbf{1}\\,\\mathbf{o}^T \\]

where \\(\\mathbf{C}\\) is the \\(c \\times n\\) matrix of cluster centers, \\(\\mathbf{U}\\) is an \\(c \\times n\\) matrix of random values drawn from the uniform distribution between -0.5 and 0.5, and \\(\\mathbf{1}\\) is an \\(c \\times 1\\) vector with all entries equal to 1.

Examples:

>>> from pyclugen import clucenters\n>>> from numpy import array\n>>> from numpy.random import Generator, PCG64\n>>> prng = Generator(PCG64(123))\n>>> clucenters(3, array([30,10]), array([-50,50]), rng=prng)\narray([[-33.58833231,  36.61463056],\n       [-75.16761145,  40.53115432],\n       [-79.1684689 ,  59.3628352 ]])\n

Parameters:

Name Type Description Default num_clusters int

Number of clusters.

required clu_sep NDArray

Average cluster separation ( \\(n \\times 1\\) vector).

required clu_offset NDArray

Cluster offsets ( \\(n \\times 1\\) vector).

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

A \\(c \\times n\\) matrix containing the cluster centers.

Source code in pyclugen/module.py
def clucenters(\n    num_clusters: int,\n    clu_sep: NDArray,\n    clu_offset: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Determine cluster centers using the uniform distribution.\n\n    The number of clusters (`num_clusters`) and the average cluster separation\n    (`clu_sep`) are taken into account.\n\n    More specifically, let $c=$`num_clusters`, $\\mathbf{s}=$`clu_sep.reshape(-1,1)`,\n    $\\mathbf{o}=$`clu_offset.reshape(-1,1)`, $n=$`clu_sep.size` (i.e., number of\n    dimensions). Cluster centers are obtained according to the following equation:\n\n    $$\n    \\mathbf{C}=c\\mathbf{U} \\cdot \\operatorname{diag}(\\mathbf{s}) +\n        \\mathbf{1}\\,\\mathbf{o}^T\n    $$\n\n    where $\\mathbf{C}$ is the $c \\times n$ matrix of cluster centers,\n    $\\mathbf{U}$ is an $c \\times n$ matrix of random values drawn from the\n    uniform distribution between -0.5 and 0.5, and $\\mathbf{1}$ is an $c \\times\n    1$ vector with all entries equal to 1.\n\n    Examples:\n        >>> from pyclugen import clucenters\n        >>> from numpy import array\n        >>> from numpy.random import Generator, PCG64\n        >>> prng = Generator(PCG64(123))\n        >>> clucenters(3, array([30,10]), array([-50,50]), rng=prng)\n        array([[-33.58833231,  36.61463056],\n               [-75.16761145,  40.53115432],\n               [-79.1684689 ,  59.3628352 ]])\n\n    Args:\n      num_clusters: Number of clusters.\n      clu_sep: Average cluster separation ( $n \\times 1$ vector).\n      clu_offset: Cluster offsets ( $n \\times 1$ vector).\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n        A $c \\times n$ matrix containing the cluster centers.\n    \"\"\"\n    # Obtain a num_clusters x num_dims matrix of uniformly distributed values\n    # between -0.5 and 0.5 representing the relative cluster centers\n    ctr_rel = rng.random((num_clusters, clu_sep.size)) - 0.5\n\n    return num_clusters * (ctr_rel @ diag(clu_sep)) + 
clu_offset\n
"},{"location":"reference/#pyclugen.clugen","title":"clugen","text":"
clugen(\n    num_dims: int,\n    num_clusters: int,\n    num_points: int,\n    direction: ArrayLike,\n    angle_disp: float,\n    cluster_sep: ArrayLike,\n    llength: float,\n    llength_disp: float,\n    lateral_disp: float,\n    allow_empty: bool = False,\n    cluster_offset: Optional[ArrayLike] = None,\n    proj_dist_fn: str | Callable[[float, int, Generator], NDArray] = \"norm\",\n    point_dist_fn: str\n    | Callable[\n        [NDArray, float, float, NDArray, NDArray, Generator], NDArray\n    ] = \"n-1\",\n    clusizes_fn: Callable[[int, int, bool, Generator], NDArray]\n    | ArrayLike = clusizes,\n    clucenters_fn: Callable[[int, NDArray, NDArray, Generator], NDArray]\n    | ArrayLike = clucenters,\n    llengths_fn: Callable[[int, float, float, Generator], NDArray]\n    | ArrayLike = llengths,\n    angle_deltas_fn: Callable[[int, float, Generator], NDArray]\n    | ArrayLike = angle_deltas,\n    rng: int | Generator = _default_rng,\n) -> Clusters\n

Generate multidimensional clusters.

Tip

This is the main function of the pyclugen package, and possibly the only function most users will need.

"},{"location":"reference/#pyclugen.clugen--examples","title":"Examples:","text":"
>>> import matplotlib.pyplot as plt\n>>> from pyclugen import clugen\n>>> from numpy import pi\n>>> out = clugen(2, 5, 10000, [1, 0.5], pi/16, [10, 40], 10, 1, 2, rng=321)\n>>> out.centers # What are the cluster centers?\narray([[ 20.02876212,  36.59611434],\n       [-15.60290734, -26.52169579],\n       [ 23.09775166,  91.66309916],\n       [ -5.76816015,  54.9775074 ],\n       [ -4.64224681,  78.40990876]])\n>>> plt.scatter(out.points[:,0],\n...             out.points[:,1],\n...             c=out.clusters) # doctest: +SKIP\n>>> plt.show() # doctest: +SKIP\n

Note

In the descriptions below, the terms \"average\" and \"dispersion\" refer to measures of central tendency and statistical dispersion, respectively. Their exact meaning depends on several optional arguments.

Parameters:

Name Type Description Default num_dims int

Number of dimensions.

required num_clusters int

Number of clusters to generate.

required num_points int

Total number of points to generate.

required direction ArrayLike

Average direction of the cluster-supporting lines. Can be a vector of length num_dims (same direction for all clusters) or a matrix of size num_clusters x num_dims (one direction per cluster).

required angle_disp float

Angle dispersion of cluster-supporting lines (radians).

required cluster_sep ArrayLike

Average cluster separation in each dimension (vector of size num_dims).

required llength float

Average length of cluster-supporting lines.

required llength_disp float

Length dispersion of cluster-supporting lines.

required lateral_disp float

Cluster lateral dispersion, i.e., dispersion of points from their projection on the cluster-supporting line.

required allow_empty bool

Allow empty clusters? False by default.

False cluster_offset Optional[ArrayLike]

Offset to add to all cluster centers (vector of size num_dims). By default the offset will be equal to numpy.zeros(num_dims).

None proj_dist_fn str | Callable[[float, int, Generator], NDArray]

Distribution of point projections along cluster-supporting lines, with three possible values:

  • \"norm\" (default): Distribute point projections along lines using a normal distribution (\u03bc=line center, \u03c3=llength/6).
  • \"unif\": Distribute points uniformly along the line.
  • User-defined function, which accepts three parameters, line length (float), number of points (int), and an instance of Generator, and returns an array containing the distance of each point projection to the center of the line. For example, the \"norm\" option roughly corresponds to lambda l, n, rg: l * rg.normal(size=n) / 6.
'norm' point_dist_fn str | Callable[[NDArray, float, float, NDArray, NDArray, Generator], NDArray]

Controls how the final points are created from their projections on the cluster-supporting lines, with three possible values:

  • \"n-1\" (default): Final points are placed on a hyperplane orthogonal to the cluster-supporting line, centered at each point's projection, using the normal distribution (\u03bc=0, \u03c3=lateral_disp). This is done by the clupoints_n_1() function.
  • \"n\": Final points are placed around their projection on the cluster-supporting line using the normal distribution (\u03bc=0, \u03c3=lateral_disp). This is done by the clupoints_n() function.
  • User-defined function: The user can specify a custom point placement strategy by passing a function with the same signature as clupoints_n_1() and clupoints_n().
'n-1' clusizes_fn Callable[[int, int, bool, Generator], NDArray] | ArrayLike

Distribution of cluster sizes. By default, cluster sizes are determined by the clusizes() function, which uses the normal distribution (\u03bc=num_points/num_clusters, \u03c3=\u03bc/3), and ensures that the final cluster sizes add up to num_points. This parameter allows the user to specify a custom function for this purpose, which must follow clusizes() signature. Note that custom functions are not required to strictly obey the num_points parameter. Alternatively, the user can specify an array of cluster sizes directly.

clusizes clucenters_fn Callable[[int, NDArray, NDArray, Generator], NDArray] | ArrayLike

Distribution of cluster centers. By default, cluster centers are determined by the clucenters() function, which uses the uniform distribution, and takes into account the num_clusters and cluster_sep parameters for generating well-distributed cluster centers. This parameter allows the user to specify a custom function for this purpose, which must follow clucenters() signature. Alternatively, the user can specify a matrix of size num_clusters x num_dims with the exact cluster centers.

clucenters llengths_fn Callable[[int, float, float, Generator], NDArray] | ArrayLike

Distribution of line lengths. By default, the lengths of cluster-supporting lines are determined by the llengths() function, which uses the folded normal distribution (\u03bc=llength, \u03c3=llength_disp). This parameter allows the user to specify a custom function for this purpose, which must follow llengths() signature. Alternatively, the user can specify an array of line lengths directly.

llengths angle_deltas_fn Callable[[int, float, Generator], NDArray] | ArrayLike

Distribution of line angle differences with respect to direction. By default, the angles between direction and the direction of cluster-supporting lines are determined by the angle_deltas() function, which uses the wrapped normal distribution (\u03bc=0, \u03c3=angle_disp) with support in the interval [-\u03c0/2, \u03c0/2]. This parameter allows the user to specify a custom function for this purpose, which must follow angle_deltas() signature. Alternatively, the user can specify an array of angle deltas directly.

angle_deltas rng int | Generator

The seed for the random number generator or an instance of Generator for reproducible executions.

_default_rng

Returns:

Type Description Clusters

The generated clusters and associated information in the form of a Clusters object.

Source code in pyclugen/main.py
def clugen(\n    num_dims: int,\n    num_clusters: int,\n    num_points: int,\n    direction: ArrayLike,\n    angle_disp: float,\n    cluster_sep: ArrayLike,\n    llength: float,\n    llength_disp: float,\n    lateral_disp: float,\n    allow_empty: bool = False,\n    cluster_offset: Optional[ArrayLike] = None,\n    proj_dist_fn: str | Callable[[float, int, Generator], NDArray] = \"norm\",\n    point_dist_fn: str\n    | Callable[[NDArray, float, float, NDArray, NDArray, Generator], NDArray] = \"n-1\",\n    clusizes_fn: Callable[[int, int, bool, Generator], NDArray] | ArrayLike = clusizes,\n    clucenters_fn: Callable[[int, NDArray, NDArray, Generator], NDArray]\n    | ArrayLike = clucenters,\n    llengths_fn: Callable[[int, float, float, Generator], NDArray]\n    | ArrayLike = llengths,\n    angle_deltas_fn: Callable[[int, float, Generator], NDArray]\n    | ArrayLike = angle_deltas,\n    rng: int | Generator = _default_rng,\n) -> Clusters:\n    \"\"\"Generate multidimensional clusters.\n\n    !!! tip\n        This is the main function of the **pyclugen** package, and possibly the\n        only function most users will need.\n\n    ## Examples:\n\n        >>> import matplotlib.pyplot as plt\n        >>> from pyclugen import clugen\n        >>> from numpy import pi\n        >>> out = clugen(2, 5, 10000, [1, 0.5], pi/16, [10, 40], 10, 1, 2, rng=321)\n        >>> out.centers # What are the cluster centers?\n        array([[ 20.02876212,  36.59611434],\n               [-15.60290734, -26.52169579],\n               [ 23.09775166,  91.66309916],\n               [ -5.76816015,  54.9775074 ],\n               [ -4.64224681,  78.40990876]])\n        >>> plt.scatter(out.points[:,0],\n        ...             out.points[:,1],\n        ...             c=out.clusters) # doctest: +SKIP\n        >>> plt.show() # doctest: +SKIP\n\n    ![clugen](https://user-images.githubusercontent.com/3018963/151056890-c83c9509-b40d-4ab2-a842-f2a4706344c6.png)\n\n    !!! 
Note\n        In the descriptions below, the terms \"average\" and \"dispersion\" refer to\n        measures of central tendency and statistical dispersion, respectively.\n        Their exact meaning depends on several optional arguments.\n\n    Args:\n      num_dims: Number of dimensions.\n      num_clusters: Number of clusters to generate.\n      num_points: Total number of points to generate.\n      direction: Average direction of the cluster-supporting lines. Can be a\n        vector of length `num_dims` (same direction for all clusters) or a\n        matrix of size `num_clusters` x `num_dims` (one direction per cluster).\n      angle_disp: Angle dispersion of cluster-supporting lines (radians).\n      cluster_sep: Average cluster separation in each dimension (vector of size\n        `num_dims`).\n      llength: Average length of cluster-supporting lines.\n      llength_disp: Length dispersion of cluster-supporting lines.\n      lateral_disp: Cluster lateral dispersion, i.e., dispersion of points from their\n        projection on the cluster-supporting line.\n      allow_empty: Allow empty clusters? 
`False` by default.\n      cluster_offset: Offset to add to all cluster centers (vector of size `num_dims`).\n        By default the offset will be equal to `numpy.zeros(num_dims)`.\n      proj_dist_fn: Distribution of point projections along cluster-supporting lines,\n        with three possible values:\n\n        - `\"norm\"` (default): Distribute point projections along lines using a normal\n          distribution (\u03bc=_line center_, \u03c3=`llength/6`).\n        - `\"unif\"`: Distribute points uniformly along the line.\n        - User-defined function, which accepts three parameters, line length (`float`),\n          number of points (`int`), and an instance of\n          [`Generator`](https://numpy.org/doc/stable/reference/random/generator.html?highlight=generator#numpy.random.Generator),\n          and returns an array containing the distance of each point projection to\n          the center of the line. For example, the `\"norm\"` option roughly corresponds\n          to `lambda l, n, rg: l * rg.random((n, 1)) / 6`.\n\n      point_dist_fn: Controls how the final points are created from their projections\n        on the cluster-supporting lines, with three possible values:\n\n        - `\"n-1\"` (default): Final points are placed on a hyperplane orthogonal to\n          the cluster-supporting line, centered at each point's projection, using the\n          normal distribution (\u03bc=0, \u03c3=`lateral_disp`). This is done by the\n          [`clupoints_n_1()`][pyclugen.module.clupoints_n_1] function.\n        - `\"n\"`: Final points are placed around their projection on the\n          cluster-supporting line using the normal distribution (\u03bc=0,\n          \u03c3=`lateral_disp`). 
This is done by the\n          [`clupoints_n()`][pyclugen.module.clupoints_n] function.\n        - User-defined function: The user can specify a custom point placement\n          strategy by passing a function with the same signature as\n          [`clupoints_n_1()`][pyclugen.module.clupoints_n_1] and\n          [`clupoints_n()`][pyclugen.module.clupoints_n].\n\n      clusizes_fn: Distribution of cluster sizes. By default, cluster sizes are\n        determined by the [`clusizes()`][pyclugen.module.clusizes] function, which\n        uses the normal distribution (\u03bc=`num_points`/`num_clusters`, \u03c3=\u03bc/3), and\n        assures that the final cluster sizes add up to `num_points`. This parameter\n        allows the user to specify a custom function for this purpose, which must\n        follow [`clusizes()`][pyclugen.module.clusizes] signature. Note that custom\n        functions are not required to strictly obey the `num_points` parameter.\n        Alternatively, the user can specify an array of cluster sizes directly.\n      clucenters_fn: Distribution of cluster centers. By default, cluster centers\n        are determined by the [`clucenters()`][pyclugen.module.clucenters] function,\n        which uses the uniform distribution, and takes into account the `num_clusters`\n        and `cluster_sep` parameters for generating well-distributed cluster centers.\n        This parameter allows the user to specify a custom function for this purpose,\n        which must follow [`clucenters()`][pyclugen.module.clucenters] signature.\n        Alternatively, the user can specify a matrix of size `num_clusters` x\n        `num_dims` with the exact cluster centers.\n      llengths_fn: Distribution of line lengths. By default, the lengths of\n        cluster-supporting lines are determined by the\n        [`llengths()`][pyclugen.module.llengths] function, which uses the folded\n        normal distribution (\u03bc=`llength`, \u03c3=`llength_disp`). 
This parameter allows\n        the user to specify a custom function for this purpose, which must follow\n        [`llengths()`][pyclugen.module.llengths] signature. Alternatively, the user\n        can specify an array of line lengths directly.\n      angle_deltas_fn: Distribution of line angle differences with respect to\n        `direction`. By default, the angles between `direction` and the direction of\n        cluster-supporting lines are determined by the\n        [`angle_deltas()`][pyclugen.module.angle_deltas] function, which uses the\n        wrapped normal distribution (\u03bc=0, \u03c3=`angle_disp`) with support in the interval\n        [-\u03c0/2, \u03c0/2]. This parameter allows the user to specify a custom function for\n        this purpose, which must follow [`angle_deltas()`][pyclugen.module.angle_deltas]\n        signature. Alternatively, the user can specify an array of angle deltas\n        directly.\n      rng: The seed for the random number generator or an instance of\n        [`Generator`][numpy.random.Generator] for reproducible executions.\n\n    Returns:\n      The generated clusters and associated information in the form of a\n        [`Clusters`][pyclugen.main.Clusters] object.\n    \"\"\"\n    # ############### #\n    # Validate inputs #\n    # ############### #\n\n    # Check that number of dimensions is > 0\n    if num_dims < 1:\n        raise ValueError(\"Number of dimensions, `num_dims`, must be > 0\")\n\n    # Check that number of clusters is > 0\n    if num_clusters < 1:\n        raise ValueError(\"Number of clusters, `num_clust`, must be > 0\")\n\n    # Convert given direction into a NumPy array\n    arrdir: NDArray = asarray(direction)\n\n    # Get number of dimensions in `direction` array\n    dir_ndims = arrdir.ndim\n\n    # Is direction a vector or a matrix?\n    if dir_ndims == 1:\n        # It's a vector, let's convert it into a row matrix, since this will be\n        # useful down the road\n        arrdir = 
arrdir.reshape((1, -1))\n    elif dir_ndims == 2:\n        # If a matrix was given (i.e. a main direction is given for each cluster),\n        # check if the number of directions is the same as the number of clusters\n        dir_size_1 = arrdir.shape[0]\n        if dir_size_1 != num_clusters:\n            raise ValueError(\n                \"Number of rows in `direction` must be the same as the \"\n                + f\"number of clusters ({dir_size_1} != {num_clusters})\"\n            )\n    else:\n        # The `directions` array must be a vector or a matrix, so if we get here\n        # it means we have invalid arguments\n        raise ValueError(\n            \"`direction` must be a vector (1D array) or a matrix (2D array), \"\n            + f\"but is {dir_ndims}D\"\n        )\n\n    # Check that direction has num_dims dimensions\n    dir_size_2 = arrdir.shape[1]\n    if dir_size_2 != num_dims:\n        raise ValueError(\n            \"Length of directions in `direction` must be equal to \"\n            + f\"`num_dims` ({dir_size_2} != {num_dims})\"\n        )\n\n    # Check that directions have magnitude > 0\n    dir_magnitudes = apply_along_axis(norm, 1, arrdir)\n    if any(isclose(dir_magnitudes, 0)):\n        raise ValueError(\"Directions in `direction` must have magnitude > 0\")\n\n    # If allow_empty is false, make sure there are enough points to distribute\n    # by the clusters\n    if (not allow_empty) and num_points < num_clusters:\n        raise ValueError(\n            f\"A total of {num_points} points is not enough for \"\n            + f\"{num_clusters} non-empty clusters\"\n        )\n\n    # Check that cluster_sep has num_dims dimensions\n    cluster_sep = asarray(cluster_sep)\n    if cluster_sep.size != num_dims:\n        raise ValueError(\n            \"Length of `cluster_sep` must be equal to `num_dims` \"\n            + f\"({cluster_sep.size} != {num_dims})\"\n        )\n\n    # If given, cluster_offset must have the correct number of 
dimensions,\n    # if not given then it will be a num_dims x 1 vector of zeros\n    if cluster_offset is None:\n        cluster_offset = zeros(num_dims)\n    else:\n        cluster_offset = asarray(cluster_offset)\n        if cluster_offset.size != num_dims:\n            raise ValueError(\n                \"Length of `cluster_offset` must be equal to `num_dims` \"\n                + f\"({cluster_offset.size} != {num_dims})\"\n            )\n\n    # If the user specified rng as an int, create a proper rng object\n    rng_sel: Generator\n    if isinstance(rng, Generator):\n        rng_sel = cast(Generator, rng)\n    elif isinstance(rng, int):\n        rng_sel = Generator(PCG64(cast(int, rng)))\n    else:\n        raise ValueError(\n            f\"`rng` must be an instance of int or Generator, but is {type(rng)}\"\n        )\n\n    # Check that proj_dist_fn specifies a valid way for projecting points along\n    # cluster-supporting lines i.e., either \"norm\" (default), \"unif\" or a\n    # user-defined function\n    pointproj_fn: Callable[[float, int, Generator], NDArray]\n\n    if callable(proj_dist_fn):\n        # Use user-defined distribution; assume function accepts length of line\n        # and number of points, and returns a number of points x 1 vector\n        pointproj_fn = proj_dist_fn\n\n    elif proj_dist_fn == \"unif\":\n        # Point projections will be uniformly placed along cluster-supporting lines\n        def pointproj_fn(length, n, rg):\n            return length * rg.random(n) - length / 2\n\n    elif proj_dist_fn == \"norm\":\n        # Use normal distribution for placing point projections along cluster-supporting\n        # lines, mean equal to line center, standard deviation equal to 1/6 of line\n        # length such that the line length contains \u224899.73% of the points\n        def pointproj_fn(length, n, rg):\n            return (1.0 / 6.0) * length * rg.normal(size=n)\n\n    else:\n        raise ValueError(\n            \"`proj_dist_fn` 
has to be either 'norm', 'unif' or user-defined function\"\n        )\n\n    # Check that point_dist_fn specifies a valid way for generating points given\n    # their projections along cluster-supporting lines, i.e., either \"n-1\"\n    # (default), \"n\" or a user-defined function\n    pt_from_proj_fn: Callable[\n        [NDArray, float, float, NDArray, NDArray, Generator], NDArray\n    ]\n\n    if num_dims == 1:\n        # If 1D was specified, point projections are the points themselves\n        def pt_from_proj_fn(projs, lat_disp, length, clu_dir, clu_ctr, rng=rng_sel):\n            return projs\n\n    elif callable(point_dist_fn):\n        # Use user-defined distribution; assume function accepts point projections\n        # on the line, lateral disp., cluster direction and cluster center, and\n        # returns a num_points x num_dims matrix containing the final points\n        # for the current cluster\n        pt_from_proj_fn = point_dist_fn\n\n    elif point_dist_fn == \"n-1\":\n        # Points will be placed on a hyperplane orthogonal to the cluster-supporting\n        # line using a normal distribution centered at their intersection\n        pt_from_proj_fn = clupoints_n_1\n\n    elif point_dist_fn == \"n\":\n        # Points will be placed using a multivariate normal distribution\n        # centered at the point projection\n        pt_from_proj_fn = clupoints_n\n\n    else:\n        raise ValueError(\n            \"point_dist_fn has to be either 'n-1', 'n' or a user-defined function\"\n        )\n\n    # ############################ #\n    # Determine cluster properties #\n    # ############################ #\n\n    # Normalize main direction(s)\n    arrdir = apply_along_axis(lambda a: a / norm(a), 1, arrdir)\n\n    # If only one main direction was given, expand it for all clusters\n    if dir_ndims == 1:\n        arrdir = repeat(arrdir, num_clusters, axis=0)\n\n    # Determine cluster sizes\n    if callable(clusizes_fn):\n        cluster_sizes = 
clusizes_fn(num_clusters, num_points, allow_empty, rng_sel)\n    elif len(asarray(clusizes_fn)) == num_clusters:\n        cluster_sizes = asarray(clusizes_fn)\n    else:\n        raise ValueError(\n            \"clusizes_fn has to be either a function or a `num_clusters`-sized array\"\n        )\n\n    # Custom clusizes_fn's are not required to obey num_points, so we update\n    # it here just in case it's different from what the user specified\n    num_points = sum(cluster_sizes)\n\n    # Determine cluster centers\n    if callable(clucenters_fn):\n        cluster_centers = clucenters_fn(\n            num_clusters, cluster_sep, cluster_offset, rng_sel\n        )\n    elif asarray(clucenters_fn).shape == (num_clusters, num_dims):\n        cluster_centers = asarray(clucenters_fn)\n    else:\n        raise ValueError(\n            \"clucenters_fn has to be either a function or a matrix of size \"\n            + \"`num_clusters` x `num_dims`\"\n        )\n\n    # Determine length of lines supporting clusters\n    if callable(llengths_fn):\n        cluster_lengths = llengths_fn(num_clusters, llength, llength_disp, rng_sel)\n    elif len(asarray(llengths_fn)) == num_clusters:\n        cluster_lengths = asarray(llengths_fn)\n    else:\n        raise ValueError(\n            \"llengths_fn has to be either a function or a `num_clusters`-sized array\"\n        )\n\n    # Obtain angles between main direction and cluster-supporting lines\n    if callable(angle_deltas_fn):\n        cluster_angles = angle_deltas_fn(num_clusters, angle_disp, rng_sel)\n    elif len(asarray(angle_deltas_fn)) == num_clusters:\n        cluster_angles = asarray(angle_deltas_fn)\n    else:\n        raise ValueError(\n            \"angle_deltas_fn has to be either a function or a \"\n            + \"`num_clusters`-sized array\"\n        )\n\n    # Determine normalized cluster directions by applying the obtained angles\n    cluster_directions = apply_along_axis(\n        lambda v, a: 
rand_vector_at_angle(v, next(a), rng_sel),\n        1,\n        arrdir,\n        iter(cluster_angles),\n    )\n\n    # ################################# #\n    # Determine points for each cluster #\n    # ################################# #\n\n    # Aux. vector with cumulative sum of number of points in each cluster\n    cumsum_points = concatenate((asarray([0]), cumsum(cluster_sizes)))\n\n    # Pre-allocate data structures for holding cluster info and points\n    point_clusters: NDArray = empty(\n        num_points, dtype=int32\n    )  # Cluster indices of each point\n    point_projections = empty((num_points, num_dims))  # Point projections on\n    #                                                  # cluster-supporting lines\n    points = empty((num_points, num_dims))  # Final points to be generated\n\n    # Loop through clusters and create points for each one\n    for i in range(num_clusters):\n        # Start and end indexes for points in current cluster\n        idx_start = cumsum_points[i]\n        idx_end = cumsum_points[i + 1]\n\n        # Update cluster indices of each point\n        point_clusters[idx_start:idx_end] = i\n\n        # Determine distance of point projections from the center of the line\n        ptproj_dist_fn_center = pointproj_fn(\n            cluster_lengths[i], cluster_sizes[i], rng_sel\n        )\n\n        # Determine coordinates of point projections on the line using the\n        # parametric line equation (this works since cluster direction is normalized)\n        point_projections[idx_start:idx_end, :] = points_on_line(\n            cluster_centers[i, :], cluster_directions[i, :], ptproj_dist_fn_center\n        )\n\n        # Determine points from their projections on the line\n        points[idx_start:idx_end, :] = pt_from_proj_fn(\n            point_projections[idx_start:idx_end, :],\n            lateral_disp,\n            cluster_lengths[i],\n            cluster_directions[i, :],\n            cluster_centers[i, :],\n            
rng_sel,\n        )\n\n    return Clusters(\n        points,\n        point_clusters,\n        point_projections,\n        cluster_sizes,\n        cluster_centers,\n        cluster_directions,\n        cluster_angles,\n        cluster_lengths,\n    )\n
"},{"location":"reference/#pyclugen.clumerge","title":"clumerge","text":"
clumerge(\n    *data: NamedTuple | Mapping[str, ArrayLike],\n    fields: tuple[str, ...] = (\"points\", \"clusters\"),\n    clusters_field: str | None = \"clusters\"\n) -> dict[str, NDArray]\n

Merges the fields (specified in fields) of two or more data sets.

Merges the fields (specified in fields) of two or more data sets (named tuples or dictionaries). The fields to be merged need to have the same number of columns. The corresponding merged field will contain the rows of the fields to be merged, and will have a common supertype.

The clusters_field parameter specifies a field containing integers that identify the cluster to which each point belongs. If clusters_field is specified (by default it's specified as \"clusters\"), cluster assignments in individual datasets will be updated in the merged dataset so that clusters are considered separate. This parameter can be set to None, in which case no field will be considered as a special cluster assignments field.

This function can be used to merge data sets generated with the clugen() function, by default merging the points and clusters fields in those data sets. It also works with arbitrary data by specifying alternative fields in the fields parameter. It can be used, for example, to merge third-party data with clugen()-generated data.

Examples:

>>> from pyclugen import clugen, clumerge\n>>> data1 = clugen(2, 5, 1000, [1, 1], 0.01, [20, 20], 14, 1.2, 1.5);\n>>> data2 = clugen(2, 3, 450, [0.8, -0.3], 0, [25, 21], 6, 0.4, 3.5);\n>>> data3 = clugen(2, 2, 600, [0, -0.7], 0.2, [15, 10], 1, 0.1, 5.2);\n>>> data_merged = clumerge(data1, data2, data3)\n

Parameters:

Name Type Description Default *data NamedTuple | Mapping[str, ArrayLike]

One or more cluster data sets whose fields are to be merged.

() fields tuple[str, ...]

Fields to be merged, which must exist in the data sets given in *data.

('points', 'clusters') clusters_field str | None

Field containing the integer cluster labels. If specified, cluster assignments in individual datasets will be updated in the merged dataset so that clusters are considered separate.

'clusters'

Returns:

Type Description dict[str, NDArray]

A dictionary, where keys correspond to field names, and values to the merged numerical arrays.

Source code in pyclugen/main.py
def clumerge(\n    *data: NamedTuple | Mapping[str, ArrayLike],\n    fields: tuple[str, ...] = (\"points\", \"clusters\"),\n    clusters_field: str | None = \"clusters\",\n) -> dict[str, NDArray]:\n    r\"\"\"Merges the fields (specified in `fields`) of two or more `data` sets.\n\n    Merges the fields (specified in `fields`) of two or more `data` sets (named\n    tuples or dictionaries). The fields to be merged need to have the same\n    number of columns. The corresponding merged field will contain the rows of\n    the fields to be merged, and will have a common supertype.\n\n    The `clusters_field` parameter specifies a field containing integers that\n    identify the cluster to which the respective points belongs to. If\n    `clusters_field` is specified (by default it's specified as `\"clusters\"`),\n    cluster assignments in individual datasets will be updated in the merged\n    dataset so that clusters are considered separate. This parameter can be set\n    to `None`, in which case no field will be considered as a special cluster\n    assignments field.\n\n    This function can be used to merge data sets generated with the\n    [`clugen()`][pyclugen.main.clugen] function, by default merging the\n    `points` and `clusters` fields in those data sets. 
It also works with\n    arbitrary data by specifying alternative fields in the `fields` parameter.\n    It can be used, for example, to merge third-party data with\n    [`clugen()`][pyclugen.main.clugen]-generated data.\n\n    Examples:\n        >>> from pyclugen import clugen, clumerge\n        >>> data1 = clugen(2, 5, 1000, [1, 1], 0.01, [20, 20], 14, 1.2, 1.5);\n        >>> data2 = clugen(2, 3, 450, [0.8, -0.3], 0, [25, 21], 6, 0.4, 3.5);\n        >>> data3 = clugen(2, 2, 600, [0, -0.7], 0.2, [15, 10], 1, 0.1, 5.2);\n        >>> data_merged = clumerge(data1, data2, data3)\n\n    Args:\n      *data: One or more cluster data sets whose `fields` are to be merged.\n      fields: Fields to be merged, which must exist in the data set given in\n        `*data`.\n      clusters_field: Field containing the integer cluster labels. If specified,\n        cluster assignments in individual datasets will be updated in the merged\n        dataset so that clusters are considered separate.\n\n    Returns:\n      A dictionary, where keys correspond to field names, and values to the\n        merged numerical arrays.\n    \"\"\"\n    # Number of elements in each array the merged dataset\n    numel: int = 0\n\n    # Number of columns of values in each field\n    fields_info: dict[str, _FieldInfo] = {}\n\n    # Merged dataset to output, initially empty\n    output: dict[str, NDArray] = {}\n\n    # Create a fields set\n    fields_set: MutableSet[str] = set(fields)\n\n    # If a clusters field is given, add it\n    if clusters_field is not None:\n        fields_set.add(str(clusters_field))\n\n    # Data in dictionary format with NDArray views on data\n    ddata: MutableSequence[Mapping[str, NDArray]] = []\n    for dt in data:\n        # If dt is a named tuple, convert it into a dictionary\n        ddt: Mapping[str, ArrayLike]\n        if isinstance(dt, dict):\n            ddt = cast(dict, dt)\n        else:\n            ntdt = cast(NamedTuple, dt)\n            ddt = ntdt._asdict()\n\n  
      # Convert dictionary values to NDArrays\n        ddtnp: Mapping[str, NDArray] = {k: asarray(v) for k, v in ddt.items()}\n\n        # Add converted dictionary to our sequence of dictionaries\n        ddata.append(ddtnp)\n\n    # Cycle through data items\n    for dt in ddata:\n        # Number of elements in the current item\n        numel_i: int = -1\n\n        # Cycle through fields for the current item\n        for field in fields_set:\n            if field not in dt:\n                raise ValueError(f\"Data item does not contain required field `{field}`\")\n            elif field == clusters_field and not can_cast(\n                dt[clusters_field].dtype, int64\n            ):\n                raise ValueError(f\"`{clusters_field}` must contain integer types\")\n\n            # Get the field value\n            value: NDArray = dt[field]\n\n            # Number of elements in field value\n            numel_tmp = len(value)\n\n            # Check the number of elements in the field value\n            if numel_i == -1:\n                # First field: get number of elements in value (must be the same\n                # for the remaining field values)\n                numel_i = numel_tmp\n\n            elif numel_tmp != numel_i:\n                # Fields values after the first must have the same number of\n                # elements\n                raise ValueError(\n                    \"Data item contains fields with different sizes \"\n                    + f\"({numel_tmp} != {numel_i})\"\n                )\n\n            # Get/check info about the field value type\n            if field not in fields_info:\n                # If it's the first time this field appears, just get the info\n                fields_info[field] = _FieldInfo(value.dtype, _getcols(value))\n\n            else:\n                # If this field already appeared in previous data items, get the\n                # info and check/determine its compatibility with respect to\n               
 # previous data items\n                if _getcols(value) != fields_info[field].ncol:\n                    # Number of columns must be the same\n                    raise ValueError(f\"Dimension mismatch in field `{field}`\")\n\n                # Get the common supertype\n                fields_info[field].dtype = promote_types(\n                    fields_info[field].dtype, value.dtype\n                )\n\n        # Update total number of elements\n        numel += numel_i\n\n    # Initialize output dictionary fields with room for all items\n    for field in fields_info:\n        if fields_info[field].ncol == 1:\n            output[field] = empty((numel,), dtype=fields_info[field].dtype)\n        else:\n            output[field] = empty(\n                (numel, fields_info[field].ncol), dtype=fields_info[field].dtype\n            )\n\n    # Copy items from input data to output dictionary, field-wise\n    copied: int = 0\n    last_cluster: int = 0\n\n    # Create merged output\n    for dt in ddata:\n        # How many elements to copy for the current data item?\n        tocopy: int = len(dt[fields[0]])\n\n        # Cycle through each field and its information\n        for field in fields_info:\n            # Copy elements\n            if field == clusters_field:\n                # If this is a clusters field, update the cluster IDs\n                old_clusters = unique(dt[clusters_field])\n                new_clusters = list(\n                    range(last_cluster + 1, last_cluster + len(old_clusters) + 1)\n                )\n                old2new = zip(old_clusters, new_clusters)\n                mapping = dict(old2new)\n                last_cluster = new_clusters[-1]\n\n                output[field][copied : (copied + tocopy)] = [\n                    mapping[val] for val in dt[clusters_field]\n                ]\n\n            else:\n                # Otherwise just copy the elements\n                ncol: int = fields_info[field].ncol\n                
output[field].flat[copied * ncol : (copied + tocopy) * ncol] = dt[field]\n\n        # Update how many were copied so far\n        copied += tocopy\n\n    # Return result\n    return output\n
"},{"location":"reference/#pyclugen.clupoints_n","title":"clupoints_n","text":"
clupoints_n(\n    projs: NDArray,\n    lat_disp: float,\n    line_len: float,\n    clu_dir: NDArray,\n    clu_ctr: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Generate points from their \\(n\\)-D projections on a cluster-supporting line.

Each point is placed around its projection using the normal distribution ( \\(\\mu=0\\), \\(\u03c3=\\)lat_disp).

This function's main intended use is by the clugen() function, generating the final points when the point_dist_fn parameter is set to \"n\".

Examples:

>>> from pyclugen import clupoints_n, points_on_line\n>>> from numpy import array, linspace\n>>> from numpy.random import Generator, PCG64\n>>> prng = Generator(PCG64(123))\n>>> projs = points_on_line(array([5,5]),     # Get 5 point projections\n...                        array([1,0]),     # on a 2D line\n...                        linspace(-4,4,5))\n>>> projs\narray([[1., 5.],\n       [3., 5.],\n       [5., 5.],\n       [7., 5.],\n       [9., 5.]])\n>>> clupoints_n(projs, 0.5, 1.0, array([1,0]), array([0,0]), rng=prng)\narray([[0.50543932, 4.81610667],\n       [3.64396263, 5.09698721],\n       [5.46011545, 5.2885519 ],\n       [6.68176818, 5.27097611],\n       [8.84170227, 4.83880544]])\n

Parameters:

Name Type Description Default projs NDArray

Point projections on the cluster-supporting line ( \\(p \\times n\\) matrix).

required lat_disp float

Standard deviation for the normal distribution, i.e., cluster lateral dispersion.

required line_len float

Length of cluster-supporting line (ignored).

required clu_dir NDArray

Direction of the cluster-supporting line.

required clu_ctr NDArray

Center position of the cluster-supporting line (ignored).

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Generated points ( \\(p \\times n\\) matrix).

Source code in pyclugen/module.py
def clupoints_n(\n    projs: NDArray,\n    lat_disp: float,\n    line_len: float,\n    clu_dir: NDArray,\n    clu_ctr: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Generate points from their $n$-D projections on a cluster-supporting line.\n\n    Each point is placed around its projection using the normal distribution\n    ( $\\mu=0$, $\u03c3=$`lat_disp`).\n\n    This function's main intended use is by the [`clugen()`][pyclugen.main.clugen]\n    function, generating the final points when the `point_dist_fn` parameter is\n    set to `\"n\"`.\n\n    Examples:\n        >>> from pyclugen import clupoints_n, points_on_line\n        >>> from numpy import array, linspace\n        >>> from numpy.random import Generator, PCG64\n        >>> prng = Generator(PCG64(123))\n        >>> projs = points_on_line(array([5,5]),     # Get 5 point projections\n        ...                        array([1,0]),     # on a 2D line\n        ...                        linspace(-4,4,5))\n        >>> projs\n        array([[1., 5.],\n               [3., 5.],\n               [5., 5.],\n               [7., 5.],\n               [9., 5.]])\n        >>> clupoints_n(projs, 0.5, 1.0, array([1,0]), array([0,0]), rng=prng)\n        array([[0.50543932, 4.81610667],\n               [3.64396263, 5.09698721],\n               [5.46011545, 5.2885519 ],\n               [6.68176818, 5.27097611],\n               [8.84170227, 4.83880544]])\n\n    Args:\n      projs: Point projections on the cluster-supporting line ( $p \\times n$ matrix).\n      lat_disp: Standard deviation for the normal distribution, i.e., cluster\n        lateral dispersion.\n      line_len: Length of cluster-supporting line (ignored).\n      clu_dir: Direction of the cluster-supporting line.\n      clu_ctr: Center position of the cluster-supporting line (ignored).\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Generated points ( $p \\times n$ matrix).\n    \"\"\"\n    # Number of 
dimensions\n    num_dims = clu_dir.size\n\n    # Number of points in this cluster\n    clu_num_points = projs.shape[0]\n\n    # Get random displacement vectors for each point projection\n    displ = lat_disp * rng.normal(size=(clu_num_points, num_dims))\n\n    # Add displacement vectors to each point projection\n    points = projs + displ\n\n    return points\n
"},{"location":"reference/#pyclugen.clupoints_n_1","title":"clupoints_n_1","text":"
clupoints_n_1(\n    projs: NDArray,\n    lat_disp: float,\n    line_len: float,\n    clu_dir: NDArray,\n    clu_ctr: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Generate points from their \\(n\\)-D projections on a cluster-supporting line.

Each point is placed on a hyperplane orthogonal to that line and centered at the point's projection, using the normal distribution ( \\(\\mu=0\\), \\(\u03c3=\\)lat_disp).

This function's main intended use is by the clugen() function, generating the final points when the point_dist_fn parameter is set to \"n-1\".

Examples:

>>> from pyclugen import clupoints_n_1, points_on_line\n>>> from numpy import array, linspace\n>>> from numpy.random import Generator, PCG64\n>>> prng = Generator(PCG64(123))\n>>> projs = points_on_line(array([5,5]),     # Get 5 point projections\n...                        array([1,0]),     # on a 2D line\n...                        linspace(-4,4,5))\n>>> projs\narray([[1., 5.],\n       [3., 5.],\n       [5., 5.],\n       [7., 5.],\n       [9., 5.]])\n>>> clupoints_n_1(projs, 0.5, 1.0, array([1,0]), array([0,0]), rng=prng)\narray([[1.        , 5.49456068],\n       [3.        , 5.18389333],\n       [5.        , 5.64396263],\n       [7.        , 5.09698721],\n       [9.        , 5.46011545]])\n

Parameters:

Name Type Description Default projs NDArray

Point projections on the cluster-supporting line ( \\(p \\times n\\) matrix).

required lat_disp float

Standard deviation for the normal distribution, i.e., cluster lateral dispersion.

required line_len float

Length of cluster-supporting line (ignored).

required clu_dir NDArray

Direction of the cluster-supporting line.

required clu_ctr NDArray

Center position of the cluster-supporting line (ignored).

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Generated points ( \\(p \\times n\\) matrix).

Source code in pyclugen/module.py
def clupoints_n_1(\n    projs: NDArray,\n    lat_disp: float,\n    line_len: float,\n    clu_dir: NDArray,\n    clu_ctr: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Generate points from their $n$-D projections on a cluster-supporting line.\n\n    Each point is placed on a hyperplane orthogonal to that line and centered at\n    the point's projection, using the normal distribution ( $\\mu=0$,\n    $\u03c3=$`lat_disp`).\n\n    This function's main intended use is by the [`clugen()`][pyclugen.main.clugen]\n    function, generating the final points when the `point_dist_fn` parameter is\n    set to `\"n-1\"`.\n\n    Examples:\n        >>> from pyclugen import clupoints_n_1, points_on_line\n        >>> from numpy import array, linspace\n        >>> from numpy.random import Generator, PCG64\n        >>> prng = Generator(PCG64(123))\n        >>> projs = points_on_line(array([5,5]),     # Get 5 point projections\n        ...                        array([1,0]),     # on a 2D line\n        ...                        linspace(-4,4,5))\n        >>> projs\n        array([[1., 5.],\n               [3., 5.],\n               [5., 5.],\n               [7., 5.],\n               [9., 5.]])\n        >>> clupoints_n_1(projs, 0.5, 1.0, array([1,0]), array([0,0]), rng=prng)\n        array([[1.        , 5.49456068],\n               [3.        , 5.18389333],\n               [5.        , 5.64396263],\n               [7.        , 5.09698721],\n               [9.        
, 5.46011545]])\n\n    Args:\n      projs: Point projections on the cluster-supporting line ( $p \\times n$ matrix).\n      lat_disp: Standard deviation for the normal distribution, i.e., cluster\n        lateral dispersion.\n      line_len: Length of cluster-supporting line (ignored).\n      clu_dir: Direction of the cluster-supporting line.\n      clu_ctr: Center position of the cluster-supporting line (ignored).\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Generated points ( $p \\times n$ matrix).\n    \"\"\"\n    # No blank line allowed here\n\n    # Define function to get distances from points to their projections on the\n    # line (i.e., using the normal distribution)\n    def dist_fn(clu_num_points, ldisp, rg):\n        return ldisp * rg.normal(size=clu_num_points)\n\n    # Use clupoints_n_1_template() to do the heavy lifting\n    return clupoints_n_1_template(projs, lat_disp, clu_dir, dist_fn, rng=rng)\n
"},{"location":"reference/#pyclugen.clupoints_n_1_template","title":"clupoints_n_1_template","text":"
clupoints_n_1_template(\n    projs: NDArray,\n    lat_disp: float,\n    clu_dir: NDArray,\n    dist_fn: Callable[[int, float, Generator], NDArray],\n    rng: Generator = _default_rng,\n) -> NDArray\n

Create \\(p\\) points from their \\(n\\)-D projections on a cluster-supporting line.

Each point is placed on a hyperplane orthogonal to that line and centered at the point's projection. The function specified in dist_fn is used to perform the actual placement.

This function is used internally by clupoints_n_1() and may be useful for constructing user-defined final point placement strategies for the point_dist_fn parameter of the main clugen() function.

Examples:

>>> from numpy import array, zeros\n>>> from numpy.random import Generator, PCG64\n>>> from pyclugen import clupoints_n_1_template, points_on_line\n>>> ctr = zeros(2)\n>>> dir = array([1, 0])\n>>> pdist = array([-0.5, -0.2, 0.1, 0.3])\n>>> rng = Generator(PCG64(123))\n>>> proj = points_on_line(ctr, dir, pdist)\n>>> clupoints_n_1_template(proj, 0, dir, lambda p, l, r: r.random(p), rng=rng)\narray([[-0.5       ,  0.68235186],\n       [-0.2       , -0.05382102],\n       [ 0.1       ,  0.22035987],\n       [ 0.3       , -0.18437181]])\n

Parameters:

Name Type Description Default projs NDArray

Point projections on the cluster-supporting line ( \\(p \\times n\\) matrix).

required lat_disp float

Dispersion of points from their projection.

required clu_dir NDArray

Direction of the cluster-supporting line (unit vector).

required dist_fn Callable[[int, float, Generator], NDArray]

Function to place points on a second line, orthogonal to the first. The function accepts as parameters the number of points in the current cluster, the lateral_disp parameter (the same passed to the clugen() function), and a random number generator, returning a vector containing the distance of each point to its projection on the cluster-supporting line.

required rng Generator

An optional pseudo-random number generator for reproducible executions.

_default_rng

Returns:

Type Description NDArray

Generated points ( \\(p \\times n\\) matrix).

Source code in pyclugen/helper.py
def clupoints_n_1_template(\n    projs: NDArray,\n    lat_disp: float,\n    clu_dir: NDArray,\n    dist_fn: Callable[[int, float, Generator], NDArray],\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Create $p$ points from their $n$-D projections on a cluster-supporting line.\n\n    Each point is placed on a hyperplane orthogonal to that line and centered at\n    the point's projection. The function specified in `dist_fn` is used to perform\n    the actual placement.\n\n    This function is used internally by\n    [`clupoints_n_1()`][pyclugen.module.clupoints_n_1] and may be useful for\n    constructing user-defined final point placement strategies for the `point_dist_fn`\n    parameter of the main [`clugen()`][pyclugen.main.clugen] function.\n\n    Examples:\n        >>> from numpy import array, zeros\n        >>> from numpy.random import Generator, PCG64\n        >>> from pyclugen import clupoints_n_1_template, points_on_line\n        >>> ctr = zeros(2)\n        >>> dir = array([1, 0])\n        >>> pdist = array([-0.5, -0.2, 0.1, 0.3])\n        >>> rng = Generator(PCG64(123))\n        >>> proj = points_on_line(ctr, dir, pdist)\n        >>> clupoints_n_1_template(proj, 0, dir, lambda p, l, r: r.random(p), rng=rng)\n        array([[-0.5       ,  0.68235186],\n               [-0.2       , -0.05382102],\n               [ 0.1       ,  0.22035987],\n               [ 0.3       , -0.18437181]])\n\n    Args:\n      projs: Point projections on the cluster-supporting line ( $p \\times n$ matrix).\n      lat_disp: Dispersion of points from their projection.\n      clu_dir: Direction of the cluster-supporting line (unit vector).\n      dist_fn: Function to place points on a second line, orthogonal to the first.\n        The functions accepts as parameters the number of points in the current\n        cluster, the `lateral_disp` parameter (the same passed to the\n        [`clugen()`][pyclugen.main.clugen] function), and a random number generator,\n        
returning a vector containing the distance of each point to its projection\n        on the cluster-supporting line.\n      rng: An optional pseudo-random number generator for reproducible executions.\n\n    Returns:\n      Generated points ( $p \\times n$ matrix).\n    \"\"\"\n    # Number of dimensions\n    num_dims = clu_dir.size\n\n    # Number of points in this cluster\n    clu_num_points = projs.shape[0]\n\n    # Get distances from points to their projections on the line\n    points_dist = dist_fn(clu_num_points, lat_disp, rng)\n\n    # Get normalized vectors, orthogonal to the current line, for each point\n    orth_vecs = zeros((clu_num_points, num_dims))\n\n    for j in range(clu_num_points):\n        orth_vecs[j, :] = rand_ortho_vector(clu_dir, rng=rng).ravel()\n\n    # Set vector magnitudes\n    orth_vecs = abs(points_dist).reshape(-1, 1) * orth_vecs\n\n    # Add perpendicular vectors to point projections on the line,\n    # yielding final cluster points\n    points = projs + orth_vecs\n\n    return points\n
"},{"location":"reference/#pyclugen.clusizes","title":"clusizes","text":"
clusizes(\n    num_clusters: int,\n    num_points: int,\n    allow_empty: bool,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Determine cluster sizes, i.e., the number of points in each cluster.

Cluster sizes are determined using the normal distribution ( \\(\\mu=\\)num_points \\(/\\)num_clusters, \\(\\sigma=\\mu/3\\)), and then ensuring that the final cluster sizes add up to num_points via the fix_num_points() function.

Examples:

>>> from numpy.random import Generator, PCG64\n>>> from pyclugen import clusizes\n>>> prng = Generator(PCG64(123))\n>>> sizes = clusizes(4, 1000, True, rng=prng)\n>>> sizes\narray([166, 217, 354, 263])\n>>> sum(sizes)\n1000\n

Parameters:

Name Type Description Default num_clusters int

Number of clusters.

required num_points int

Total number of points.

required allow_empty bool

Allow empty clusters?

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Number of points in each cluster (vector of size num_clusters).

Source code in pyclugen/module.py
def clusizes(\n    num_clusters: int,\n    num_points: int,\n    allow_empty: bool,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Determine cluster sizes, i.e., the number of points in each cluster.\n\n    Cluster sizes are determined using the normal distribution (\n    $\\mu=$`num_points` $/$`num_clusters`, $\\sigma=\\mu/3$), and then\n    assuring that the final cluster sizes add up to `num_points` via the\n    [`fix_num_points()`][pyclugen.helper.fix_num_points] function.\n\n    Examples:\n        >>> from numpy.random import Generator, PCG64\n        >>> from pyclugen import clusizes\n        >>> prng = Generator(PCG64(123))\n        >>> sizes = clusizes(4, 1000, True, rng=prng)\n        >>> sizes\n        array([166, 217, 354, 263])\n        >>> sum(sizes)\n        1000\n\n    Args:\n      num_clusters: Number of clusters.\n      num_points: Total number of points.\n      allow_empty: Allow empty clusters?\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Number of points in each cluster (vector of size `num_clusters`).\n    \"\"\"\n    # Determine number of points in each cluster using the normal distribution\n\n    # Consider the mean an equal division of points between clusters\n    mean = num_points / num_clusters\n    # The standard deviation is such that the interval [0, 2 * mean] will contain\n    # \u224899.7% of cluster sizes\n    std = mean / 3\n\n    # Determine points with the normal distribution\n    clu_num_points = std * rng.normal(size=num_clusters) + mean\n\n    # Set negative values to zero\n    clu_num_points = where(clu_num_points > 0, clu_num_points, 0)\n\n    # Fix imbalances, so that num_points is respected\n    if sum(clu_num_points) > 0:  # Be careful not to divide by zero\n        clu_num_points *= num_points / sum(clu_num_points)\n\n    # Round the real values to integers since a cluster sizes is represented by\n    # an integer\n    clu_num_points = rint(clu_num_points).astype(int)\n\n    
# Make sure total points is respected, which may not be the case at this time due\n    # to rounding\n    fix_num_points(clu_num_points, num_points)\n\n    # If empty clusters are not allowed, make sure there aren't any\n    if not allow_empty:\n        fix_empty(clu_num_points)\n\n    return clu_num_points\n
"},{"location":"reference/#pyclugen.fix_empty","title":"fix_empty","text":"
fix_empty(clu_num_points: NDArray, allow_empty: bool = False) -> NDArray\n

Certifies that, given enough points, no clusters are left empty.

This is done by removing a point from the largest cluster and adding it to an empty cluster while there are empty clusters. If the total number of points is smaller than the number of clusters (or if the allow_empty parameter is set to true), this function does nothing.

This function is used internally by clusizes() and might be useful for custom cluster sizing implementations given as the clusizes_fn parameter of the main clugen() function.

Note that the array is changed in-place.

Examples:

>>> from numpy import array\n>>> from pyclugen import fix_empty\n>>> clusters = array([3, 4, 5, 0, 0])\n>>> fix_empty(clusters)\narray([3, 3, 4, 1, 1])\n>>> clusters # Verify that the array was changed in-place\narray([3, 3, 4, 1, 1])\n

Parameters:

Name Type Description Default clu_num_points NDArray

Number of points in each cluster (vector of size \\(c\\)), where \\(c\\) is the number of clusters.

required allow_empty bool

Allow empty clusters?

False

Returns:

Type Description NDArray

Number of points in each cluster, after being fixed by this function (vector of size \\(c\\), which is the same reference as clu_num_points).

Source code in pyclugen/helper.py
def fix_empty(clu_num_points: NDArray, allow_empty: bool = False) -> NDArray:\n    r\"\"\"Certifies that, given enough points, no clusters are left empty.\n\n    This is done by removing a point from the largest cluster and adding it to an\n    empty cluster while there are empty clusters. If the total number of points is\n    smaller than the number of clusters (or if the `allow_empty` parameter is set\n    to `true`), this function does nothing.\n\n    This function is used internally by [`clusizes()`][pyclugen.module.clusizes]\n    and might be useful for custom cluster sizing implementations given as the\n    `clusizes_fn` parameter of the main [`clugen()`][pyclugen.main.clugen] function.\n\n    Note that the array is changed in-place.\n\n    Examples:\n        >>> from numpy import array\n        >>> from pyclugen import fix_empty\n        >>> clusters = array([3, 4, 5, 0, 0])\n        >>> fix_empty(clusters)\n        array([3, 3, 4, 1, 1])\n        >>> clusters # Verify that the array was changed in-place\n        array([3, 3, 4, 1, 1])\n\n    Args:\n      clu_num_points: Number of points in each cluster (vector of size $c$),\n        where $c$ is the number of clusters.\n      allow_empty: Allow empty clusters?\n\n    Returns:\n      Number of points in each cluster, after being fixed by this function (vector\n        of size $c$, which is the same reference than `clu_num_points`).\n    \"\"\"\n    # If the allow_empty parameter is set to true, don't do anything and return\n    # immediately; this is useful for quick `clusizes_fn` one-liners\n    if not allow_empty:\n        # Find empty clusters\n        empty_clusts = [idx for idx, val in enumerate(clu_num_points) if val == 0]\n\n        # If there are empty clusters and enough points for all clusters...\n        if len(empty_clusts) > 0 and sum(clu_num_points) >= clu_num_points.size:\n            # Go through the empty clusters...\n            for i0 in empty_clusts:\n                # ...get a point from 
the largest cluster and assign it to the\n                # current empty cluster\n                imax = argmax(clu_num_points)\n                clu_num_points[imax] -= 1\n                clu_num_points[i0] += 1\n\n    return clu_num_points\n
"},{"location":"reference/#pyclugen.fix_num_points","title":"fix_num_points","text":"
fix_num_points(clu_num_points: NDArray, num_points: int) -> NDArray\n

Certifies that the values in the clu_num_points array add up to num_points.

If this is not the case, the clu_num_points array is modified in-place, incrementing the value corresponding to the smallest cluster while sum(clu_num_points) < num_points, or decrementing the value corresponding to the largest cluster while sum(clu_num_points) > num_points.

This function is used internally by clusizes() and might be useful for custom cluster sizing implementations given as the clusizes_fn parameter of the main clugen() function.

Examples:

>>> from numpy import array\n>>> from pyclugen import fix_num_points\n>>> clusters = array([1, 6, 3])  # 10 total points\n>>> fix_num_points(clusters, 12) # But we want 12 total points\narray([3, 6, 3])\n>>> clusters # Verify that the array was changed in-place\narray([3, 6, 3])\n

Parameters:

Name Type Description Default clu_num_points NDArray

Number of points in each cluster (vector of size \\(c\\)), where \\(c\\) is the number of clusters.

required num_points int

The expected total number of points.

required

Returns:

Type Description NDArray

Number of points in each cluster, after being fixed by this function (vector of size \\(c\\), which is the same reference as clu_num_points).

Source code in pyclugen/helper.py
def fix_num_points(clu_num_points: NDArray, num_points: int) -> NDArray:\n    r\"\"\"Certifies that the values in the `clu_num_points` array add up to `num_points`.\n\n    If this is not the case, the `clu_num_points` array is modified in-place,\n    incrementing the value corresponding to the smallest cluster while\n    `sum(clu_num_points) < num_points`, or decrementing the value corresponding to\n    the largest cluster while `sum(clu_num_points) > num_points`.\n\n    This function is used internally by [`clusizes()`][pyclugen.module.clusizes]\n    and might be useful for custom cluster sizing implementations given as the\n    `clusizes_fn` parameter of the main [`clugen()`][pyclugen.main.clugen] function.\n\n    Examples:\n        >>> from numpy import array\n        >>> from pyclugen import fix_num_points\n        >>> clusters = array([1, 6, 3])  # 10 total points\n        >>> fix_num_points(clusters, 12) # But we want 12 total points\n        array([3, 6, 3])\n        >>> clusters # Verify that the array was changed in-place\n        array([3, 6, 3])\n\n    Args:\n      clu_num_points: Number of points in each cluster (vector of size $c$),\n        where $c$ is the number of clusters.\n      num_points: The expected total number of points.\n\n    Returns:\n      Number of points in each cluster, after being fixed by this function (vector\n        of size $c$, which is the same reference than `clu_num_points`).\n    \"\"\"\n    while sum(clu_num_points) < num_points:\n        imin = argmin(clu_num_points)\n        clu_num_points[imin] += 1\n    while sum(clu_num_points) > num_points:\n        imax = argmax(clu_num_points)\n        clu_num_points[imax] -= 1\n\n    return clu_num_points\n
"},{"location":"reference/#pyclugen.llengths","title":"llengths","text":"
llengths(\n    num_clusters: int,\n    llength: float,\n    llength_disp: float,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Determine length of cluster-supporting lines.

Line lengths are determined using the folded normal distribution ( \\(\\mu=\\)llength, \\(\\sigma=\\)llength_disp).

Examples:

>>> from numpy.random import Generator, MT19937\n>>> from pyclugen import llengths\n>>> prng = Generator(MT19937(123))\n>>> llengths(4, 20, 3.5, rng=prng)\narray([19.50968733, 19.92482858, 25.99013804, 18.58029672])\n

Parameters:

Name Type Description Default num_clusters int

Number of clusters.

required llength float

Average line length.

required llength_disp float

Line length dispersion.

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Lengths of cluster-supporting lines (vector of size num_clusters).

Source code in pyclugen/module.py
def llengths(\n    num_clusters: int,\n    llength: float,\n    llength_disp: float,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Determine length of cluster-supporting lines.\n\n    Line lengths are determined using the folded normal distribution (\n    $\\mu=$`llength`, $\\sigma=$`llength_disp`).\n\n    Examples:\n        >>> from numpy.random import Generator, MT19937\n        >>> from pyclugen import llengths\n        >>> prng = Generator(MT19937(123))\n        >>> llengths(4, 20, 3.5, rng=prng)\n        array([19.50968733, 19.92482858, 25.99013804, 18.58029672])\n\n    Args:\n      num_clusters: Number of clusters.\n      llength: Average line length.\n      llength_disp: Line length dispersion.\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Lengths of cluster-supporting lines (vector of size `num_clusters`).\n    \"\"\"\n    return abs(llength + llength_disp * rng.normal(size=num_clusters))\n
"},{"location":"reference/#pyclugen.points_on_line","title":"points_on_line","text":"
points_on_line(\n    center: NDArray, direction: NDArray, dist_center: NDArray\n) -> NDArray\n

Determine coordinates of points on a line.

Determine coordinates of points on a line with center and direction, based on the distances from the center given in dist_center.

This works by using the vector formulation of the line equation assuming direction is a \\(n\\)-dimensional unit vector. In other words, considering \\(\\mathbf{d}=\\)direction.reshape(-1,1) ( \\(n \\times 1\\) vector), \\(\\mathbf{c}=\\)center.reshape(-1,1) ( \\(n \\times 1\\) vector), and \\(\\mathbf{w}=\\) dist_center.reshape(-1,1) ( \\(p \\times 1\\) vector), the coordinates of points on the line are given by:

\\[ \\mathbf{P}=\\mathbf{1}\\,\\mathbf{c}^T + \\mathbf{w}\\mathbf{d}^T \\]

where \\(\\mathbf{P}\\) is the \\(p \\times n\\) matrix of point coordinates on the line, and \\(\\mathbf{1}\\) is a \\(p \\times 1\\) vector with all entries equal to 1.

Examples:

>>> from pyclugen import points_on_line\n>>> from numpy import array, linspace\n>>> points_on_line(array([5., 5.]),\n...                array([1., 0.]),\n...                linspace(-4, 4, 5)) # 2D, 5 points\narray([[1., 5.],\n       [3., 5.],\n       [5., 5.],\n       [7., 5.],\n       [9., 5.]])\n>>> points_on_line(array([-2, 0, 0., 2]),\n...                array([0., 0, -1, 0]),\n...                array([10, -10])) # 4D, 2 points\narray([[ -2.,   0., -10.,   2.],\n       [ -2.,   0.,  10.,   2.]])\n

Parameters:

Name Type Description Default center NDArray

Center of the line ( \\(n\\)-component vector).

required direction NDArray

Line direction ( \\(n\\)-component unit vector).

required dist_center NDArray

Distance of each point to the center of the line ( \\(p\\)-component vector, where \\(p\\) is the number of points).

required

Returns:

Type Description NDArray

Coordinates of points on the specified line ( \\(p \\times n\\) matrix).

Source code in pyclugen/core.py
def points_on_line(\n    center: NDArray, direction: NDArray, dist_center: NDArray\n) -> NDArray:\n    r\"\"\"Determine coordinates of points on a line.\n\n    Determine coordinates of points on a line with `center` and `direction`,\n    based on the distances from the center given in `dist_center`.\n\n    This works by using the vector formulation of the line equation assuming\n    `direction` is a $n$-dimensional unit vector. In other words, considering\n    $\\mathbf{d}=$`direction.reshape(-1,1)` ( $n \\times 1$ vector),\n    $\\mathbf{c}=$`center.reshape(-1,1)` ( $n \\times 1$ vector), and\n    $\\mathbf{w}=$ `dist_center.reshape(-1,1)` ( $p \\times 1$ vector),\n    the coordinates of points on the line are given by:\n\n    $$\n    \\mathbf{P}=\\mathbf{1}\\,\\mathbf{c}^T + \\mathbf{w}\\mathbf{d}^T\n    $$\n\n    where $\\mathbf{P}$ is the $p \\times n$ matrix of point coordinates on the\n    line, and $\\mathbf{1}$ is a $p \\times 1$ vector with all entries equal to 1.\n\n    Examples:\n        >>> from pyclugen import points_on_line\n        >>> from numpy import array, linspace\n        >>> points_on_line(array([5., 5.]),\n        ...                array([1., 0.]),\n        ...                linspace(-4, 4, 5)) # 2D, 5 points\n        array([[1., 5.],\n               [3., 5.],\n               [5., 5.],\n               [7., 5.],\n               [9., 5.]])\n        >>> points_on_line(array([-2, 0, 0., 2]),\n        ...                array([0., 0, -1, 0]),\n        ...                
array([10, -10])) # 4D, 2 points\n        array([[ -2.,   0., -10.,   2.],\n               [ -2.,   0.,  10.,   2.]])\n\n    Args:\n      center: Center of the line ( $n$-component vector).\n      direction: Line direction ( $n$-component unit vector).\n      dist_center: Distance of each point to the center of the line\n        ( $p$-component vector, where $p$ is the number of points).\n\n    Returns:\n      Coordinates of points on the specified line ( $p \\times n$ matrix).\n    \"\"\"\n    return center.reshape(1, -1) + dist_center.reshape(-1, 1) @ direction.reshape(\n        (1, -1)\n    )\n
"},{"location":"reference/#pyclugen.rand_ortho_vector","title":"rand_ortho_vector","text":"
rand_ortho_vector(u: NDArray, rng: Generator = _default_rng) -> NDArray\n

Get a random unit vector orthogonal to u.

Note that u is expected to be a unit vector itself.

Examples:

>>> from pyclugen import rand_ortho_vector\n>>> from numpy import isclose, dot\n>>> from numpy.linalg import norm\n>>> from numpy.random import Generator, PCG64\n>>> rng = Generator(PCG64(123))\n>>> r = rng.random(3) # Get a random vector with 3 components (3D)\n>>> r = r / norm(r) # Normalize it\n>>> r_ort = rand_ortho_vector(r, rng=rng) # Get random unit vector orth. to r\n>>> r_ort\narray([-0.1982903 , -0.61401512,  0.76398062])\n>>> isclose(dot(r, r_ort), 0) # Check that vectors are indeed orthogonal\nTrue\n

Parameters:

Name Type Description Default u NDArray

Unit vector with \\(n\\) components.

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

A random unit vector with \\(n\\) components orthogonal to u.

Source code in pyclugen/core.py
def rand_ortho_vector(u: NDArray, rng: Generator = _default_rng) -> NDArray:\n    r\"\"\"Get a random unit vector orthogonal to `u`.\n\n    Note that `u` is expected to be a unit vector itself.\n\n    Examples:\n        >>> from pyclugen import rand_ortho_vector\n        >>> from numpy import isclose, dot\n        >>> from numpy.linalg import norm\n        >>> from numpy.random import Generator, PCG64\n        >>> rng = Generator(PCG64(123))\n        >>> r = rng.random(3) # Get a random vector with 3 components (3D)\n        >>> r = r / norm(r) # Normalize it\n        >>> r_ort = rand_ortho_vector(r, rng=rng) # Get random unit vector orth. to r\n        >>> r_ort\n        array([-0.1982903 , -0.61401512,  0.76398062])\n        >>> isclose(dot(r, r_ort), 0) # Check that vectors are indeed orthogonal\n        True\n\n    Args:\n      u: Unit vector with $n$ components.\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      A random unit vector with $n$ components orthogonal to `u`.\n    \"\"\"\n    # If 1D, just return a random unit vector\n    if u.size == 1:\n        return rand_unit_vector(1, rng=rng)\n\n    # Find a random, non-parallel vector to u\n    while True:\n        # Find normalized random vector\n        r = rand_unit_vector(u.size, rng=rng)\n\n        # If not parallel to u we can keep it and break the loop\n        if not isclose(abs(dot(u, r)), 1):\n            break\n\n    # Get vector orthogonal to u using 1st iteration of Gram-Schmidt process\n    v = r - dot(u, r) / dot(u, u) * u\n\n    # Normalize it\n    v = v / norm(v)\n\n    # And return it\n    return v\n
"},{"location":"reference/#pyclugen.rand_unit_vector","title":"rand_unit_vector","text":"
rand_unit_vector(num_dims: int, rng: Generator = _default_rng) -> NDArray\n

Get a random unit vector with num_dims components.

Examples:

>>> from pyclugen import rand_unit_vector\n>>> rand_unit_vector(4)\narray([ 0.48653889,  0.50753862,  0.05711487, -0.70881757])\n
>>> from pyclugen import rand_unit_vector\n>>> from numpy.random import Generator, PCG64\n>>> rng = Generator(PCG64(123))\n>>> rand_unit_vector(2, rng=rng) # Reproducible\narray([ 0.3783202 , -0.92567479])\n

Parameters:

Name Type Description Default num_dims int

Number of components in vector (i.e. vector size).

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

A random unit vector with num_dims components.

Source code in pyclugen/core.py
def rand_unit_vector(num_dims: int, rng: Generator = _default_rng) -> NDArray:\n    r\"\"\"Get a random unit vector with `num_dims` components.\n\n    Examples:\n        >>> from pyclugen import rand_unit_vector\n        >>> rand_unit_vector(4) # doctest: +SKIP\n        array([ 0.48653889,  0.50753862,  0.05711487, -0.70881757])\n\n        >>> from pyclugen import rand_unit_vector\n        >>> from numpy.random import Generator, PCG64\n        >>> rng = Generator(PCG64(123))\n        >>> rand_unit_vector(2, rng=rng) # Reproducible\n        array([ 0.3783202 , -0.92567479])\n\n    Args:\n      num_dims: Number of components in vector (i.e. vector size).\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      A random unit vector with `num_dims` components.\n    \"\"\"\n    r = rng.random(num_dims) - 0.5\n    r = r / norm(r)\n    return r\n
"},{"location":"reference/#pyclugen.rand_vector_at_angle","title":"rand_vector_at_angle","text":"
rand_vector_at_angle(\n    u: NDArray, angle: float, rng: Generator = _default_rng\n) -> NDArray\n

Get a random unit vector which is at angle radians of vector u.

Note that u is expected to be a unit vector itself.

Examples:

>>> from pyclugen import rand_vector_at_angle\n>>> from numpy import arccos, array, degrees, pi, dot\n>>> from numpy.linalg import norm\n>>> from numpy.random import Generator, PCG64\n>>> rng = Generator(PCG64(123))\n>>> u = array([ 1.0, 0, 0.5, -0.5 ]) # Define a 4D vector\n>>> u = u / norm(u) # Normalize the vector\n>>> v = rand_vector_at_angle(u, pi/4, rng=rng) # Get a vector at 45 degrees\n>>> v\narray([ 0.633066  , -0.50953554, -0.10693823, -0.57285705])\n>>> degrees(arccos(dot(u, v) / norm(u) * norm(v))) # Angle between u and v\n45.0\n

Parameters:

Name Type Description Default u NDArray

Unit vector with \\(n\\) components.

required angle float

Angle in radians.

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Random unit vector with \\(n\\) components which is at angle radians with vector u.

Source code in pyclugen/core.py
def rand_vector_at_angle(\n    u: NDArray, angle: float, rng: Generator = _default_rng\n) -> NDArray:\n    r\"\"\"Get a random unit vector which is at `angle` radians of vector `u`.\n\n    Note that `u` is expected to be a unit vector itself.\n\n    Examples:\n        >>> from pyclugen import rand_vector_at_angle\n        >>> from numpy import arccos, array, degrees, pi, dot\n        >>> from numpy.linalg import norm\n        >>> from numpy.random import Generator, PCG64\n        >>> rng = Generator(PCG64(123))\n        >>> u = array([ 1.0, 0, 0.5, -0.5 ]) # Define a 4D vector\n        >>> u = u / norm(u) # Normalize the vector\n        >>> v = rand_vector_at_angle(u, pi/4, rng=rng) # Get a vector at 45 degrees\n        >>> v\n        array([ 0.633066  , -0.50953554, -0.10693823, -0.57285705])\n        >>> degrees(arccos(dot(u, v) / norm(u) * norm(v))) # Angle between u and v\n        45.0\n\n    Args:\n      u: Unit vector with $n$ components.\n      angle: Angle in radians.\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Random unit vector with $n$ components which is at `angle` radians\n        with vector `u`.\n    \"\"\"\n    if isclose(abs(angle), pi / 2) and u.size > 1:\n        return rand_ortho_vector(u, rng=rng)\n    elif -pi / 2 < angle < pi / 2 and u.size > 1:\n        v = u + rand_ortho_vector(u, rng=rng) * tan(angle)\n        return v / norm(v)\n    else:\n        # For |\u03b8| > \u03c0/2 or the 1D case, simply return a random vector\n        return rand_unit_vector(u.size, rng=rng)\n
"},{"location":"theory/","title":"Theory","text":"

This section presents a general overview of the clugen algorithm. A complete description of the algorithm's theoretical framework is available in the article \"Generating multidimensional clusters with support lines\" (an open version is available on arXiv).

Clugen is an algorithm for generating multidimensional clusters. Each cluster is supported by a line segment, the position, orientation and length of which guide where the respective points are placed. For brevity, line segments will be referred to as lines.

Given an \\(n\\)-dimensional direction vector \\(\\mathbf{d}\\) (and a number of additional parameters, which will be discussed shortly), the clugen algorithm works as follows (\\(^*\\) means the algorithm step is stochastic):

  1. Normalize \\(\\mathbf{d}\\).
  2. \\(^*\\)Determine cluster sizes.
  3. \\(^*\\)Determine cluster centers.
  4. \\(^*\\)Determine lengths of cluster-supporting lines.
  5. \\(^*\\)Determine angles between \\(\\mathbf{d}\\) and cluster-supporting lines.
  6. For each cluster:
     6.1. \\(^*\\)Determine direction of the cluster-supporting line.
     6.2. \\(^*\\)Determine distance of point projections from the center of the cluster-supporting line.
     6.3. Determine coordinates of point projections on the cluster-supporting line.
     6.4. \\(^*\\)Determine points from their projections on the cluster-supporting line.

Figure 1 provides a stylized overview of the algorithm's steps.

The example in Figure 1 was generated with the following parameters, the exact meaning of which will be discussed shortly:

Parameter values Description \\(n=2\\) Number of dimensions. \\(c=4\\) Number of clusters. \\(p=200\\) Total number of points. \\(\\mathbf{d}=\\begin{bmatrix}1 & 1\\end{bmatrix}^T\\) Average direction. \\(\\theta_\\sigma=\\pi/16\\approx{}11.25^{\\circ}\\) Angle dispersion. \\(\\mathbf{s}=\\begin{bmatrix}10 & 10\\end{bmatrix}^T\\) Average cluster separation. \\(l=10\\) Average line length. \\(l_\\sigma=1.5\\) Line length dispersion. \\(f_\\sigma=1\\) Cluster lateral dispersion.

Additionally, all optional parameters (not listed above) were left at their default values. The complete list of parameters is presented in the clugen() function documentation.

"},{"location":"generated/gallery/","title":"Examples","text":""},{"location":"generated/gallery/#examples","title":"Examples","text":"

Examples in 1D

Examples in 2D

Examples in 3D

Examples in nD

Merging and hierarchical cluster examples

Plot functions

Download all examples in Python source code: gallery_python.zip

Download all examples in Jupyter notebooks: gallery_jupyter.zip

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/mg_execution_times/","title":"Computation times","text":"

00:26.400 total execution time for generated_gallery files:

+----------------------------------------------------------------------------------------+-----------+--------+ | plot_2_2d_examples (docs/examples/plot_2_2d_examples.py) | 00:10.292 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_4_nd_examples (docs/examples/plot_4_nd_examples.py) | 00:06.376 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_3_3d_examples (docs/examples/plot_3_3d_examples.py) | 00:05.237 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_5_mrg_examples (docs/examples/plot_5_mrg_examples.py) | 00:03.156 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_1_1d_examples (docs/examples/plot_1_1d_examples.py) | 00:01.333 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_functions (docs/examples/plot_functions.py) | 00:00.005 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+

"},{"location":"generated/gallery/plot_1_1d_examples/","title":"Examples in 1D","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_1_1d_examples/#examples-in-1d","title":"Examples in 1D","text":"

This section contains several examples on how to generate 1D data with pyclugen. To run the examples we first need to import the clugen() function:

from pyclugen import clugen\n

To plot these examples we use the plot_examples_1d function:

from plot_functions import plot_examples_1d\n

Out:

/home/runner/work/pyclugen/pyclugen/docs/docs/examples/plot_functions.py:15: DeprecationWarning: \nPyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\nbut was not found to be installed on your system.\nIf this would cause problems for you,\nplease provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n\n  import pandas as pd\n
"},{"location":"generated/gallery/plot_1_1d_examples/#basic-1d-example-with-density-plot","title":"Basic 1D example with density plot","text":"
seed = 23456\n
# Custom proj_dist_fn: point projections placed using the Weibull distribution\ndef proj_weibull(len, n, rng):\n    return len / 2 * rng.weibull(1.5, size=n)\n
e082 = clugen(1, 3, 1000, [1], 0, [10], 6, 1.5, 0, rng=seed)\ne083 = clugen(1, 3, 1000, [1], 0, [10], 6, 1.5, 0, rng=seed, proj_dist_fn=\"unif\")\ne084 = clugen(1, 3, 1000, [1], 0, [10], 6, 1.5, 0, rng=seed, proj_dist_fn=proj_weibull)\n
plot_examples_1d(\n    e082, \"e082: proj_dist_fn = 'norm' (default)\",\n    e083, \"e083: proj_dist_fn = 'unif'\",\n    e084, \"e084: custom proj_dist_fn (Weibull)\")\n

Total running time of the script: ( 0 minutes 1.333 seconds)

Download Python source code: plot_1_1d_examples.py

Download Jupyter notebook: plot_1_1d_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_2_2d_examples/","title":"Examples in 2D","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_2_2d_examples/#examples-in-2d","title":"Examples in 2D","text":"

This section contains several examples on how to generate 2D data with pyclugen. To run the examples we first need to import the clugen() function:

import numpy as np\nfrom pyclugen import clugen\n

To plot these examples we use the plot_examples_2d function:

from plot_functions import plot_examples_2d\n
"},{"location":"generated/gallery/plot_2_2d_examples/#manipulating-the-direction-of-cluster-supporting-lines","title":"Manipulating the direction of cluster-supporting lines","text":""},{"location":"generated/gallery/plot_2_2d_examples/#using-the-direction-parameter","title":"Using the direction parameter","text":"
seed = 123\n
e001 = clugen(2, 4, 2000, [1, 0], 0, [10, 10], 10, 1.5, 0.5, rng=seed)\ne002 = clugen(2, 4, 200, [1, 1], 0, [10, 10], 10, 1.5, 0.5, rng=seed)\ne003 = clugen(2, 4, 200, [0, 1], 0, [10, 10], 10, 1.5, 0.5, rng=seed)\n
plot_examples_2d(\n    e001, \"e001: direction = [1, 0]\",\n    e002, \"e002: direction = [1, 1]\",\n    e003, \"e003: direction = [0, 1]\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#changing-the-angle_disp-parameter-and-using-a-custom-angle_deltas_fn-function","title":"Changing the angle_disp parameter and using a custom angle_deltas_fn function","text":"
seed = 321\n
# Custom angle_deltas function: arbitrarily rotate some clusters by 90 degrees\ndef angdel_90_fn(nclu, astd, rng):\n    return rng.choice([0, np.pi / 2], size=nclu)\n
e004 = clugen(2, 6, 500, [1, 0], 0, [10, 10], 10, 1.5, 0.5, rng=seed)\ne005 = clugen(2, 6, 500, [1, 0], np.pi / 8, [10, 10], 10, 1.5, 0.5, rng=seed)\ne006 = clugen(2, 6, 500, [1, 0], 0, [10, 10], 10, 1.5, 0.5, rng=seed,\n    angle_deltas_fn=angdel_90_fn)\n
plot_examples_2d(\n    e004, \"e004: angle_disp = 0\",\n    e005, \"e005: angle_disp = \u03c0/8\",\n    e006, \"e006: custom angle_deltas function\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#manipulating-the-length-of-cluster-supporting-lines","title":"Manipulating the length of cluster-supporting lines","text":""},{"location":"generated/gallery/plot_2_2d_examples/#using-the-llength-parameter","title":"Using the llength parameter","text":"
seed = 567\n
e007 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10],  0, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne008 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 10, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne009 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 30, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\n
plot_examples_2d(\n    e007, \"e007: llength = 0\",\n    e008, \"e008: llength = 10\",\n    e009, \"e009: llength = 30\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#changing-the-llength_disp-parameter-and-using-a-custom-llengths_fn-function","title":"Changing the llength_disp parameter and using a custom llengths_fn function","text":"
seed = 567\n
# Custom llengths function: line lengths grow for each new cluster\ndef llen_grow_fn(nclu, llen, llenstd, rng):\n    return llen * np.arange(nclu) + rng.normal(scale=llenstd, size=nclu)\n
e010 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 15,  0.0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne011 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 15, 10.0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne012 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 10,  0.1, 0.5, rng=seed,\n    llengths_fn=llen_grow_fn, point_dist_fn=\"n\")\n
plot_examples_2d(\n    e010, \"e010: llength_disp = 0.0\",\n    e011, \"e011: llength_disp = 10.0\",\n    e012, \"e012: custom llengths function\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#manipulating-relative-cluster-positions","title":"Manipulating relative cluster positions","text":""},{"location":"generated/gallery/plot_2_2d_examples/#using-the-cluster_sep-parameter","title":"Using the cluster_sep parameter","text":"
seed = 21\n
e013 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 10], 10, 2, 2.5, rng=seed)\ne014 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [30, 10], 10, 2, 2.5, rng=seed)\ne015 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 30], 10, 2, 2.5, rng=seed)\n
plt = plot_examples_2d(\n    e013, \"e013: cluster_sep = [10, 10]\",\n    e014, \"e014: cluster_sep = [30, 10]\",\n    e015, \"e015: cluster_sep = [10, 30]\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#changing-the-cluster_offset-parameter-and-using-a-custom-clucenters_fn-function","title":"Changing the cluster_offset parameter and using a custom clucenters_fn function","text":"
seed = 21\n
# Custom clucenters function: places clusters in a diagonal\ndef centers_diag_fn(nclu, csep, coff, rng):\n    return np.ones((nclu, len(csep))) * np.arange(1, nclu + 1)[:, None] * np.max(csep) + coff\n
e016 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 10], 10, 2, 2.5, rng=seed)\ne017 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 10], 10, 2, 2.5, rng=seed,\n    cluster_offset=[20, -20])\ne018 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 10], 10, 2, 2.5, rng=seed,\n    cluster_offset=[-50, -50], clucenters_fn=centers_diag_fn)\n
plt = plot_examples_2d(\n    e016, \"e016: default\",\n    e017, \"e017: cluster_offset = [20, -20]\",\n    e018, \"e018: custom clucenters function\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#lateral-dispersion-and-placement-of-point-projections-on-the-line","title":"Lateral dispersion and placement of point projections on the line","text":""},{"location":"generated/gallery/plot_2_2d_examples/#normal-projection-placement-default-proj_dist_fn-norm","title":"Normal projection placement (default): proj_dist_fn = \"norm\"","text":"
seed = 654\n
e019 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 0.0, rng=seed)\ne020 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 1.0, rng=seed)\ne021 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 3.0, rng=seed)\n
plt = plot_examples_2d(\n    e019, \"e019: lateral_disp = 0\",\n    e020, \"e020: lateral_disp = 1\",\n    e021, \"e021: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#uniform-projection-placement-proj_dist_fn-unif","title":"Uniform projection placement: proj_dist_fn = \"unif\"","text":"
seed = 654\n
e022 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 0.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne023 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 1.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne024 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 3.0, rng=seed,\n    proj_dist_fn=\"unif\")\n
plt = plot_examples_2d(\n    e022, \"e022: lateral_disp = 0\",\n    e023, \"e023: lateral_disp = 1\",\n    e024, \"e024: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#custom-projection-placement-using-the-laplace-distribution","title":"Custom projection placement using the Laplace distribution","text":"
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e025 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 0.0, rng=seed,\n    proj_dist_fn=proj_laplace)\ne026 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 1.0, rng=seed,\n    proj_dist_fn=proj_laplace)\ne027 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 3.0, rng=seed,\n    proj_dist_fn=proj_laplace)\n
plt = plot_examples_2d(\n    e025, \"e025: lateral_disp = 0\",\n    e026, \"e026: lateral_disp = 1\",\n    e027, \"e027: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#controlling-final-point-positions-from-their-projections-on-the-cluster-supporting-line","title":"Controlling final point positions from their projections on the cluster-supporting line","text":""},{"location":"generated/gallery/plot_2_2d_examples/#points-on-hyperplane-orthogonal-to-cluster-supporting-line-default-point_dist_fn-n-1","title":"Points on hyperplane orthogonal to cluster-supporting line (default): point_dist_fn = \"n-1\"","text":"
seed = 1357\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e028 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed)\ne029 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne030 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    proj_dist_fn=proj_laplace)\n
plt = plot_examples_2d(\n    e028, \"e028: proj_dist_fn=\\\"norm\\\" (default)\",\n    e029, \"e029: proj_dist_fn=\\\"unif\\\"\",\n    e030, \"e030: custom proj_dist_fn (Laplace)\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#points-around-projection-on-cluster-supporting-line-point_dist_fn-n","title":"Points around projection on cluster-supporting line: point_dist_fn = \"n\"","text":"
seed = 1357\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e031 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=\"n\")\ne032 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=\"n\", proj_dist_fn=\"unif\")\ne033 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=\"n\", proj_dist_fn=proj_laplace)\n
plt = plot_examples_2d(\n    e031, \"e031: proj_dist_fn=\\\"norm\\\" (default)\",\n    e032, \"e032: proj_dist_fn=\\\"unif\\\"\",\n    e033, \"e033: custom proj_dist_fn (Laplace)\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#custom-point-placement-using-the-exponential-distribution","title":"Custom point placement using the exponential distribution","text":"

For this example we require the clupoints_n_1_template() helper function:

from pyclugen import clupoints_n_1_template\n
seed = 1357\n
# Custom point_dist_fn: final points placed using the Exponential distribution\ndef clupoints_n_1_exp(projs, lat_std, len, clu_dir, clu_ctr, rng):\n    def dist_exp(npts, lstd, rg):\n        return lstd * rg.exponential(scale=2 / lstd, size=npts)\n    return clupoints_n_1_template(projs, lat_std, clu_dir, dist_exp, rng=rng)\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e034 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=clupoints_n_1_exp)\ne035 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=clupoints_n_1_exp, proj_dist_fn=\"unif\")\ne036 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=clupoints_n_1_exp, proj_dist_fn=proj_laplace)\n
plt = plot_examples_2d(\n    e034, \"e034: proj_dist_fn=\\\"norm\\\" (default)\",\n    e035, \"e035: proj_dist_fn=\\\"unif\\\"\",\n    e036, \"e036: custom proj_dist_fn (Laplace)\")\n

"},{"location":"generated/gallery/plot_2_2d_examples/#manipulating-cluster-sizes","title":"Manipulating cluster sizes","text":"
seed = 963\n
# Custom clusizes_fn (e038): cluster sizes determined via the uniform distribution,\n# no correction for total points\ndef clusizes_unif(nclu, npts, ae, rng):\n    return rng.integers(low=1, high=2 * npts / nclu + 1, size=nclu)\n
# Custom clusizes_fn (e039): clusters all have the same size, no correction for total points\ndef clusizes_equal(nclu, npts, ae, rng):\n    return (npts // nclu) * np.ones(nclu, dtype=int)\n
# Custom clucenters_fn (all): yields fixed positions for the clusters\ndef centers_fixed(nclu, csep, coff, rng):\n    return np.array([[-csep[0], -csep[1]], [csep[0], -csep[1]], [-csep[0], csep[1]], [csep[0], csep[1]]])\n
e037 = clugen(2, 4, 1500, [1, 1], np.pi, [20, 20], 0, 0, 5, rng=seed,\n    point_dist_fn=\"n\", clucenters_fn=centers_fixed)\ne038 = clugen(2, 4, 1500, [1, 1], np.pi, [20, 20], 0, 0, 5, rng=seed,\n    point_dist_fn=\"n\", clucenters_fn=centers_fixed, clusizes_fn=clusizes_unif)\ne039 = clugen(2, 4, 1500, [1, 1], np.pi, [20, 20], 0, 0, 5, rng=seed,\n    point_dist_fn=\"n\", clucenters_fn=centers_fixed, clusizes_fn=clusizes_equal)\n
plt = plot_examples_2d(\n    e037, \"e037: normal dist. (default)\",\n    e038, \"e038: unif. dist. (custom)\",\n    e039, \"e039: equal size (custom)\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#direct-specification-of-optional-parameters","title":"Direct specification of optional parameters","text":"
seed = 123\n
e040 = clugen(2, 4, 1000, [-1, 1], 0, [0, 0], 0, 0, 0.2, rng=seed,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\", clusizes_fn=[50, 200, 500, 2000],\n    llengths_fn=[0, 2, 4, 6], clucenters_fn=[[-5, -5], [-2.5, -2.5], [0, 0], [2.5, 2.5]])\n\ne041 = clugen(2, 5, 1000, [[1, 1], [1, 0], [1, 0], [0, 1], [0, 1]],\n    0, [0, 0], 0, 0, 0.2, rng=seed,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\",\n    clusizes_fn=[200, 500, 500, 500, 500], llengths_fn=[0, 5, 5, 5, 5],\n    clucenters_fn=[[0, 0], [0, 5], [0, -5], [5, 0], [-5, 0]])\n\ne042 = clugen(2, 5, 1000, [[0, 1], [0.25, 0.75], [0.5, 0.5], [0.75, 0.25], [1, 0]],\n    0, [0, 0], 5, 0, 0.2, rng=seed,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\", clusizes_fn=[500, 500, 500, 500, 500],\n    clucenters_fn=[[-5, 0], [-3, -0.3], [-1, -0.8], [1, -1.6], [3, -2.5]])\n
plt = plot_examples_2d(\n    e040, \"e040: direct params 1\",\n    e041, \"e041: direct params 2\",\n    e042, \"e042: direct params 3\")\n

Total running time of the script: ( 0 minutes 10.292 seconds)

Download Python source code: plot_2_2d_examples.py

Download Jupyter notebook: plot_2_2d_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_3_3d_examples/","title":"Examples in 3D","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_3_3d_examples/#examples-in-3d","title":"Examples in 3D","text":"

This section contains several examples on how to generate 3D data with pyclugen. To run the examples we first need to import the clugen() function:

import numpy as np\nfrom pyclugen import clugen\n

To plot these examples we use the plot_examples_3d function:

from plot_functions import plot_examples_3d\n
"},{"location":"generated/gallery/plot_3_3d_examples/#manipulating-the-direction-of-cluster-supporting-lines","title":"Manipulating the direction of cluster-supporting lines","text":""},{"location":"generated/gallery/plot_3_3d_examples/#using-the-direction-parameter","title":"Using the direction parameter","text":"
seed = 321\n
e043 = clugen(3, 4, 500, [1, 0, 0], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\ne044 = clugen(3, 4, 500, [1, 1, 1], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\ne045 = clugen(3, 4, 500, [0, 0, 1], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\n
plt = plot_examples_3d(\n    e043, \"e043: direction = [1, 0, 0]\",\n    e044, \"e044: direction = [1, 1, 1]\",\n    e045, \"e045: direction = [0, 0, 1]\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#changing-the-angle_disp-parameter-and-using-a-custom-angle_deltas_fn-function","title":"Changing the angle_disp parameter and using a custom angle_deltas_fn function","text":"
seed = 321\n\n# Custom angle_deltas function: arbitrarily rotate some clusters by 90 degrees\ndef angdel_90_fn(nclu, astd, rng):\n    return rng.choice([0, np.pi / 2], size=nclu)\n
e046 = clugen(3, 6, 1000, [1, 0, 0], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\ne047 = clugen(3, 6, 1000, [1, 0, 0], np.pi / 8, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\ne048 = clugen(3, 6, 1000, [1, 0, 0], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed,\n    angle_deltas_fn=angdel_90_fn)\n
plt = plot_examples_3d(\n    e046, \"e046: angle_disp = 0\",\n    e047, \"e047: angle_disp = \u03c0 / 8\",\n    e048, \"e048: custom angle_deltas function\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#specifying-a-main-direction-for-each-cluster-and-changing-angle_disp","title":"Specifying a main direction for each cluster and changing angle_disp","text":"
seed = 123\n\n# Define a main direction for each cluster\ndirs = [[1, 1, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0], [-1, 1, 1]]\n
e049 = clugen(3, 5, 1000, dirs, 0, np.zeros(3), 20, 0, 0.2, proj_dist_fn=\"unif\", rng=seed)\ne050 = clugen(3, 5, 1000, dirs, np.pi / 12, np.zeros(3), 20, 0, 0.2, proj_dist_fn=\"unif\", rng=seed)\ne051 = clugen(3, 5, 1000, dirs, np.pi / 4, np.zeros(3), 20, 0, 0.2, proj_dist_fn=\"unif\", rng=seed)\n
plot_examples_3d(\n    e049, \"e049: angle_disp = 0\",\n    e050, \"e050: angle_disp = \u03c0 / 12\",\n    e051, \"e051: angle_disp = \u03c0 / 4\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#manipulating-the-length-of-cluster-supporting-lines","title":"Manipulating the length of cluster-supporting lines","text":""},{"location":"generated/gallery/plot_3_3d_examples/#using-the-llength-parameter","title":"Using the llength parameter","text":"
seed = 789\n
e052 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 0, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne053 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 10, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne054 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 30, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\n
plt = plot_examples_3d(\n    e052, \"e052: llength = 0\",\n    e053, \"e053: llength = 10\",\n    e054, \"e054: llength = 30\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#changing-the-llength_disp-parameter-and-using-a-custom-llengths_fn-function","title":"Changing the llength_disp parameter and using a custom llengths_fn function","text":"
seed = 765\n
# Custom llengths function: line lengths tend to grow for each new cluster\ndef llen_grow_fn(nclu, llen, llenstd, rng):\n    return llen * np.arange(nclu) + rng.normal(scale=llenstd, size=nclu)\n\ne055 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 15,  0.0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne056 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 15, 10.0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne057 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 10,  0.1, 0.5, rng=seed,\n    point_dist_fn=\"n\", llengths_fn=llen_grow_fn)\n
plt = plot_examples_3d(\n    e055, \"e055: llength_disp = 0.0\",\n    e056, \"e056: llength_disp = 10.0\",\n    e057, \"e057: custom llengths function\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#manipulating-relative-cluster-positions","title":"Manipulating relative cluster positions","text":""},{"location":"generated/gallery/plot_3_3d_examples/#using-the-cluster_sep-parameter","title":"Using the cluster_sep parameter","text":"
seed = 765\n
e058 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [30, 10, 10], 25, 4, 3, rng=seed)\ne059 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 30, 10], 25, 4, 3, rng=seed)\ne060 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 10, 30], 25, 4, 3, rng=seed)\n
plt = plot_examples_3d(\n    e058, \"e058: cluster_sep = [30, 10, 10]\",\n    e059, \"e059: cluster_sep = [10, 30, 10]\",\n    e060, \"e060: cluster_sep = [10, 10, 30]\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#changing-the-cluster_offset-parameter-and-using-a-custom-clucenters_fn-function","title":"Changing the cluster_offset parameter and using a custom clucenters_fn function","text":"
# Custom clucenters function: places clusters in a diagonal\ndef centers_diag_fn(nclu, csep, coff, rng):\n    return np.ones((nclu, len(csep))) * np.arange(1, nclu + 1)[:, None] * np.max(csep) + coff\n\ne061 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 10, 10], 12, 3, 2.5, rng=seed)\ne062 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 10, 10], 12, 3, 2.5, rng=seed,\n    cluster_offset=[30, -30, 30])\ne063 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 10, 10], 12, 3, 2.5, rng=seed,\n    cluster_offset=[-40, -40, -40], clucenters_fn=centers_diag_fn)\n
plt = plot_examples_3d(\n    e061, \"e061: default\",\n    e062, \"e062: cluster_offset=[30, -30, 30]\",\n    e063, \"e063: custom clucenters function\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#lateral-dispersion-and-placement-of-point-projections-on-the-line","title":"Lateral dispersion and placement of point projections on the line","text":""},{"location":"generated/gallery/plot_3_3d_examples/#normal-projection-placement-default-proj_dist_fnnorm","title":"Normal projection placement (default): proj_dist_fn=\"norm\"","text":"
seed = 246\n
e064 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 0.0, rng=seed)\ne065 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 1.0, rng=seed)\ne066 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 3.0, rng=seed)\n
plt = plot_examples_3d(\n    e064, \"e064: lateral_disp = 0\",\n    e065, \"e065: lateral_disp = 1\",\n    e066, \"e066: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#uniform-projection-placement-proj_dist_fnunif","title":"Uniform projection placement: proj_dist_fn=\"unif\"","text":"
seed = 246\n
e067 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 0.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne068 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 1.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne069 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 3.0, rng=seed,\n    proj_dist_fn=\"unif\")\n
plt = plot_examples_3d(\n    e067, \"e067: lateral_disp = 0\",\n    e068, \"e068: lateral_disp = 1\",\n    e069, \"e069: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#custom-projection-placement-using-the-laplace-distribution","title":"Custom projection placement using the Laplace distribution","text":"
seed = 246\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e070 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 0.0, rng=seed,\n    proj_dist_fn=proj_laplace)\ne071 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 1.0, rng=seed,\n    proj_dist_fn=proj_laplace)\ne072 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 3.0, rng=seed,\n    proj_dist_fn=proj_laplace)\n
plt = plot_examples_3d(\n    e070, \"e070: lateral_disp = 0\",\n    e071, \"e071: lateral_disp = 1\",\n    e072, \"e072: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#controlling-final-point-positions-from-their-projections-on-the-cluster-supporting-line","title":"Controlling final point positions from their projections on the cluster-supporting line","text":""},{"location":"generated/gallery/plot_3_3d_examples/#points-on-hyperplane-orthogonal-to-cluster-supporting-line-default-point_dist_fnn-1","title":"Points on hyperplane orthogonal to cluster-supporting line (default): point_dist_fn=\"n-1\"","text":"
seed = 840\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e073 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed)\ne074 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    proj_dist_fn=\"unif\")\ne075 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    proj_dist_fn=proj_laplace)\n
plt = plot_examples_3d(\n    e073, \"e073: proj_dist_fn=\\\"norm\\\" (default)\",\n    e074, \"e074: proj_dist_fn=\\\"unif\\\"\",\n    e075, \"e075: custom proj_dist_fn (Laplace)\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#points-around-projection-on-cluster-supporting-line-point_dist_fnn","title":"Points around projection on cluster-supporting line: point_dist_fn=\"n\"","text":"
seed = 840\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n\ne076 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=\"n\")\ne077 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=\"n\", proj_dist_fn=\"unif\")\ne078 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=\"n\", proj_dist_fn=proj_laplace)\n
plt = plot_examples_3d(\n    e076, \"e076: proj_dist_fn=\\\"norm\\\" (default)\",\n    e077, \"e077: proj_dist_fn=\\\"unif\\\"\",\n    e078, \"e078: custom proj_dist_fn (Laplace)\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#custom-point-placement-using-the-exponential-distribution","title":"Custom point placement using the exponential distribution","text":"

For this example we require the clupoints_n_1_template() helper function:

from pyclugen import clupoints_n_1_template\n
seed = 840\n
# Custom point_dist_fn: final points placed using the Exponential distribution\ndef clupoints_n_1_exp(projs, lat_std, len, clu_dir, clu_ctr, rng):\n    def dist_exp(npts, lstd, rg):\n        return lstd * rg.exponential(scale=2 / lstd, size=npts)\n    return clupoints_n_1_template(projs, lat_std, clu_dir, dist_exp, rng=rng)\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e079 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=clupoints_n_1_exp)\ne080 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=clupoints_n_1_exp, proj_dist_fn=\"unif\")\ne081 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=clupoints_n_1_exp, proj_dist_fn=proj_laplace)\n
plt = plot_examples_3d(\n    e079, \"e079: proj_dist_fn=\\\"norm\\\" (default)\",\n    e080, \"e080: proj_dist_fn=\\\"unif\\\"\",\n    e081, \"e081: custom proj_dist_fn (Laplace)\")\n

"},{"location":"generated/gallery/plot_3_3d_examples/#manipulating-cluster-sizes","title":"Manipulating cluster sizes","text":"
seed = 555\n
# Custom clusizes_fn (e083): cluster sizes determined via the uniform distribution,\n# no correction for total points\ndef clusizes_unif(nclu, npts, ae, rng):\n    return rng.integers(low=1, high=2 * npts / nclu + 1, size=nclu)\n
# Custom clusizes_fn (e084): clusters all have the same size, no correction for total points\ndef clusizes_equal(nclu, npts, ae, rng):\n    return (npts // nclu) * np.ones(nclu, dtype=int)\n
# Custom clucenters_fn (all): yields fixed positions for the clusters\ndef centers_fixed(nclu, csep, coff, rng):\n    return np.array([\n        [-csep[0], -csep[1], -csep[2]],\n        [csep[0], -csep[1], -csep[2]],\n        [-csep[0], csep[1], csep[2]],\n        [csep[0], csep[1], csep[2]]])\n
e082 = clugen(3, 4, 1500, [1, 1, 1], np.pi, [20, 20, 20], 0, 0, 5, rng=seed,\n    clucenters_fn=centers_fixed, point_dist_fn=\"n\")\ne083 = clugen(3, 4, 1500, [1, 1, 1], np.pi, [20, 20, 20], 0, 0, 5, rng=seed,\n    clucenters_fn=centers_fixed, clusizes_fn=clusizes_unif, point_dist_fn=\"n\")\ne084 = clugen(3, 4, 1500, [1, 1, 1], np.pi, [20, 20, 20], 0, 0, 5, rng=seed,\n    clucenters_fn=centers_fixed, clusizes_fn=clusizes_equal, point_dist_fn=\"n\")\n
plt = plot_examples_3d(\n    e082, \"e082: normal dist. (default)\",\n    e083, \"e083: unif. dist. (custom)\",\n    e084, \"e084: equal size (custom)\")\n

Total running time of the script: ( 0 minutes 5.237 seconds)

Download Python source code: plot_3_3d_examples.py

Download Jupyter notebook: plot_3_3d_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_4_nd_examples/","title":"Examples in nD","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_4_nd_examples/#examples-in-nd","title":"Examples in nD","text":"

This section contains several examples on how to generate nD (n > 3) data with pyclugen. To run the examples we first need to import the clugen() function:

import numpy as np\nfrom pyclugen import clugen\n

To plot these examples we use the plot_examples_nd function:

from plot_functions import plot_examples_nd\n
"},{"location":"generated/gallery/plot_4_nd_examples/#5d-example-with-default-optional-arguments","title":"5D example with default optional arguments","text":"
seed = 123\n
# Number of dimensions\nnd = 5\n
e085 = clugen(nd, 6, 1500, [1, 1, 0.5, 0, 0], np.pi / 16, 30 * np.ones(nd), 30, 4, 3, rng=seed)\n
plot_examples_nd(e085, \"e085: 5D with optional parameters set to defaults\")\n
"},{"location":"generated/gallery/plot_4_nd_examples/#5d-example-with-proj_dist_fn-unif-and-point_dist_fn-n","title":"5D example with proj_dist_fn = \"unif\" and point_dist_fn = \"n\"","text":"
seed = 579\n
# Number of dimensions\nnd = 5\n
e086 = clugen(nd, 6, 1500, [0.1, 0.3, 0.5, 0.3, 0.1], np.pi / 12, 30 * np.ones(nd), 35, 5, 3.5,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\", rng=seed)\n
plot_examples_nd(e086, \"e086: 5D with proj_dist_fn=\\\"unif\\\" and point_dist_fn=\\\"n\\\"\")\n
"},{"location":"generated/gallery/plot_4_nd_examples/#4d-example-with-custom-projection-placement-using-the-beta-distribution","title":"4D example with custom projection placement using the Beta distribution","text":"
seed = 963\n
# Number of dimensions\nnd = 4\n
# Custom proj_dist_fn: point projections placed using the Beta distribution\ndef proj_beta(len, n, rng):\n    return len * rng.beta(0.1, 0.1, size=n) - len / 2\n
e087 = clugen(nd, 5, 1500, np.ones(nd), np.pi / 6, 30 * np.ones(nd), 60, 15, 6, rng=seed,\n    proj_dist_fn=proj_beta)\n
plot_examples_nd(e087, \"e087: 4D with custom proj_dist_fn (Beta)\")\n

Total running time of the script: ( 0 minutes 6.376 seconds)

Download Python source code: plot_4_nd_examples.py

Download Jupyter notebook: plot_4_nd_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_5_mrg_examples/","title":"Merging and hierarchical cluster examples","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_5_mrg_examples/#merging-and-hierarchical-cluster-examples","title":"Merging and hierarchical cluster examples","text":"

This section contains several examples on how to merge cluster data, either generated with pyclugen or from other sources. To run the examples we first need to import the clugen() and clumerge() functions:

import numpy as np\nfrom pyclugen import clugen, clumerge\n

Although it is possible to merge data in any dimension, these examples will focus on merging 2D data. Therefore, we'll use the same plot_examples_2d function used for the 2D examples:

from plot_functions import plot_examples_2d\n
"},{"location":"generated/gallery/plot_5_mrg_examples/#merging-two-data-sets-generated-with-clugen","title":"Merging two data sets generated with clugen()","text":"
seed1 = 444\nseed2 = 555\n
e088 = clugen(2, 5, 1000, [1, 1], np.pi / 12, [20, 20], 14, 1.2, 1.5, rng=seed1,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\")\ne089 = clugen(2, 3, 1500, [1, 0], 0.05, [20, 20], 0, 0, 4, rng=seed2,\n    point_dist_fn=\"n\", cluster_offset = [20, 0])\ne090 = clumerge(e088, e089)\n
plot_examples_2d(\n    e088, \"e088: data set 1\",\n    e089, \"e089: data set 2\",\n    e090, \"e090: merged data sets\")\n

In the previous example, clusters from individual data sets remain as separate clusters in the merged data set. It's also possible to maintain the original cluster labels by setting the clusters_field parameter to None:

e091 = clumerge(e088, e089, clusters_field=None)\n
plot_examples_2d(\n    e088, \"e088: data set 1\",\n    e089, \"e089: data set 2\",\n    e091, \"e091: merged data sets\")\n

"},{"location":"generated/gallery/plot_5_mrg_examples/#adding-noise-to-a-clugen-generated-data-set","title":"Adding noise to a clugen()-generated data set","text":"
seed = 333\n
prng = np.random.default_rng(seed)\ne092 = {\"points\": 120 * prng.random((500, 2)) - 60, \"clusters\": np.ones(500, dtype=np.int32)}\ne093 = clumerge(e092, e090) # clumerge(e092, e088, e089) would also work\n
plot_examples_2d(\n    e090, \"e090: original merged data sets\",\n    e092, \"e092: random uniform noise\",\n    e093, \"e093: data sets with noise\",\n    pmargin=0)\n
"},{"location":"generated/gallery/plot_5_mrg_examples/#merging-with-data-not-generated-with-clugen","title":"Merging with data not generated with clugen()","text":"

Data generated with clugen() can be merged with other data sets, for example data created with one of scikit-learn's generators:

seed = 321\n
from sklearn.datasets import make_moons\n\nX, y = make_moons(100, noise=0.05, random_state=seed)\n\ne094 = {\"points\": X, \"clusters\": y}\ne095 = clugen(2, 4, 200, [1, 1], np.pi / 12, [1, 1], 0.1, 0.01, 0.25, rng=seed,\n    proj_dist_fn = \"unif\", point_dist_fn = \"n\")\ne096 = clumerge(e094, e095)\n
plt = plot_examples_2d(\n    e094, \"e094: generated w/ make_moons()\",\n    e095, \"e095: generated w/ clugen()\",\n    e096, \"e096: merged data\")\n

We can also hierarchize clusters from different sources:

e097 = {**e094, \"hclusters\": np.ones(100, dtype=np.int32)}\ne098 = {**e095._asdict(), \"hclusters\": 2 * np.ones(200, np.int32)}\ne099 = clumerge(e097, e098, clusters_field=\"hclusters\")\n
plt = plot_examples_2d(\n    e097, \"e097: generated w/ make_moons()\",\n    e098, \"e098: generated w/ clugen()\",\n    e099, \"e099: merged data\",\n    clusters_field=\"hclusters\")\n

Total running time of the script: ( 0 minutes 3.156 seconds)

Download Python source code: plot_5_mrg_examples.py

Download Jupyter notebook: plot_5_mrg_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_functions/","title":"Plot functions","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_functions/#plot-functions","title":"Plot functions","text":"

Several auxiliary functions for plotting the examples in this documentation.

"},{"location":"generated/gallery/plot_functions/#import-the-required-libraries","title":"Import the required libraries","text":"
import os\nimport warnings\n\nimport matplotlib.pyplot as plt  # type: ignore\nimport numpy as np\nimport numpy.typing as npt\nimport pandas as pd\nimport seaborn as sns  # type: ignore\n\nfrom pyclugen import Clusters\n\n# Hide annoying warnings when building docs in CI\nif os.getenv(\"CI\") != None:\n    warnings.filterwarnings(\"ignore\")\n
"},{"location":"generated/gallery/plot_functions/#clusters2df","title":"clusters2df","text":"
def clusters2df(\n    *exs: Clusters | dict[str, npt.ArrayLike], clusters_field: str = \"clusters\"\n) -> pd.DataFrame:\n    \"\"\"Convert a sequence of clusters to a Pandas dataframe.\"\"\"\n\n    dfs = []\n    iex = 1\n\n    for ex in exs:\n        if isinstance(ex, dict):\n            points = ex[\"points\"]\n            clusters = ex[clusters_field]\n        else:\n            points = ex.points\n            clusters = ex.clusters\n\n        df = pd.DataFrame(\n            data=points, columns=[f\"x{i}\" for i in range(np.size(points, 1))]\n        )\n        df[\"cluster\"] = clusters.tolist()\n        df[\"example\"] = [iex] * clusters.size\n        dfs.append(df)\n        iex += 1\n\n    return pd.concat(dfs, ignore_index=True)\n
"},{"location":"generated/gallery/plot_functions/#get_plot_lims","title":"get_plot_lims","text":"
def get_plot_lims(df: pd.DataFrame, pmargin: float = 0.1):\n    \"\"\"Determine the plot limits for the cluster data given in `df`.\"\"\"\n\n    # Get maximum and minimum points in each dimension\n    xmaxs = df.iloc[:, :-2].max()\n    xmins = df.iloc[:, :-2].min()\n\n    # Determine plot centers in each dimension\n    xcenters = (xmaxs + xmins) / 2\n\n    # Determine plots span for all dimensions\n    sidespan = (1 + pmargin) * np.max(np.abs(xmaxs - xmins)) / 2\n\n    # Determine final plots limits\n    xmaxs = xcenters + sidespan\n    xmins = xcenters - sidespan\n\n    return xmaxs, xmins\n
"},{"location":"generated/gallery/plot_functions/#plot_examples_1d","title":"plot_examples_1d","text":"
def plot_examples_1d(*ets, ncols: int = 3, clusters_field: str = \"clusters\"):\n    \"\"\"Plot the 1D examples given in the ets parameter.\"\"\"\n\n    # Get examples\n    ex = ets[0::2]\n    # Get titles\n    et = ets[1::2]\n\n    df = clusters2df(*ex, clusters_field=clusters_field)\n\n    # Set seaborn's dark grid style\n    sns.set_theme(style=\"darkgrid\")\n\n    # Use seaborn to create the plots\n    g = sns.FacetGrid(df, col=\"example\", hue=\"cluster\", col_wrap=ncols)\n\n    # Plot the kernel density estimation plots\n    g.map(sns.kdeplot, \"x0\", multiple=\"layer\", fill=True)\n\n    # Get a flattened view of the axes array\n    g_axes = g.axes.reshape(-1)\n\n    # Determine the height of the rugs in the rug plot to 5% of total height\n    rug_height = g_axes[0].get_ylim()[1] * 0.05\n\n    # Plot the rug markers below the kde plots\n    g.map(sns.rugplot, \"x0\", height=rug_height)\n\n    # Set titles\n    for ax, t in zip(g_axes, et):\n        ax.set_title(t)\n
"},{"location":"generated/gallery/plot_functions/#plot_examples_2d","title":"plot_examples_2d","text":"
def plot_examples_2d(\n    *ets, pmargin: float = 0.1, ncols: int = 3, clusters_field: str = \"clusters\"\n):\n    \"\"\"Plot the 2D examples given in the ets parameter.\"\"\"\n\n    # Get examples\n    ex = ets[0::2]\n    # Get titles\n    et = ets[1::2]\n\n    df = clusters2df(*ex, clusters_field=clusters_field)\n\n    # Get limits in each dimension\n    xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)\n\n    # Set seaborn's dark grid style\n    sns.set_theme(style=\"darkgrid\")\n\n    # Use seaborn to create the plots\n    g = sns.FacetGrid(\n        df,\n        col=\"example\",\n        hue=\"cluster\",\n        xlim=(xmins.iloc[0], xmaxs.iloc[0]),\n        ylim=(xmins.iloc[1], xmaxs.iloc[1]),\n        aspect=1,\n        col_wrap=ncols,\n    )\n\n    g.map(sns.scatterplot, \"x0\", \"x1\", s=10)\n\n    # Set the plot titles and x, y labels\n    for ax, t in zip(g.axes, et):\n        ax.set_title(t)\n        ax.set_xlabel(\"x\")\n        ax.set_ylabel(\"y\")\n
"},{"location":"generated/gallery/plot_functions/#plot_examples_3d","title":"plot_examples_3d","text":"
def plot_examples_3d(\n    *ets,\n    pmargin: float = 0.1,\n    ncols: int = 3,\n    side=350,\n    clusters_field: str = \"clusters\",\n):\n    \"\"\"Plot the 3D examples given in the ets parameter.\"\"\"\n\n    # Get examples\n    ex = ets[0::2]\n    # Get titles\n    et = ets[1::2]\n\n    # Number of plots and number of rows in combined plot\n    num_plots = len(ex)\n    nrows = max(1, int(np.ceil(num_plots / ncols)))\n    blank_plots = nrows * ncols - num_plots\n\n    df = clusters2df(*ex, clusters_field=clusters_field)\n\n    # Get limits in each dimension\n    xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)\n\n    # Reset to default Matplotlib style, to avoid seaborn interference\n    sns.reset_orig()\n\n    # To convert inches to pixels afterwards\n    px = 1 / plt.rcParams[\"figure.dpi\"]  # pixel in inches\n\n    # Use Matplotlib to create the plots\n    _, axs = plt.subplots(\n        nrows,\n        ncols,\n        figsize=(side * px * ncols, side * px * nrows),\n        subplot_kw=dict(projection=\"3d\"),\n    )\n    axs = axs.reshape(-1)\n    for ax, e, t in zip(axs, ex, et):\n        ax.set_title(t, fontsize=10)\n        ax.set_xlim(xmins.iloc[0], xmaxs.iloc[0])\n        ax.set_ylim(xmins.iloc[1], xmaxs.iloc[1])\n        ax.set_zlim(xmins.iloc[2], xmaxs.iloc[2])\n        ax.set_xlabel(\"$x$\", labelpad=-2)\n        ax.set_ylabel(\"$y$\", labelpad=-2)\n        ax.set_zlabel(\"$z$\", labelpad=-2)\n        ax.tick_params(labelsize=8, pad=-2)\n        ax.scatter(\n            e.points[:, 0],\n            e.points[:, 1],\n            e.points[:, 2],\n            c=e.clusters,\n            depthshade=False,\n            edgecolor=\"black\",\n            linewidths=0.2,\n        )\n\n    # Remaining plots are left blank\n    for ax in axs[len(ex) : len(ex) + blank_plots]:\n        ax.set_axis_off()\n        ax.set_facecolor(color=\"white\")\n        ax.patch.set_alpha(0)\n
"},{"location":"generated/gallery/plot_functions/#plot_examples_nd","title":"plot_examples_nd","text":"
def plot_examples_nd(\n    ex: Clusters, t: str, pmargin: float = 0.1, clusters_field: str = \"clusters\"\n):\n    \"\"\"Plot the nD example given in the ex parameter.\"\"\"\n\n    # How many dimensions?\n    nd = ex.points.shape[1]\n\n    df = clusters2df(ex, clusters_field=clusters_field)\n\n    # Get limits in each dimension\n    xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)\n\n    # Set seaborn's dark grid style\n    sns.set_theme(style=\"darkgrid\")\n\n    # Create pairwise plots with nothing on the diagonal\n    g = sns.PairGrid(df.iloc[:, :-1], hue=\"cluster\", palette=\"deep\")\n    g.map_offdiag(sns.scatterplot, s=10)\n    g.figure.suptitle(t, y=1)\n\n    # Decorate plot\n    for i in range(nd):\n        for j in range(nd):\n            if i == j:\n                # Set the x labels in the diagonal plots\n                xycoord = (xmaxs.iloc[i] + xmins.iloc[i]) / 2\n                g.axes[i, i].text(\n                    xycoord, xycoord, f\"$x{i}$\", fontsize=20, ha=\"center\", va=\"center\"\n                )\n            else:\n                # Set appropriate plot intervals and aspect ratio\n                g.axes[i, j].set_xlim([xmins.iloc[j], xmaxs.iloc[j]])\n                g.axes[i, j].set_ylim([xmins.iloc[i], xmaxs.iloc[i]])\n                g.axes[i, j].set_aspect(1)\n

Total running time of the script: ( 0 minutes 0.005 seconds)

Download Python source code: plot_functions.py

Download Jupyter notebook: plot_functions.ipynb

Gallery generated by mkdocs-gallery

"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"pyclugen","text":"

pyclugen is a Python package for generating multidimensional clusters. Each cluster is supported by a line segment, the position, orientation and length of which guide where the respective points are placed. The clugen() function is provided for this purpose, as well as a number of auxiliary functions, used internally and modularly by clugen(). Users can swap these auxiliary functions by their own customized versions, fine-tuning their cluster generation strategies, or even use them as the basis for their own generation algorithms.

"},{"location":"#installation","title":"Installation","text":"

Install from PyPI:

pip install --upgrade pip\npip install pyclugen\n

Or directly from GitHub:

pip install --upgrade pip\npip install git+https://github.com/clugen/pyclugen.git#egg=pyclugen\n
"},{"location":"#quick-start","title":"Quick start","text":"
from pyclugen import clugen\nimport matplotlib.pyplot as plt\n
out2 = clugen(2, 4, 400, [1, 0], 0.4, [50, 10], 20, 1, 2)\nplt.scatter(out2.points[:, 0], out2.points[:, 1], c=out2.clusters)\nplt.show()\n
out3 = clugen(3, 5, 10000, [0.5, 0.5, 0.5], 0.2, [10, 10, 10], 10, 1, 2)\nfig = plt.figure()\nax = fig.add_subplot(projection=\"3d\")\nax.scatter(out3.points[:, 0], out3.points[:, 1], out3.points[:, 2], c=out3.clusters)\nplt.show()\n
"},{"location":"#further-reading","title":"Further reading","text":"

The clugen algorithm and its several implementations are detailed in the following reference (please cite it if you use this software):

  • Fachada, N. & de Andrade, D. (2023). Generating multidimensional clusters with support lines. Knowledge-Based Systems, 277, 110836. https://doi.org/10.1016/j.knosys.2023.110836 (arXiv preprint)
"},{"location":"#also-in-this-documentation","title":"Also in this documentation","text":"
  • Theory: the clugen algorithm in detail
  • Detailed usage examples
  • Reference
  • Developing this package
"},{"location":"dev/","title":"Development","text":""},{"location":"dev/#installing-for-development-andor-improving-the-package","title":"Installing for development and/or improving the package","text":"
$ git clone https://github.com/clugen/pyclugen.git\n$ cd pyclugen\n$ python -m venv env\n$ source env/bin/activate\n$ pip install -e .[dev]\n$ pre-commit install\n

On Windows replace source env/bin/activate with . env\\Scripts\\activate.

"},{"location":"dev/#run-tests","title":"Run tests","text":"

Tests can be executed with the following command:

$ pytest\n

The previous command runs the tests at normal level by default. This test level can also be specified explicitly:

$ pytest --test-level=normal\n

There are four test levels, from fastest to slowest (i.e., from less thorough to more exhaustive): fast, ci, normal and full. The fast level tests all functions using typical parameters, just to check if everything is working. The ci level performs the minimal amount of testing that yields complete test coverage. Beyond complete coverage, the normal and full levels also test increasing combinations of parameters and PRNG seeds, which may be important to root out rare corner cases. Note that the full level can be extremely slow.

To generate a test coverage report, run pytest as follows:

$ pytest --cov=pyclugen --cov-report=html --test-level=ci\n
"},{"location":"dev/#build-docs","title":"Build docs","text":"

Considering we're in the pyclugen folder, run the following commands:

$ cd docs\n$ mkdocs build\n

The generated documentation will be placed in docs/site. Alternatively, the documentation can be generated and served locally with:

$ mkdocs serve\n
"},{"location":"dev/#code-style","title":"Code style","text":"

Code style is enforced with flake8 (and a number of plugins), black, and isort. Some highlights include, but are not limited to:

  • Encoding: UTF-8
  • Indentation: 4 spaces (no tabs)
  • Line size limit: 88 chars
  • Newlines: Unix style, i.e. LF or \\n
"},{"location":"reference/","title":"Reference","text":"

Various functions for multidimensional cluster generation in Python.

Note that:

  1. clugen() is the main function of the pyclugen package, and possibly the only function most users will need.
  2. Functions which accept rng as the last parameter are stochastic. Thus, in order to obtain the same result on separate invocations of these functions, pass them an instance of the same pseudo-random number Generator initialized with the same seed.
"},{"location":"reference/#pyclugen.Clusters","title":"Clusters","text":"

Bases: NamedTuple

Read-only container for results returned by clugen().

The symbols presented in the instance variables below have the following meanings:

  • \\(n\\) : Number of dimensions.
  • \\(p\\) : Number of points.
  • \\(c\\) : Number of clusters.
Source code in pyclugen/main.py
class Clusters(NamedTuple):\n    r\"\"\"Read-only container for results returned by [`clugen()`][pyclugen.main.clugen].\n\n    The symbols presented in the instances variable below have the following\n    meanings:\n\n    - $n$ : Number of dimensions.\n    - $p$ : Number of points.\n    - $c$ : Number of clusters.\n    \"\"\"\n\n    points: NDArray\n    r\"\"\"$p \\times n$ matrix containing the generated points for all clusters.\"\"\"\n\n    clusters: NDArray\n    r\"\"\"Vector of size $p$ indicating the cluster each point in `points`\n    belongs to.\"\"\"\n\n    projections: NDArray\n    r\"\"\"$p \\times n$ matrix with the point projections on the cluster-supporting\n    lines.\"\"\"\n\n    sizes: NDArray\n    r\"\"\"Vector of size $c$ with the number of points in each cluster.\"\"\"\n\n    centers: NDArray\n    r\"\"\"$c \\times n$ matrix with the coordinates of the cluster centers.\"\"\"\n\n    directions: NDArray\n    r\"\"\"$c \\times n$ matrix with the direction of each cluster-supporting line.\"\"\"\n\n    angles: NDArray\n    r\"\"\"Vector of size $c$ with the angles between the cluster-supporting lines and\n    the main direction.\"\"\"\n\n    lengths: NDArray\n    r\"\"\"Vector of size $c$ with the lengths of the cluster-supporting lines.\"\"\"\n
"},{"location":"reference/#pyclugen.Clusters.angles","title":"angles instance-attribute","text":"
angles: NDArray\n

Vector of size \\(c\\) with the angles between the cluster-supporting lines and the main direction.

"},{"location":"reference/#pyclugen.Clusters.centers","title":"centers instance-attribute","text":"
centers: NDArray\n

\\(c \\times n\\) matrix with the coordinates of the cluster centers.

"},{"location":"reference/#pyclugen.Clusters.clusters","title":"clusters instance-attribute","text":"
clusters: NDArray\n

Vector of size \\(p\\) indicating the cluster each point in points belongs to.

"},{"location":"reference/#pyclugen.Clusters.directions","title":"directions instance-attribute","text":"
directions: NDArray\n

\\(c \\times n\\) matrix with the direction of each cluster-supporting line.

"},{"location":"reference/#pyclugen.Clusters.lengths","title":"lengths instance-attribute","text":"
lengths: NDArray\n

Vector of size \\(c\\) with the lengths of the cluster-supporting lines.

"},{"location":"reference/#pyclugen.Clusters.points","title":"points instance-attribute","text":"
points: NDArray\n

\\(p \\times n\\) matrix containing the generated points for all clusters.

"},{"location":"reference/#pyclugen.Clusters.projections","title":"projections instance-attribute","text":"
projections: NDArray\n

\\(p \\times n\\) matrix with the point projections on the cluster-supporting lines.

"},{"location":"reference/#pyclugen.Clusters.sizes","title":"sizes instance-attribute","text":"
sizes: NDArray\n

Vector of size \\(c\\) with the number of points in each cluster.

"},{"location":"reference/#pyclugen.angle_btw","title":"angle_btw","text":"
angle_btw(v1: NDArray, v2: NDArray) -> float\n

Angle between two \\(n\\)-dimensional vectors.

Typically, the angle between two vectors v1 and v2 can be obtained with:

arccos(dot(v1, v2) / (norm(v1) * norm(v2)))\n

However, this approach is numerically unstable. The version provided here is numerically stable and based on the AngleBetweenVectors Julia package by Jeffrey Sarnoff (MIT license), implementing an algorithm provided by Prof. W. Kahan in these notes (see page 15).

Examples:

>>> from numpy import array, degrees\n>>> from pyclugen import angle_btw\n>>> v1 = array([1.0, 1.0, 1.0, 1.0])\n>>> v2 = array([1.0, 0.0, 0.0, 0.0])\n>>> degrees(angle_btw(v1, v2))\n60.00000000000001\n

Parameters:

Name Type Description Default v1 NDArray

First vector.

required v2 NDArray

Second vector.

required

Returns:

Type Description float

Angle between v1 and v2 in radians.

Source code in pyclugen/helper.py
def angle_btw(v1: NDArray, v2: NDArray) -> float:\n    r\"\"\"Angle between two $n$-dimensional vectors.\n\n    Typically, the angle between two vectors `v1` and `v2` can be obtained with:\n\n    ```python\n    arccos(dot(u, v) / (norm(u) * norm(v)))\n    ```\n\n    However, this approach is numerically unstable. The version provided here is\n    numerically stable and based on the\n    [AngleBetweenVectors](https://github.com/JeffreySarnoff/AngleBetweenVectors.jl)\n    Julia package by Jeffrey Sarnoff (MIT license), implementing an algorithm\n    provided by Prof. W. Kahan in\n    [these notes](https://people.eecs.berkeley.edu/~wkahan/MathH110/Cross.pdf)\n    (see page 15).\n\n    Examples:\n        >>> from numpy import array, degrees\n        >>> from pyclugen import angle_btw\n        >>> v1 = array([1.0, 1.0, 1.0, 1.0])\n        >>> v2 = array([1.0, 0.0, 0.0, 0.0])\n        >>> degrees(angle_btw(v1, v2))\n        60.00000000000001\n\n    Args:\n      v1: First vector.\n      v2: Second vector.\n\n    Returns:\n      Angle between `v1` and `v2` in radians.\n    \"\"\"\n    u1 = v1 / norm(v1)\n    u2 = v2 / norm(v2)\n\n    y = u1 - u2\n    x = u1 + u2\n\n    return 2 * arctan(norm(y) / norm(x))\n
"},{"location":"reference/#pyclugen.angle_deltas","title":"angle_deltas","text":"
angle_deltas(\n    num_clusters: int, angle_disp: float, rng: Generator = _default_rng\n) -> NDArray\n

Get angles between average cluster direction and cluster-supporting lines.

Determine the angles between the average cluster direction and the cluster-supporting lines. These angles are obtained from a wrapped normal distribution ( \\(\\mu=0\\), \\(\\sigma=\\)angle_disp) with support in the interval \\(\\left[-\\pi/2,\\pi/2\\right]\\). Note this is different from the standard wrapped normal distribution, the support of which is given by the interval \\(\\left[-\\pi,\\pi\\right]\\).

Examples:

>>> from pyclugen import angle_deltas\n>>> from numpy import degrees, pi\n>>> from numpy.random import Generator, PCG64\n>>> prng = Generator(PCG64(123))\n>>> a_rad = angle_deltas(4, pi/8, rng=prng) # Angle dispersion of 22.5 degrees\n>>> a_rad\narray([-0.38842705, -0.14442948,  0.50576707,  0.07617358])\n>>> degrees(a_rad) # Show angle deltas in degrees\narray([-22.25523038,  -8.27519966,  28.97831838,   4.36442443])\n

Parameters:

Name Type Description Default num_clusters int

Number of clusters.

required angle_disp float

Angle dispersion, in radians.

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Angles between the average cluster direction and the cluster-supporting lines, given in radians in the interval \\(\\left[-\\pi/2,\\pi/2\\right]\\).

Source code in pyclugen/module.py
def angle_deltas(\n    num_clusters: int, angle_disp: float, rng: Generator = _default_rng\n) -> NDArray:\n    r\"\"\"Get angles between average cluster direction and cluster-supporting lines.\n\n    Determine the angles between the average cluster direction and the\n    cluster-supporting lines. These angles are obtained from a wrapped normal\n    distribution ( $\\mu=0$, $\\sigma=$`angle_disp`) with support in the interval\n    $\\left[-\\pi/2,\\pi/2\\right]$. Note this is different from the standard\n    wrapped normal distribution, the support of which is given by the interval\n    $\\left[-\\pi,\\pi\\right]$.\n\n    Examples:\n        >>> from pyclugen import angle_deltas\n        >>> from numpy import degrees, pi\n        >>> from numpy.random import Generator, PCG64\n        >>> prng = Generator(PCG64(123))\n        >>> a_rad = angle_deltas(4, pi/8, rng=prng) # Angle dispersion of 22.5 degrees\n        >>> a_rad\n        array([-0.38842705, -0.14442948,  0.50576707,  0.07617358])\n        >>> degrees(a_rad) # Show angle deltas in degrees\n        array([-22.25523038,  -8.27519966,  28.97831838,   4.36442443])\n\n    Args:\n      num_clusters: Number of clusters.\n      angle_disp: Angle dispersion, in radians.\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Angles between the average cluster direction and the cluster-supporting\n        lines, given in radians in the interval $\\left[-\\pi/2,\\pi/2\\right]$.\n    \"\"\"\n    # Get random angle differences using the normal distribution\n    angles = angle_disp * rng.normal(size=num_clusters)\n\n    # Reduce angle differences to the interval [-\u03c0, \u03c0]\n    angles = arctan2(sin(angles), cos(angles))\n\n    # Make sure angle differences are within interval [-\u03c0/2, \u03c0/2]\n    return where(abs(angles) > pi / 2, angles - sign(angles) * pi / 2, angles)\n
"},{"location":"reference/#pyclugen.clucenters","title":"clucenters","text":"
clucenters(\n    num_clusters: int,\n    clu_sep: NDArray,\n    clu_offset: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Determine cluster centers using the uniform distribution.

The number of clusters (num_clusters) and the average cluster separation (clu_sep) are taken into account.

More specifically, let \\(c=\\)num_clusters, \\(\\mathbf{s}=\\)clu_sep.reshape(-1,1), \\(\\mathbf{o}=\\)clu_offset.reshape(-1,1), \\(n=\\)clu_sep.size (i.e., number of dimensions). Cluster centers are obtained according to the following equation:

\\[ \\mathbf{C}=c\\mathbf{U} \\cdot \\operatorname{diag}(\\mathbf{s}) + \\mathbf{1}\\,\\mathbf{o}^T \\]

where \\(\\mathbf{C}\\) is the \\(c \\times n\\) matrix of cluster centers, \\(\\mathbf{U}\\) is a \\(c \\times n\\) matrix of random values drawn from the uniform distribution between -0.5 and 0.5, and \\(\\mathbf{1}\\) is a \\(c \\times 1\\) vector with all entries equal to 1.

Examples:

>>> from pyclugen import clucenters\n>>> from numpy import array\n>>> from numpy.random import Generator, PCG64\n>>> prng = Generator(PCG64(123))\n>>> clucenters(3, array([30,10]), array([-50,50]), rng=prng)\narray([[-33.58833231,  36.61463056],\n       [-75.16761145,  40.53115432],\n       [-79.1684689 ,  59.3628352 ]])\n

Parameters:

Name Type Description Default num_clusters int

Number of clusters.

required clu_sep NDArray

Average cluster separation ( \\(n \\times 1\\) vector).

required clu_offset NDArray

Cluster offsets ( \\(n \\times 1\\) vector).

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

A \\(c \\times n\\) matrix containing the cluster centers.

Source code in pyclugen/module.py
def clucenters(\n    num_clusters: int,\n    clu_sep: NDArray,\n    clu_offset: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Determine cluster centers using the uniform distribution.\n\n    The number of clusters (`num_clusters`) and the average cluster separation\n    (`clu_sep`) are taken into account.\n\n    More specifically, let $c=$`num_clusters`, $\\mathbf{s}=$`clu_sep.reshape(-1,1)`,\n    $\\mathbf{o}=$`clu_offset.reshape(-1,1)`, $n=$`clu_sep.size` (i.e., number of\n    dimensions). Cluster centers are obtained according to the following equation:\n\n    $$\n    \\mathbf{C}=c\\mathbf{U} \\cdot \\operatorname{diag}(\\mathbf{s}) +\n        \\mathbf{1}\\,\\mathbf{o}^T\n    $$\n\n    where $\\mathbf{C}$ is the $c \\times n$ matrix of cluster centers,\n    $\\mathbf{U}$ is an $c \\times n$ matrix of random values drawn from the\n    uniform distribution between -0.5 and 0.5, and $\\mathbf{1}$ is an $c \\times\n    1$ vector with all entries equal to 1.\n\n    Examples:\n        >>> from pyclugen import clucenters\n        >>> from numpy import array\n        >>> from numpy.random import Generator, PCG64\n        >>> prng = Generator(PCG64(123))\n        >>> clucenters(3, array([30,10]), array([-50,50]), rng=prng)\n        array([[-33.58833231,  36.61463056],\n               [-75.16761145,  40.53115432],\n               [-79.1684689 ,  59.3628352 ]])\n\n    Args:\n      num_clusters: Number of clusters.\n      clu_sep: Average cluster separation ( $n \\times 1$ vector).\n      clu_offset: Cluster offsets ( $n \\times 1$ vector).\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n        A $c \\times n$ matrix containing the cluster centers.\n    \"\"\"\n    # Obtain a num_clusters x num_dims matrix of uniformly distributed values\n    # between -0.5 and 0.5 representing the relative cluster centers\n    ctr_rel = rng.random((num_clusters, clu_sep.size)) - 0.5\n\n    return num_clusters * (ctr_rel @ diag(clu_sep)) + 
clu_offset\n
"},{"location":"reference/#pyclugen.clugen","title":"clugen","text":"
clugen(\n    num_dims: int,\n    num_clusters: int,\n    num_points: int,\n    direction: ArrayLike,\n    angle_disp: float,\n    cluster_sep: ArrayLike,\n    llength: float,\n    llength_disp: float,\n    lateral_disp: float,\n    allow_empty: bool = False,\n    cluster_offset: Optional[ArrayLike] = None,\n    proj_dist_fn: str | Callable[[float, int, Generator], NDArray] = \"norm\",\n    point_dist_fn: str\n    | Callable[\n        [NDArray, float, float, NDArray, NDArray, Generator], NDArray\n    ] = \"n-1\",\n    clusizes_fn: Callable[[int, int, bool, Generator], NDArray]\n    | ArrayLike = clusizes,\n    clucenters_fn: Callable[[int, NDArray, NDArray, Generator], NDArray]\n    | ArrayLike = clucenters,\n    llengths_fn: Callable[[int, float, float, Generator], NDArray]\n    | ArrayLike = llengths,\n    angle_deltas_fn: Callable[[int, float, Generator], NDArray]\n    | ArrayLike = angle_deltas,\n    rng: int | Generator = _default_rng,\n) -> Clusters\n

Generate multidimensional clusters.

Tip

This is the main function of the pyclugen package, and possibly the only function most users will need.

"},{"location":"reference/#pyclugen.clugen--examples","title":"Examples:","text":"
>>> import matplotlib.pyplot as plt\n>>> from pyclugen import clugen\n>>> from numpy import pi\n>>> out = clugen(2, 5, 10000, [1, 0.5], pi/16, [10, 40], 10, 1, 2, rng=321)\n>>> out.centers # What are the cluster centers?\narray([[ 20.02876212,  36.59611434],\n       [-15.60290734, -26.52169579],\n       [ 23.09775166,  91.66309916],\n       [ -5.76816015,  54.9775074 ],\n       [ -4.64224681,  78.40990876]])\n>>> plt.scatter(out.points[:,0],\n...             out.points[:,1],\n...             c=out.clusters) # doctest: +SKIP\n>>> plt.show() # doctest: +SKIP\n

Note

In the descriptions below, the terms \"average\" and \"dispersion\" refer to measures of central tendency and statistical dispersion, respectively. Their exact meaning depends on several optional arguments.

Parameters:

Name Type Description Default num_dims int

Number of dimensions.

required num_clusters int

Number of clusters to generate.

required num_points int

Total number of points to generate.

required direction ArrayLike

Average direction of the cluster-supporting lines. Can be a vector of length num_dims (same direction for all clusters) or a matrix of size num_clusters x num_dims (one direction per cluster).

required angle_disp float

Angle dispersion of cluster-supporting lines (radians).

required cluster_sep ArrayLike

Average cluster separation in each dimension (vector of size num_dims).

required llength float

Average length of cluster-supporting lines.

required llength_disp float

Length dispersion of cluster-supporting lines.

required lateral_disp float

Cluster lateral dispersion, i.e., dispersion of points from their projection on the cluster-supporting line.

required allow_empty bool

Allow empty clusters? False by default.

False cluster_offset Optional[ArrayLike]

Offset to add to all cluster centers (vector of size num_dims). By default the offset will be equal to numpy.zeros(num_dims).

None proj_dist_fn str | Callable[[float, int, Generator], NDArray]

Distribution of point projections along cluster-supporting lines, with three possible values:

  • \"norm\" (default): Distribute point projections along lines using a normal distribution (\u03bc=line center, \u03c3=llength/6).
  • \"unif\": Distribute points uniformly along the line.
  • User-defined function, which accepts three parameters, line length (float), number of points (int), and an instance of Generator, and returns an array containing the distance of each point projection to the center of the line. For example, the \"norm\" option roughly corresponds to lambda l, n, rg: l * rg.normal(size=(n, 1)) / 6.
'norm' point_dist_fn str | Callable[[NDArray, float, float, NDArray, NDArray, Generator], NDArray]

Controls how the final points are created from their projections on the cluster-supporting lines, with three possible values:

  • \"n-1\" (default): Final points are placed on a hyperplane orthogonal to the cluster-supporting line, centered at each point's projection, using the normal distribution (\u03bc=0, \u03c3=lateral_disp). This is done by the clupoints_n_1() function.
  • \"n\": Final points are placed around their projection on the cluster-supporting line using the normal distribution (\u03bc=0, \u03c3=lateral_disp). This is done by the clupoints_n() function.
  • User-defined function: The user can specify a custom point placement strategy by passing a function with the same signature as clupoints_n_1() and clupoints_n().
'n-1' clusizes_fn Callable[[int, int, bool, Generator], NDArray] | ArrayLike

Distribution of cluster sizes. By default, cluster sizes are determined by the clusizes() function, which uses the normal distribution (\u03bc=num_points/num_clusters, \u03c3=\u03bc/3), and ensures that the final cluster sizes add up to num_points. This parameter allows the user to specify a custom function for this purpose, which must follow the clusizes() signature. Note that custom functions are not required to strictly obey the num_points parameter. Alternatively, the user can specify an array of cluster sizes directly.

clusizes clucenters_fn Callable[[int, NDArray, NDArray, Generator], NDArray] | ArrayLike

Distribution of cluster centers. By default, cluster centers are determined by the clucenters() function, which uses the uniform distribution, and takes into account the num_clusters and cluster_sep parameters for generating well-distributed cluster centers. This parameter allows the user to specify a custom function for this purpose, which must follow clucenters() signature. Alternatively, the user can specify a matrix of size num_clusters x num_dims with the exact cluster centers.

clucenters llengths_fn Callable[[int, float, float, Generator], NDArray] | ArrayLike

Distribution of line lengths. By default, the lengths of cluster-supporting lines are determined by the llengths() function, which uses the folded normal distribution (\u03bc=llength, \u03c3=llength_disp). This parameter allows the user to specify a custom function for this purpose, which must follow llengths() signature. Alternatively, the user can specify an array of line lengths directly.

llengths angle_deltas_fn Callable[[int, float, Generator], NDArray] | ArrayLike

Distribution of line angle differences with respect to direction. By default, the angles between direction and the direction of cluster-supporting lines are determined by the angle_deltas() function, which uses the wrapped normal distribution (\u03bc=0, \u03c3=angle_disp) with support in the interval [-\u03c0/2, \u03c0/2]. This parameter allows the user to specify a custom function for this purpose, which must follow angle_deltas() signature. Alternatively, the user can specify an array of angle deltas directly.

angle_deltas rng int | Generator

The seed for the random number generator or an instance of Generator for reproducible executions.

_default_rng

Returns:

Type Description Clusters

The generated clusters and associated information in the form of a Clusters object.

Source code in pyclugen/main.py
def clugen(\n    num_dims: int,\n    num_clusters: int,\n    num_points: int,\n    direction: ArrayLike,\n    angle_disp: float,\n    cluster_sep: ArrayLike,\n    llength: float,\n    llength_disp: float,\n    lateral_disp: float,\n    allow_empty: bool = False,\n    cluster_offset: Optional[ArrayLike] = None,\n    proj_dist_fn: str | Callable[[float, int, Generator], NDArray] = \"norm\",\n    point_dist_fn: str\n    | Callable[[NDArray, float, float, NDArray, NDArray, Generator], NDArray] = \"n-1\",\n    clusizes_fn: Callable[[int, int, bool, Generator], NDArray] | ArrayLike = clusizes,\n    clucenters_fn: Callable[[int, NDArray, NDArray, Generator], NDArray]\n    | ArrayLike = clucenters,\n    llengths_fn: Callable[[int, float, float, Generator], NDArray]\n    | ArrayLike = llengths,\n    angle_deltas_fn: Callable[[int, float, Generator], NDArray]\n    | ArrayLike = angle_deltas,\n    rng: int | Generator = _default_rng,\n) -> Clusters:\n    \"\"\"Generate multidimensional clusters.\n\n    !!! tip\n        This is the main function of the **pyclugen** package, and possibly the\n        only function most users will need.\n\n    ## Examples:\n\n        >>> import matplotlib.pyplot as plt\n        >>> from pyclugen import clugen\n        >>> from numpy import pi\n        >>> out = clugen(2, 5, 10000, [1, 0.5], pi/16, [10, 40], 10, 1, 2, rng=321)\n        >>> out.centers # What are the cluster centers?\n        array([[ 20.02876212,  36.59611434],\n               [-15.60290734, -26.52169579],\n               [ 23.09775166,  91.66309916],\n               [ -5.76816015,  54.9775074 ],\n               [ -4.64224681,  78.40990876]])\n        >>> plt.scatter(out.points[:,0],\n        ...             out.points[:,1],\n        ...             c=out.clusters) # doctest: +SKIP\n        >>> plt.show() # doctest: +SKIP\n\n    ![clugen](https://user-images.githubusercontent.com/3018963/151056890-c83c9509-b40d-4ab2-a842-f2a4706344c6.png)\n\n    !!! 
Note\n        In the descriptions below, the terms \"average\" and \"dispersion\" refer to\n        measures of central tendency and statistical dispersion, respectively.\n        Their exact meaning depends on several optional arguments.\n\n    Args:\n      num_dims: Number of dimensions.\n      num_clusters: Number of clusters to generate.\n      num_points: Total number of points to generate.\n      direction: Average direction of the cluster-supporting lines. Can be a\n        vector of length `num_dims` (same direction for all clusters) or a\n        matrix of size `num_clusters` x `num_dims` (one direction per cluster).\n      angle_disp: Angle dispersion of cluster-supporting lines (radians).\n      cluster_sep: Average cluster separation in each dimension (vector of size\n        `num_dims`).\n      llength: Average length of cluster-supporting lines.\n      llength_disp: Length dispersion of cluster-supporting lines.\n      lateral_disp: Cluster lateral dispersion, i.e., dispersion of points from their\n        projection on the cluster-supporting line.\n      allow_empty: Allow empty clusters? 
`False` by default.\n      cluster_offset: Offset to add to all cluster centers (vector of size `num_dims`).\n        By default the offset will be equal to `numpy.zeros(num_dims)`.\n      proj_dist_fn: Distribution of point projections along cluster-supporting lines,\n        with three possible values:\n\n        - `\"norm\"` (default): Distribute point projections along lines using a normal\n          distribution (\u03bc=_line center_, \u03c3=`llength/6`).\n        - `\"unif\"`: Distribute points uniformly along the line.\n        - User-defined function, which accepts three parameters, line length (`float`),\n          number of points (`int`), and an instance of\n          [`Generator`](https://numpy.org/doc/stable/reference/random/generator.html?highlight=generator#numpy.random.Generator),\n          and returns an array containing the distance of each point projection to\n          the center of the line. For example, the `\"norm\"` option roughly corresponds\n          to `lambda l, n, rg: l * rg.random((n, 1)) / 6`.\n\n      point_dist_fn: Controls how the final points are created from their projections\n        on the cluster-supporting lines, with three possible values:\n\n        - `\"n-1\"` (default): Final points are placed on a hyperplane orthogonal to\n          the cluster-supporting line, centered at each point's projection, using the\n          normal distribution (\u03bc=0, \u03c3=`lateral_disp`). This is done by the\n          [`clupoints_n_1()`][pyclugen.module.clupoints_n_1] function.\n        - `\"n\"`: Final points are placed around their projection on the\n          cluster-supporting line using the normal distribution (\u03bc=0,\n          \u03c3=`lateral_disp`). 
This is done by the\n          [`clupoints_n()`][pyclugen.module.clupoints_n] function.\n        - User-defined function: The user can specify a custom point placement\n          strategy by passing a function with the same signature as\n          [`clupoints_n_1()`][pyclugen.module.clupoints_n_1] and\n          [`clupoints_n()`][pyclugen.module.clupoints_n].\n\n      clusizes_fn: Distribution of cluster sizes. By default, cluster sizes are\n        determined by the [`clusizes()`][pyclugen.module.clusizes] function, which\n        uses the normal distribution (\u03bc=`num_points`/`num_clusters`, \u03c3=\u03bc/3), and\n        assures that the final cluster sizes add up to `num_points`. This parameter\n        allows the user to specify a custom function for this purpose, which must\n        follow [`clusizes()`][pyclugen.module.clusizes] signature. Note that custom\n        functions are not required to strictly obey the `num_points` parameter.\n        Alternatively, the user can specify an array of cluster sizes directly.\n      clucenters_fn: Distribution of cluster centers. By default, cluster centers\n        are determined by the [`clucenters()`][pyclugen.module.clucenters] function,\n        which uses the uniform distribution, and takes into account the `num_clusters`\n        and `cluster_sep` parameters for generating well-distributed cluster centers.\n        This parameter allows the user to specify a custom function for this purpose,\n        which must follow [`clucenters()`][pyclugen.module.clucenters] signature.\n        Alternatively, the user can specify a matrix of size `num_clusters` x\n        `num_dims` with the exact cluster centers.\n      llengths_fn: Distribution of line lengths. By default, the lengths of\n        cluster-supporting lines are determined by the\n        [`llengths()`][pyclugen.module.llengths] function, which uses the folded\n        normal distribution (\u03bc=`llength`, \u03c3=`llength_disp`). 
This parameter allows\n        the user to specify a custom function for this purpose, which must follow\n        [`llengths()`][pyclugen.module.llengths] signature. Alternatively, the user\n        can specify an array of line lengths directly.\n      angle_deltas_fn: Distribution of line angle differences with respect to\n        `direction`. By default, the angles between `direction` and the direction of\n        cluster-supporting lines are determined by the\n        [`angle_deltas()`][pyclugen.module.angle_deltas] function, which uses the\n        wrapped normal distribution (\u03bc=0, \u03c3=`angle_disp`) with support in the interval\n        [-\u03c0/2, \u03c0/2]. This parameter allows the user to specify a custom function for\n        this purpose, which must follow [`angle_deltas()`][pyclugen.module.angle_deltas]\n        signature. Alternatively, the user can specify an array of angle deltas\n        directly.\n      rng: The seed for the random number generator or an instance of\n        [`Generator`][numpy.random.Generator] for reproducible executions.\n\n    Returns:\n      The generated clusters and associated information in the form of a\n        [`Clusters`][pyclugen.main.Clusters] object.\n    \"\"\"\n    # ############### #\n    # Validate inputs #\n    # ############### #\n\n    # Check that number of dimensions is > 0\n    if num_dims < 1:\n        raise ValueError(\"Number of dimensions, `num_dims`, must be > 0\")\n\n    # Check that number of clusters is > 0\n    if num_clusters < 1:\n        raise ValueError(\"Number of clusters, `num_clust`, must be > 0\")\n\n    # Convert given direction into a NumPy array\n    arrdir: NDArray = asarray(direction)\n\n    # Get number of dimensions in `direction` array\n    dir_ndims = arrdir.ndim\n\n    # Is direction a vector or a matrix?\n    if dir_ndims == 1:\n        # It's a vector, let's convert it into a row matrix, since this will be\n        # useful down the road\n        arrdir = 
arrdir.reshape((1, -1))\n    elif dir_ndims == 2:\n        # If a matrix was given (i.e. a main direction is given for each cluster),\n        # check if the number of directions is the same as the number of clusters\n        dir_size_1 = arrdir.shape[0]\n        if dir_size_1 != num_clusters:\n            raise ValueError(\n                \"Number of rows in `direction` must be the same as the \"\n                + f\"number of clusters ({dir_size_1} != {num_clusters})\"\n            )\n    else:\n        # The `directions` array must be a vector or a matrix, so if we get here\n        # it means we have invalid arguments\n        raise ValueError(\n            \"`direction` must be a vector (1D array) or a matrix (2D array), \"\n            + f\"but is {dir_ndims}D\"\n        )\n\n    # Check that direction has num_dims dimensions\n    dir_size_2 = arrdir.shape[1]\n    if dir_size_2 != num_dims:\n        raise ValueError(\n            \"Length of directions in `direction` must be equal to \"\n            + f\"`num_dims` ({dir_size_2} != {num_dims})\"\n        )\n\n    # Check that directions have magnitude > 0\n    dir_magnitudes = apply_along_axis(norm, 1, arrdir)\n    if any(isclose(dir_magnitudes, 0)):\n        raise ValueError(\"Directions in `direction` must have magnitude > 0\")\n\n    # If allow_empty is false, make sure there are enough points to distribute\n    # by the clusters\n    if (not allow_empty) and num_points < num_clusters:\n        raise ValueError(\n            f\"A total of {num_points} points is not enough for \"\n            + f\"{num_clusters} non-empty clusters\"\n        )\n\n    # Check that cluster_sep has num_dims dimensions\n    cluster_sep = asarray(cluster_sep)\n    if cluster_sep.size != num_dims:\n        raise ValueError(\n            \"Length of `cluster_sep` must be equal to `num_dims` \"\n            + f\"({cluster_sep.size} != {num_dims})\"\n        )\n\n    # If given, cluster_offset must have the correct number of 
dimensions,\n    # if not given then it will be a num_dims x 1 vector of zeros\n    if cluster_offset is None:\n        cluster_offset = zeros(num_dims)\n    else:\n        cluster_offset = asarray(cluster_offset)\n        if cluster_offset.size != num_dims:\n            raise ValueError(\n                \"Length of `cluster_offset` must be equal to `num_dims` \"\n                + f\"({cluster_offset.size} != {num_dims})\"\n            )\n\n    # If the user specified rng as an int, create a proper rng object\n    rng_sel: Generator\n    if isinstance(rng, Generator):\n        rng_sel = cast(Generator, rng)\n    elif isinstance(rng, int):\n        rng_sel = Generator(PCG64(cast(int, rng)))\n    else:\n        raise ValueError(\n            f\"`rng` must be an instance of int or Generator, but is {type(rng)}\"\n        )\n\n    # Check that proj_dist_fn specifies a valid way for projecting points along\n    # cluster-supporting lines i.e., either \"norm\" (default), \"unif\" or a\n    # user-defined function\n    pointproj_fn: Callable[[float, int, Generator], NDArray]\n\n    if callable(proj_dist_fn):\n        # Use user-defined distribution; assume function accepts length of line\n        # and number of points, and returns a number of points x 1 vector\n        pointproj_fn = proj_dist_fn\n\n    elif proj_dist_fn == \"unif\":\n        # Point projections will be uniformly placed along cluster-supporting lines\n        def pointproj_fn(length, n, rg):\n            return length * rg.random(n) - length / 2\n\n    elif proj_dist_fn == \"norm\":\n        # Use normal distribution for placing point projections along cluster-supporting\n        # lines, mean equal to line center, standard deviation equal to 1/6 of line\n        # length such that the line length contains \u224899.73% of the points\n        def pointproj_fn(length, n, rg):\n            return (1.0 / 6.0) * length * rg.normal(size=n)\n\n    else:\n        raise ValueError(\n            \"`proj_dist_fn` 
has to be either 'norm', 'unif' or user-defined function\"\n        )\n\n    # Check that point_dist_fn specifies a valid way for generating points given\n    # their projections along cluster-supporting lines, i.e., either \"n-1\"\n    # (default), \"n\" or a user-defined function\n    pt_from_proj_fn: Callable[\n        [NDArray, float, float, NDArray, NDArray, Generator], NDArray\n    ]\n\n    if num_dims == 1:\n        # If 1D was specified, point projections are the points themselves\n        def pt_from_proj_fn(projs, lat_disp, length, clu_dir, clu_ctr, rng=rng_sel):\n            return projs\n\n    elif callable(point_dist_fn):\n        # Use user-defined distribution; assume function accepts point projections\n        # on the line, lateral disp., cluster direction and cluster center, and\n        # returns a num_points x num_dims matrix containing the final points\n        # for the current cluster\n        pt_from_proj_fn = point_dist_fn\n\n    elif point_dist_fn == \"n-1\":\n        # Points will be placed on a hyperplane orthogonal to the cluster-supporting\n        # line using a normal distribution centered at their intersection\n        pt_from_proj_fn = clupoints_n_1\n\n    elif point_dist_fn == \"n\":\n        # Points will be placed using a multivariate normal distribution\n        # centered at the point projection\n        pt_from_proj_fn = clupoints_n\n\n    else:\n        raise ValueError(\n            \"point_dist_fn has to be either 'n-1', 'n' or a user-defined function\"\n        )\n\n    # ############################ #\n    # Determine cluster properties #\n    # ############################ #\n\n    # Normalize main direction(s)\n    arrdir = apply_along_axis(lambda a: a / norm(a), 1, arrdir)\n\n    # If only one main direction was given, expand it for all clusters\n    if dir_ndims == 1:\n        arrdir = repeat(arrdir, num_clusters, axis=0)\n\n    # Determine cluster sizes\n    if callable(clusizes_fn):\n        cluster_sizes = 
clusizes_fn(num_clusters, num_points, allow_empty, rng_sel)\n    elif len(asarray(clusizes_fn)) == num_clusters:\n        cluster_sizes = asarray(clusizes_fn)\n    else:\n        raise ValueError(\n            \"clusizes_fn has to be either a function or a `num_clusters`-sized array\"\n        )\n\n    # Custom clusizes_fn's are not required to obey num_points, so we update\n    # it here just in case it's different from what the user specified\n    num_points = sum(cluster_sizes)\n\n    # Determine cluster centers\n    if callable(clucenters_fn):\n        cluster_centers = clucenters_fn(\n            num_clusters, cluster_sep, cluster_offset, rng_sel\n        )\n    elif asarray(clucenters_fn).shape == (num_clusters, num_dims):\n        cluster_centers = asarray(clucenters_fn)\n    else:\n        raise ValueError(\n            \"clucenters_fn has to be either a function or a matrix of size \"\n            + \"`num_clusters` x `num_dims`\"\n        )\n\n    # Determine length of lines supporting clusters\n    if callable(llengths_fn):\n        cluster_lengths = llengths_fn(num_clusters, llength, llength_disp, rng_sel)\n    elif len(asarray(llengths_fn)) == num_clusters:\n        cluster_lengths = asarray(llengths_fn)\n    else:\n        raise ValueError(\n            \"llengths_fn has to be either a function or a `num_clusters`-sized array\"\n        )\n\n    # Obtain angles between main direction and cluster-supporting lines\n    if callable(angle_deltas_fn):\n        cluster_angles = angle_deltas_fn(num_clusters, angle_disp, rng_sel)\n    elif len(asarray(angle_deltas_fn)) == num_clusters:\n        cluster_angles = asarray(angle_deltas_fn)\n    else:\n        raise ValueError(\n            \"angle_deltas_fn has to be either a function or a \"\n            + \"`num_clusters`-sized array\"\n        )\n\n    # Determine normalized cluster directions by applying the obtained angles\n    cluster_directions = apply_along_axis(\n        lambda v, a: 
rand_vector_at_angle(v, next(a), rng_sel),\n        1,\n        arrdir,\n        iter(cluster_angles),\n    )\n\n    # ################################# #\n    # Determine points for each cluster #\n    # ################################# #\n\n    # Aux. vector with cumulative sum of number of points in each cluster\n    cumsum_points = concatenate((asarray([0]), cumsum(cluster_sizes)))\n\n    # Pre-allocate data structures for holding cluster info and points\n    point_clusters: NDArray = empty(\n        num_points, dtype=int32\n    )  # Cluster indices of each point\n    point_projections = empty((num_points, num_dims))  # Point projections on\n    #                                                  # cluster-supporting lines\n    points = empty((num_points, num_dims))  # Final points to be generated\n\n    # Loop through clusters and create points for each one\n    for i in range(num_clusters):\n        # Start and end indexes for points in current cluster\n        idx_start = cumsum_points[i]\n        idx_end = cumsum_points[i + 1]\n\n        # Update cluster indices of each point\n        point_clusters[idx_start:idx_end] = i\n\n        # Determine distance of point projections from the center of the line\n        ptproj_dist_fn_center = pointproj_fn(\n            cluster_lengths[i], cluster_sizes[i], rng_sel\n        )\n\n        # Determine coordinates of point projections on the line using the\n        # parametric line equation (this works since cluster direction is normalized)\n        point_projections[idx_start:idx_end, :] = points_on_line(\n            cluster_centers[i, :], cluster_directions[i, :], ptproj_dist_fn_center\n        )\n\n        # Determine points from their projections on the line\n        points[idx_start:idx_end, :] = pt_from_proj_fn(\n            point_projections[idx_start:idx_end, :],\n            lateral_disp,\n            cluster_lengths[i],\n            cluster_directions[i, :],\n            cluster_centers[i, :],\n            
rng_sel,\n        )\n\n    return Clusters(\n        points,\n        point_clusters,\n        point_projections,\n        cluster_sizes,\n        cluster_centers,\n        cluster_directions,\n        cluster_angles,\n        cluster_lengths,\n    )\n
"},{"location":"reference/#pyclugen.clumerge","title":"clumerge","text":"
clumerge(\n    *data: NamedTuple | Mapping[str, ArrayLike],\n    fields: tuple[str, ...] = (\"points\", \"clusters\"),\n    clusters_field: str | None = \"clusters\"\n) -> dict[str, NDArray]\n

Merges the fields (specified in fields) of two or more data sets.

Merges the fields (specified in fields) of two or more data sets (named tuples or dictionaries). The fields to be merged need to have the same number of columns. The corresponding merged field will contain the rows of the fields to be merged, and will have a common supertype.

The clusters_field parameter specifies a field containing integers that identify the cluster to which the respective points belong. If clusters_field is specified (by default it's specified as \"clusters\"), cluster assignments in individual datasets will be updated in the merged dataset so that clusters are considered separate. This parameter can be set to None, in which case no field will be considered as a special cluster assignments field.

This function can be used to merge data sets generated with the clugen() function, by default merging the points and clusters fields in those data sets. It also works with arbitrary data by specifying alternative fields in the fields parameter. It can be used, for example, to merge third-party data with clugen()-generated data.

Examples:

>>> from pyclugen import clugen, clumerge\n>>> data1 = clugen(2, 5, 1000, [1, 1], 0.01, [20, 20], 14, 1.2, 1.5);\n>>> data2 = clugen(2, 3, 450, [0.8, -0.3], 0, [25, 21], 6, 0.4, 3.5);\n>>> data3 = clugen(2, 2, 600, [0, -0.7], 0.2, [15, 10], 1, 0.1, 5.2);\n>>> data_merged = clumerge(data1, data2, data3)\n

Parameters:

Name Type Description Default *data NamedTuple | Mapping[str, ArrayLike]

One or more cluster data sets whose fields are to be merged.

() fields tuple[str, ...]

Fields to be merged, which must exist in every data set given in *data.

('points', 'clusters') clusters_field str | None

Field containing the integer cluster labels. If specified, cluster assignments in individual datasets will be updated in the merged dataset so that clusters are considered separate.

'clusters'

Returns:

Type Description dict[str, NDArray]

A dictionary, where keys correspond to field names, and values to the merged numerical arrays.

Source code in pyclugen/main.py
def clumerge(\n    *data: NamedTuple | Mapping[str, ArrayLike],\n    fields: tuple[str, ...] = (\"points\", \"clusters\"),\n    clusters_field: str | None = \"clusters\",\n) -> dict[str, NDArray]:\n    r\"\"\"Merges the fields (specified in `fields`) of two or more `data` sets.\n\n    Merges the fields (specified in `fields`) of two or more `data` sets (named\n    tuples or dictionaries). The fields to be merged need to have the same\n    number of columns. The corresponding merged field will contain the rows of\n    the fields to be merged, and will have a common supertype.\n\n    The `clusters_field` parameter specifies a field containing integers that\n    identify the cluster to which the respective points belongs to. If\n    `clusters_field` is specified (by default it's specified as `\"clusters\"`),\n    cluster assignments in individual datasets will be updated in the merged\n    dataset so that clusters are considered separate. This parameter can be set\n    to `None`, in which case no field will be considered as a special cluster\n    assignments field.\n\n    This function can be used to merge data sets generated with the\n    [`clugen()`][pyclugen.main.clugen] function, by default merging the\n    `points` and `clusters` fields in those data sets. 
It also works with\n    arbitrary data by specifying alternative fields in the `fields` parameter.\n    It can be used, for example, to merge third-party data with\n    [`clugen()`][pyclugen.main.clugen]-generated data.\n\n    Examples:\n        >>> from pyclugen import clugen, clumerge\n        >>> data1 = clugen(2, 5, 1000, [1, 1], 0.01, [20, 20], 14, 1.2, 1.5);\n        >>> data2 = clugen(2, 3, 450, [0.8, -0.3], 0, [25, 21], 6, 0.4, 3.5);\n        >>> data3 = clugen(2, 2, 600, [0, -0.7], 0.2, [15, 10], 1, 0.1, 5.2);\n        >>> data_merged = clumerge(data1, data2, data3)\n\n    Args:\n      *data: One or more cluster data sets whose `fields` are to be merged.\n      fields: Fields to be merged, which must exist in the data set given in\n        `*data`.\n      clusters_field: Field containing the integer cluster labels. If specified,\n        cluster assignments in individual datasets will be updated in the merged\n        dataset so that clusters are considered separate.\n\n    Returns:\n      A dictionary, where keys correspond to field names, and values to the\n        merged numerical arrays.\n    \"\"\"\n    # Number of elements in each array the merged dataset\n    numel: int = 0\n\n    # Number of columns of values in each field\n    fields_info: dict[str, _FieldInfo] = {}\n\n    # Merged dataset to output, initially empty\n    output: dict[str, NDArray] = {}\n\n    # Create a fields set\n    fields_set: MutableSet[str] = set(fields)\n\n    # If a clusters field is given, add it\n    if clusters_field is not None:\n        fields_set.add(str(clusters_field))\n\n    # Data in dictionary format with NDArray views on data\n    ddata: MutableSequence[Mapping[str, NDArray]] = []\n    for dt in data:\n        # If dt is a named tuple, convert it into a dictionary\n        ddt: Mapping[str, ArrayLike]\n        if isinstance(dt, dict):\n            ddt = cast(dict, dt)\n        else:\n            ntdt = cast(NamedTuple, dt)\n            ddt = ntdt._asdict()\n\n  
      # Convert dictionary values to NDArrays\n        ddtnp: Mapping[str, NDArray] = {k: asarray(v) for k, v in ddt.items()}\n\n        # Add converted dictionary to our sequence of dictionaries\n        ddata.append(ddtnp)\n\n    # Cycle through data items\n    for dt in ddata:\n        # Number of elements in the current item\n        numel_i: int = -1\n\n        # Cycle through fields for the current item\n        for field in fields_set:\n            if field not in dt:\n                raise ValueError(f\"Data item does not contain required field `{field}`\")\n            elif field == clusters_field and not can_cast(\n                dt[clusters_field].dtype, int64\n            ):\n                raise ValueError(f\"`{clusters_field}` must contain integer types\")\n\n            # Get the field value\n            value: NDArray = dt[field]\n\n            # Number of elements in field value\n            numel_tmp = len(value)\n\n            # Check the number of elements in the field value\n            if numel_i == -1:\n                # First field: get number of elements in value (must be the same\n                # for the remaining field values)\n                numel_i = numel_tmp\n\n            elif numel_tmp != numel_i:\n                # Fields values after the first must have the same number of\n                # elements\n                raise ValueError(\n                    \"Data item contains fields with different sizes \"\n                    + f\"({numel_tmp} != {numel_i})\"\n                )\n\n            # Get/check info about the field value type\n            if field not in fields_info:\n                # If it's the first time this field appears, just get the info\n                fields_info[field] = _FieldInfo(value.dtype, _getcols(value))\n\n            else:\n                # If this field already appeared in previous data items, get the\n                # info and check/determine its compatibility with respect to\n               
 # previous data items\n                if _getcols(value) != fields_info[field].ncol:\n                    # Number of columns must be the same\n                    raise ValueError(f\"Dimension mismatch in field `{field}`\")\n\n                # Get the common supertype\n                fields_info[field].dtype = promote_types(\n                    fields_info[field].dtype, value.dtype\n                )\n\n        # Update total number of elements\n        numel += numel_i\n\n    # Initialize output dictionary fields with room for all items\n    for field in fields_info:\n        if fields_info[field].ncol == 1:\n            output[field] = empty((numel,), dtype=fields_info[field].dtype)\n        else:\n            output[field] = empty(\n                (numel, fields_info[field].ncol), dtype=fields_info[field].dtype\n            )\n\n    # Copy items from input data to output dictionary, field-wise\n    copied: int = 0\n    last_cluster: int = 0\n\n    # Create merged output\n    for dt in ddata:\n        # How many elements to copy for the current data item?\n        tocopy: int = len(dt[fields[0]])\n\n        # Cycle through each field and its information\n        for field in fields_info:\n            # Copy elements\n            if field == clusters_field:\n                # If this is a clusters field, update the cluster IDs\n                old_clusters = unique(dt[clusters_field])\n                new_clusters = list(\n                    range(last_cluster + 1, last_cluster + len(old_clusters) + 1)\n                )\n                old2new = zip(old_clusters, new_clusters)\n                mapping = dict(old2new)\n                last_cluster = new_clusters[-1]\n\n                output[field][copied : (copied + tocopy)] = [\n                    mapping[val] for val in dt[clusters_field]\n                ]\n\n            else:\n                # Otherwise just copy the elements\n                ncol: int = fields_info[field].ncol\n                
output[field].flat[copied * ncol : (copied + tocopy) * ncol] = dt[field]\n\n        # Update how many were copied so far\n        copied += tocopy\n\n    # Return result\n    return output\n
"},{"location":"reference/#pyclugen.clupoints_n","title":"clupoints_n","text":"
clupoints_n(\n    projs: NDArray,\n    lat_disp: float,\n    line_len: float,\n    clu_dir: NDArray,\n    clu_ctr: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Generate points from their \\(n\\)-D projections on a cluster-supporting line.

Each point is placed around its projection using the normal distribution ( \\(\\mu=0\\), \\(\u03c3=\\)lat_disp).

This function's main intended use is by the clugen() function, generating the final points when the point_dist_fn parameter is set to \"n\".

Examples:

>>> from pyclugen import clupoints_n, points_on_line\n>>> from numpy import array, linspace\n>>> from numpy.random import Generator, PCG64\n>>> prng = Generator(PCG64(123))\n>>> projs = points_on_line(array([5,5]),     # Get 5 point projections\n...                        array([1,0]),     # on a 2D line\n...                        linspace(-4,4,5))\n>>> projs\narray([[1., 5.],\n       [3., 5.],\n       [5., 5.],\n       [7., 5.],\n       [9., 5.]])\n>>> clupoints_n(projs, 0.5, 1.0, array([1,0]), array([0,0]), rng=prng)\narray([[0.50543932, 4.81610667],\n       [3.64396263, 5.09698721],\n       [5.46011545, 5.2885519 ],\n       [6.68176818, 5.27097611],\n       [8.84170227, 4.83880544]])\n

Parameters:

Name Type Description Default projs NDArray

Point projections on the cluster-supporting line ( \\(p \\times n\\) matrix).

required lat_disp float

Standard deviation for the normal distribution, i.e., cluster lateral dispersion.

required line_len float

Length of cluster-supporting line (ignored).

required clu_dir NDArray

Direction of the cluster-supporting line.

required clu_ctr NDArray

Center position of the cluster-supporting line (ignored).

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Generated points ( \\(p \\times n\\) matrix).

Source code in pyclugen/module.py
def clupoints_n(\n    projs: NDArray,\n    lat_disp: float,\n    line_len: float,\n    clu_dir: NDArray,\n    clu_ctr: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Generate points from their $n$-D projections on a cluster-supporting line.\n\n    Each point is placed around its projection using the normal distribution\n    ( $\\mu=0$, $\u03c3=$`lat_disp`).\n\n    This function's main intended use is by the [`clugen()`][pyclugen.main.clugen]\n    function, generating the final points when the `point_dist_fn` parameter is\n    set to `\"n\"`.\n\n    Examples:\n        >>> from pyclugen import clupoints_n, points_on_line\n        >>> from numpy import array, linspace\n        >>> from numpy.random import Generator, PCG64\n        >>> prng = Generator(PCG64(123))\n        >>> projs = points_on_line(array([5,5]),     # Get 5 point projections\n        ...                        array([1,0]),     # on a 2D line\n        ...                        linspace(-4,4,5))\n        >>> projs\n        array([[1., 5.],\n               [3., 5.],\n               [5., 5.],\n               [7., 5.],\n               [9., 5.]])\n        >>> clupoints_n(projs, 0.5, 1.0, array([1,0]), array([0,0]), rng=prng)\n        array([[0.50543932, 4.81610667],\n               [3.64396263, 5.09698721],\n               [5.46011545, 5.2885519 ],\n               [6.68176818, 5.27097611],\n               [8.84170227, 4.83880544]])\n\n    Args:\n      projs: Point projections on the cluster-supporting line ( $p \\times n$ matrix).\n      lat_disp: Standard deviation for the normal distribution, i.e., cluster\n        lateral dispersion.\n      line_len: Length of cluster-supporting line (ignored).\n      clu_dir: Direction of the cluster-supporting line.\n      clu_ctr: Center position of the cluster-supporting line (ignored).\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Generated points ( $p \\times n$ matrix).\n    \"\"\"\n    # Number of 
dimensions\n    num_dims = clu_dir.size\n\n    # Number of points in this cluster\n    clu_num_points = projs.shape[0]\n\n    # Get random displacement vectors for each point projection\n    displ = lat_disp * rng.normal(size=(clu_num_points, num_dims))\n\n    # Add displacement vectors to each point projection\n    points = projs + displ\n\n    return points\n
"},{"location":"reference/#pyclugen.clupoints_n_1","title":"clupoints_n_1","text":"
clupoints_n_1(\n    projs: NDArray,\n    lat_disp: float,\n    line_len: float,\n    clu_dir: NDArray,\n    clu_ctr: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Generate points from their \\(n\\)-D projections on a cluster-supporting line.

Each point is placed on a hyperplane orthogonal to that line and centered at the point's projection, using the normal distribution ( \\(\\mu=0\\), \\(\u03c3=\\)lat_disp).

This function's main intended use is by the clugen() function, generating the final points when the point_dist_fn parameter is set to \"n-1\".

Examples:

>>> from pyclugen import clupoints_n_1, points_on_line\n>>> from numpy import array, linspace\n>>> from numpy.random import Generator, PCG64\n>>> prng = Generator(PCG64(123))\n>>> projs = points_on_line(array([5,5]),     # Get 5 point projections\n...                        array([1,0]),     # on a 2D line\n...                        linspace(-4,4,5))\n>>> projs\narray([[1., 5.],\n       [3., 5.],\n       [5., 5.],\n       [7., 5.],\n       [9., 5.]])\n>>> clupoints_n_1(projs, 0.5, 1.0, array([1,0]), array([0,0]), rng=prng)\narray([[1.        , 5.49456068],\n       [3.        , 5.18389333],\n       [5.        , 5.64396263],\n       [7.        , 5.09698721],\n       [9.        , 5.46011545]])\n

Parameters:

Name Type Description Default projs NDArray

Point projections on the cluster-supporting line ( \\(p \\times n\\) matrix).

required lat_disp float

Standard deviation for the normal distribution, i.e., cluster lateral dispersion.

required line_len float

Length of cluster-supporting line (ignored).

required clu_dir NDArray

Direction of the cluster-supporting line.

required clu_ctr NDArray

Center position of the cluster-supporting line (ignored).

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Generated points ( \\(p \\times n\\) matrix).

Source code in pyclugen/module.py
def clupoints_n_1(\n    projs: NDArray,\n    lat_disp: float,\n    line_len: float,\n    clu_dir: NDArray,\n    clu_ctr: NDArray,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Generate points from their $n$-D projections on a cluster-supporting line.\n\n    Each point is placed on a hyperplane orthogonal to that line and centered at\n    the point's projection, using the normal distribution ( $\\mu=0$,\n    $\u03c3=$`lat_disp`).\n\n    This function's main intended use is by the [`clugen()`][pyclugen.main.clugen]\n    function, generating the final points when the `point_dist_fn` parameter is\n    set to `\"n-1\"`.\n\n    Examples:\n        >>> from pyclugen import clupoints_n_1, points_on_line\n        >>> from numpy import array, linspace\n        >>> from numpy.random import Generator, PCG64\n        >>> prng = Generator(PCG64(123))\n        >>> projs = points_on_line(array([5,5]),     # Get 5 point projections\n        ...                        array([1,0]),     # on a 2D line\n        ...                        linspace(-4,4,5))\n        >>> projs\n        array([[1., 5.],\n               [3., 5.],\n               [5., 5.],\n               [7., 5.],\n               [9., 5.]])\n        >>> clupoints_n_1(projs, 0.5, 1.0, array([1,0]), array([0,0]), rng=prng)\n        array([[1.        , 5.49456068],\n               [3.        , 5.18389333],\n               [5.        , 5.64396263],\n               [7.        , 5.09698721],\n               [9.        
, 5.46011545]])\n\n    Args:\n      projs: Point projections on the cluster-supporting line ( $p \\times n$ matrix).\n      lat_disp: Standard deviation for the normal distribution, i.e., cluster\n        lateral dispersion.\n      line_len: Length of cluster-supporting line (ignored).\n      clu_dir: Direction of the cluster-supporting line.\n      clu_ctr: Center position of the cluster-supporting line (ignored).\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Generated points ( $p \\times n$ matrix).\n    \"\"\"\n    # No blank line allowed here\n\n    # Define function to get distances from points to their projections on the\n    # line (i.e., using the normal distribution)\n    def dist_fn(clu_num_points, ldisp, rg):\n        return ldisp * rg.normal(size=clu_num_points)\n\n    # Use clupoints_n_1_template() to do the heavy lifting\n    return clupoints_n_1_template(projs, lat_disp, clu_dir, dist_fn, rng=rng)\n
"},{"location":"reference/#pyclugen.clupoints_n_1_template","title":"clupoints_n_1_template","text":"
clupoints_n_1_template(\n    projs: NDArray,\n    lat_disp: float,\n    clu_dir: NDArray,\n    dist_fn: Callable[[int, float, Generator], NDArray],\n    rng: Generator = _default_rng,\n) -> NDArray\n

Create \\(p\\) points from their \\(n\\)-D projections on a cluster-supporting line.

Each point is placed on a hyperplane orthogonal to that line and centered at the point's projection. The function specified in dist_fn is used to perform the actual placement.

This function is used internally by clupoints_n_1() and may be useful for constructing user-defined final point placement strategies for the point_dist_fn parameter of the main clugen() function.

Examples:

>>> from numpy import array, zeros\n>>> from numpy.random import Generator, PCG64\n>>> from pyclugen import clupoints_n_1_template, points_on_line\n>>> ctr = zeros(2)\n>>> dir = array([1, 0])\n>>> pdist = array([-0.5, -0.2, 0.1, 0.3])\n>>> rng = Generator(PCG64(123))\n>>> proj = points_on_line(ctr, dir, pdist)\n>>> clupoints_n_1_template(proj, 0, dir, lambda p, l, r: r.random(p), rng=rng)\narray([[-0.5       ,  0.68235186],\n       [-0.2       , -0.05382102],\n       [ 0.1       ,  0.22035987],\n       [ 0.3       , -0.18437181]])\n

Parameters:

Name Type Description Default projs NDArray

Point projections on the cluster-supporting line ( \\(p \\times n\\) matrix).

required lat_disp float

Dispersion of points from their projection.

required clu_dir NDArray

Direction of the cluster-supporting line (unit vector).

required dist_fn Callable[[int, float, Generator], NDArray]

Function to place points on a second line, orthogonal to the first. The function accepts as parameters the number of points in the current cluster, the lateral_disp parameter (the same passed to the clugen() function), and a random number generator, returning a vector containing the distance of each point to its projection on the cluster-supporting line.

required rng Generator

An optional pseudo-random number generator for reproducible executions.

_default_rng

Returns:

Type Description NDArray

Generated points ( \\(p \\times n\\) matrix).

Source code in pyclugen/helper.py
def clupoints_n_1_template(\n    projs: NDArray,\n    lat_disp: float,\n    clu_dir: NDArray,\n    dist_fn: Callable[[int, float, Generator], NDArray],\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Create $p$ points from their $n$-D projections on a cluster-supporting line.\n\n    Each point is placed on a hyperplane orthogonal to that line and centered at\n    the point's projection. The function specified in `dist_fn` is used to perform\n    the actual placement.\n\n    This function is used internally by\n    [`clupoints_n_1()`][pyclugen.module.clupoints_n_1] and may be useful for\n    constructing user-defined final point placement strategies for the `point_dist_fn`\n    parameter of the main [`clugen()`][pyclugen.main.clugen] function.\n\n    Examples:\n        >>> from numpy import array, zeros\n        >>> from numpy.random import Generator, PCG64\n        >>> from pyclugen import clupoints_n_1_template, points_on_line\n        >>> ctr = zeros(2)\n        >>> dir = array([1, 0])\n        >>> pdist = array([-0.5, -0.2, 0.1, 0.3])\n        >>> rng = Generator(PCG64(123))\n        >>> proj = points_on_line(ctr, dir, pdist)\n        >>> clupoints_n_1_template(proj, 0, dir, lambda p, l, r: r.random(p), rng=rng)\n        array([[-0.5       ,  0.68235186],\n               [-0.2       , -0.05382102],\n               [ 0.1       ,  0.22035987],\n               [ 0.3       , -0.18437181]])\n\n    Args:\n      projs: Point projections on the cluster-supporting line ( $p \\times n$ matrix).\n      lat_disp: Dispersion of points from their projection.\n      clu_dir: Direction of the cluster-supporting line (unit vector).\n      dist_fn: Function to place points on a second line, orthogonal to the first.\n        The functions accepts as parameters the number of points in the current\n        cluster, the `lateral_disp` parameter (the same passed to the\n        [`clugen()`][pyclugen.main.clugen] function), and a random number generator,\n        
returning a vector containing the distance of each point to its projection\n        on the cluster-supporting line.\n      rng: An optional pseudo-random number generator for reproducible executions.\n\n    Returns:\n      Generated points ( $p \\times n$ matrix).\n    \"\"\"\n    # Number of dimensions\n    num_dims = clu_dir.size\n\n    # Number of points in this cluster\n    clu_num_points = projs.shape[0]\n\n    # Get distances from points to their projections on the line\n    points_dist = dist_fn(clu_num_points, lat_disp, rng)\n\n    # Get normalized vectors, orthogonal to the current line, for each point\n    orth_vecs = zeros((clu_num_points, num_dims))\n\n    for j in range(clu_num_points):\n        orth_vecs[j, :] = rand_ortho_vector(clu_dir, rng=rng).ravel()\n\n    # Set vector magnitudes\n    orth_vecs = abs(points_dist).reshape(-1, 1) * orth_vecs\n\n    # Add perpendicular vectors to point projections on the line,\n    # yielding final cluster points\n    points = projs + orth_vecs\n\n    return points\n
"},{"location":"reference/#pyclugen.clusizes","title":"clusizes","text":"
clusizes(\n    num_clusters: int,\n    num_points: int,\n    allow_empty: bool,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Determine cluster sizes, i.e., the number of points in each cluster.

Cluster sizes are determined using the normal distribution ( \\(\\mu=\\)num_points \\(/\\)num_clusters, \\(\\sigma=\\mu/3\\)), and then ensuring that the final cluster sizes add up to num_points via the fix_num_points() function.

Examples:

>>> from numpy.random import Generator, PCG64\n>>> from pyclugen import clusizes\n>>> prng = Generator(PCG64(123))\n>>> sizes = clusizes(4, 1000, True, rng=prng)\n>>> sizes\narray([166, 217, 354, 263])\n>>> sum(sizes)\n1000\n

Parameters:

Name Type Description Default num_clusters int

Number of clusters.

required num_points int

Total number of points.

required allow_empty bool

Allow empty clusters?

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Number of points in each cluster (vector of size num_clusters).

Source code in pyclugen/module.py
def clusizes(\n    num_clusters: int,\n    num_points: int,\n    allow_empty: bool,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Determine cluster sizes, i.e., the number of points in each cluster.\n\n    Cluster sizes are determined using the normal distribution (\n    $\\mu=$`num_points` $/$`num_clusters`, $\\sigma=\\mu/3$), and then\n    assuring that the final cluster sizes add up to `num_points` via the\n    [`fix_num_points()`][pyclugen.helper.fix_num_points] function.\n\n    Examples:\n        >>> from numpy.random import Generator, PCG64\n        >>> from pyclugen import clusizes\n        >>> prng = Generator(PCG64(123))\n        >>> sizes = clusizes(4, 1000, True, rng=prng)\n        >>> sizes\n        array([166, 217, 354, 263])\n        >>> sum(sizes)\n        1000\n\n    Args:\n      num_clusters: Number of clusters.\n      num_points: Total number of points.\n      allow_empty: Allow empty clusters?\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Number of points in each cluster (vector of size `num_clusters`).\n    \"\"\"\n    # Determine number of points in each cluster using the normal distribution\n\n    # Consider the mean an equal division of points between clusters\n    mean = num_points / num_clusters\n    # The standard deviation is such that the interval [0, 2 * mean] will contain\n    # \u224899.7% of cluster sizes\n    std = mean / 3\n\n    # Determine points with the normal distribution\n    clu_num_points = std * rng.normal(size=num_clusters) + mean\n\n    # Set negative values to zero\n    clu_num_points = where(clu_num_points > 0, clu_num_points, 0)\n\n    # Fix imbalances, so that num_points is respected\n    if sum(clu_num_points) > 0:  # Be careful not to divide by zero\n        clu_num_points *= num_points / sum(clu_num_points)\n\n    # Round the real values to integers since a cluster sizes is represented by\n    # an integer\n    clu_num_points = rint(clu_num_points).astype(int)\n\n    
# Make sure total points is respected, which may not be the case at this time due\n    # to rounding\n    fix_num_points(clu_num_points, num_points)\n\n    # If empty clusters are not allowed, make sure there aren't any\n    if not allow_empty:\n        fix_empty(clu_num_points)\n\n    return clu_num_points\n
"},{"location":"reference/#pyclugen.fix_empty","title":"fix_empty","text":"
fix_empty(clu_num_points: NDArray, allow_empty: bool = False) -> NDArray\n

Certifies that, given enough points, no clusters are left empty.

This is done by removing a point from the largest cluster and adding it to an empty cluster while there are empty clusters. If the total number of points is smaller than the number of clusters (or if the allow_empty parameter is set to true), this function does nothing.

This function is used internally by clusizes() and might be useful for custom cluster sizing implementations given as the clusizes_fn parameter of the main clugen() function.

Note that the array is changed in-place.

Examples:

>>> from numpy import array\n>>> from pyclugen import fix_empty\n>>> clusters = array([3, 4, 5, 0, 0])\n>>> fix_empty(clusters)\narray([3, 3, 4, 1, 1])\n>>> clusters # Verify that the array was changed in-place\narray([3, 3, 4, 1, 1])\n

Parameters:

Name Type Description Default clu_num_points NDArray

Number of points in each cluster (vector of size \\(c\\)), where \\(c\\) is the number of clusters.

required allow_empty bool

Allow empty clusters?

False

Returns:

Type Description NDArray

Number of points in each cluster, after being fixed by this function (vector of size \\(c\\), which is the same reference as clu_num_points).

Source code in pyclugen/helper.py
def fix_empty(clu_num_points: NDArray, allow_empty: bool = False) -> NDArray:\n    r\"\"\"Certifies that, given enough points, no clusters are left empty.\n\n    This is done by removing a point from the largest cluster and adding it to an\n    empty cluster while there are empty clusters. If the total number of points is\n    smaller than the number of clusters (or if the `allow_empty` parameter is set\n    to `true`), this function does nothing.\n\n    This function is used internally by [`clusizes()`][pyclugen.module.clusizes]\n    and might be useful for custom cluster sizing implementations given as the\n    `clusizes_fn` parameter of the main [`clugen()`][pyclugen.main.clugen] function.\n\n    Note that the array is changed in-place.\n\n    Examples:\n        >>> from numpy import array\n        >>> from pyclugen import fix_empty\n        >>> clusters = array([3, 4, 5, 0, 0])\n        >>> fix_empty(clusters)\n        array([3, 3, 4, 1, 1])\n        >>> clusters # Verify that the array was changed in-place\n        array([3, 3, 4, 1, 1])\n\n    Args:\n      clu_num_points: Number of points in each cluster (vector of size $c$),\n        where $c$ is the number of clusters.\n      allow_empty: Allow empty clusters?\n\n    Returns:\n      Number of points in each cluster, after being fixed by this function (vector\n        of size $c$, which is the same reference than `clu_num_points`).\n    \"\"\"\n    # If the allow_empty parameter is set to true, don't do anything and return\n    # immediately; this is useful for quick `clusizes_fn` one-liners\n    if not allow_empty:\n        # Find empty clusters\n        empty_clusts = [idx for idx, val in enumerate(clu_num_points) if val == 0]\n\n        # If there are empty clusters and enough points for all clusters...\n        if len(empty_clusts) > 0 and sum(clu_num_points) >= clu_num_points.size:\n            # Go through the empty clusters...\n            for i0 in empty_clusts:\n                # ...get a point from 
the largest cluster and assign it to the\n                # current empty cluster\n                imax = argmax(clu_num_points)\n                clu_num_points[imax] -= 1\n                clu_num_points[i0] += 1\n\n    return clu_num_points\n
"},{"location":"reference/#pyclugen.fix_num_points","title":"fix_num_points","text":"
fix_num_points(clu_num_points: NDArray, num_points: int) -> NDArray\n

Certifies that the values in the clu_num_points array add up to num_points.

If this is not the case, the clu_num_points array is modified in-place, incrementing the value corresponding to the smallest cluster while sum(clu_num_points) < num_points, or decrementing the value corresponding to the largest cluster while sum(clu_num_points) > num_points.

This function is used internally by clusizes() and might be useful for custom cluster sizing implementations given as the clusizes_fn parameter of the main clugen() function.

Examples:

>>> from numpy import array\n>>> from pyclugen import fix_num_points\n>>> clusters = array([1, 6, 3])  # 10 total points\n>>> fix_num_points(clusters, 12) # But we want 12 total points\narray([3, 6, 3])\n>>> clusters # Verify that the array was changed in-place\narray([3, 6, 3])\n

Parameters:

Name Type Description Default clu_num_points NDArray

Number of points in each cluster (vector of size \\(c\\)), where \\(c\\) is the number of clusters.

required num_points int

The expected total number of points.

required

Returns:

Type Description NDArray

Number of points in each cluster, after being fixed by this function (vector of size \\(c\\), which is the same reference as clu_num_points).

Source code in pyclugen/helper.py
def fix_num_points(clu_num_points: NDArray, num_points: int) -> NDArray:\n    r\"\"\"Certifies that the values in the `clu_num_points` array add up to `num_points`.\n\n    If this is not the case, the `clu_num_points` array is modified in-place,\n    incrementing the value corresponding to the smallest cluster while\n    `sum(clu_num_points) < num_points`, or decrementing the value corresponding to\n    the largest cluster while `sum(clu_num_points) > num_points`.\n\n    This function is used internally by [`clusizes()`][pyclugen.module.clusizes]\n    and might be useful for custom cluster sizing implementations given as the\n    `clusizes_fn` parameter of the main [`clugen()`][pyclugen.main.clugen] function.\n\n    Examples:\n        >>> from numpy import array\n        >>> from pyclugen import fix_num_points\n        >>> clusters = array([1, 6, 3])  # 10 total points\n        >>> fix_num_points(clusters, 12) # But we want 12 total points\n        array([3, 6, 3])\n        >>> clusters # Verify that the array was changed in-place\n        array([3, 6, 3])\n\n    Args:\n      clu_num_points: Number of points in each cluster (vector of size $c$),\n        where $c$ is the number of clusters.\n      num_points: The expected total number of points.\n\n    Returns:\n      Number of points in each cluster, after being fixed by this function (vector\n        of size $c$, which is the same reference than `clu_num_points`).\n    \"\"\"\n    while sum(clu_num_points) < num_points:\n        imin = argmin(clu_num_points)\n        clu_num_points[imin] += 1\n    while sum(clu_num_points) > num_points:\n        imax = argmax(clu_num_points)\n        clu_num_points[imax] -= 1\n\n    return clu_num_points\n
"},{"location":"reference/#pyclugen.llengths","title":"llengths","text":"
llengths(\n    num_clusters: int,\n    llength: float,\n    llength_disp: float,\n    rng: Generator = _default_rng,\n) -> NDArray\n

Determine length of cluster-supporting lines.

Line lengths are determined using the folded normal distribution ( \\(\\mu=\\)llength, \\(\\sigma=\\)llength_disp).

Examples:

>>> from numpy.random import Generator, MT19937\n>>> from pyclugen import llengths\n>>> prng = Generator(MT19937(123))\n>>> llengths(4, 20, 3.5, rng=prng)\narray([19.50968733, 19.92482858, 25.99013804, 18.58029672])\n

Parameters:

Name Type Description Default num_clusters int

Number of clusters.

required llength float

Average line length.

required llength_disp float

Line length dispersion.

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Lengths of cluster-supporting lines (vector of size num_clusters).

Source code in pyclugen/module.py
def llengths(\n    num_clusters: int,\n    llength: float,\n    llength_disp: float,\n    rng: Generator = _default_rng,\n) -> NDArray:\n    r\"\"\"Determine length of cluster-supporting lines.\n\n    Line lengths are determined using the folded normal distribution (\n    $\\mu=$`llength`, $\\sigma=$`llength_disp`).\n\n    Examples:\n        >>> from numpy.random import Generator, MT19937\n        >>> from pyclugen import llengths\n        >>> prng = Generator(MT19937(123))\n        >>> llengths(4, 20, 3.5, rng=prng)\n        array([19.50968733, 19.92482858, 25.99013804, 18.58029672])\n\n    Args:\n      num_clusters: Number of clusters.\n      llength: Average line length.\n      llength_disp: Line length dispersion.\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Lengths of cluster-supporting lines (vector of size `num_clusters`).\n    \"\"\"\n    return abs(llength + llength_disp * rng.normal(size=num_clusters))\n
"},{"location":"reference/#pyclugen.points_on_line","title":"points_on_line","text":"
points_on_line(\n    center: NDArray, direction: NDArray, dist_center: NDArray\n) -> NDArray\n

Determine coordinates of points on a line.

Determine coordinates of points on a line with center and direction, based on the distances from the center given in dist_center.

This works by using the vector formulation of the line equation, assuming direction is an \\(n\\)-dimensional unit vector. In other words, considering \\(\\mathbf{d}=\\)direction.reshape(-1,1) ( \\(n \\times 1\\) vector), \\(\\mathbf{c}=\\)center.reshape(-1,1) ( \\(n \\times 1\\) vector), and \\(\\mathbf{w}=\\) dist_center.reshape(-1,1) ( \\(p \\times 1\\) vector), the coordinates of points on the line are given by:

\\[ \\mathbf{P}=\\mathbf{1}\\,\\mathbf{c}^T + \\mathbf{w}\\mathbf{d}^T \\]

where \\(\\mathbf{P}\\) is the \\(p \\times n\\) matrix of point coordinates on the line, and \\(\\mathbf{1}\\) is a \\(p \\times 1\\) vector with all entries equal to 1.

Examples:

>>> from pyclugen import points_on_line\n>>> from numpy import array, linspace\n>>> points_on_line(array([5., 5.]),\n...                array([1., 0.]),\n...                linspace(-4, 4, 5)) # 2D, 5 points\narray([[1., 5.],\n       [3., 5.],\n       [5., 5.],\n       [7., 5.],\n       [9., 5.]])\n>>> points_on_line(array([-2, 0, 0., 2]),\n...                array([0., 0, -1, 0]),\n...                array([10, -10])) # 4D, 2 points\narray([[ -2.,   0., -10.,   2.],\n       [ -2.,   0.,  10.,   2.]])\n

Parameters:

Name Type Description Default center NDArray

Center of the line ( \\(n\\)-component vector).

required direction NDArray

Line direction ( \\(n\\)-component unit vector).

required dist_center NDArray

Distance of each point to the center of the line ( \\(p\\)-component vector, where \\(p\\) is the number of points).

required

Returns:

Type Description NDArray

Coordinates of points on the specified line ( \\(p \\times n\\) matrix).

Source code in pyclugen/core.py
def points_on_line(\n    center: NDArray, direction: NDArray, dist_center: NDArray\n) -> NDArray:\n    r\"\"\"Determine coordinates of points on a line.\n\n    Determine coordinates of points on a line with `center` and `direction`,\n    based on the distances from the center given in `dist_center`.\n\n    This works by using the vector formulation of the line equation assuming\n    `direction` is a $n$-dimensional unit vector. In other words, considering\n    $\\mathbf{d}=$`direction.reshape(-1,1)` ( $n \\times 1$ vector),\n    $\\mathbf{c}=$`center.reshape(-1,1)` ( $n \\times 1$ vector), and\n    $\\mathbf{w}=$ `dist_center.reshape(-1,1)` ( $p \\times 1$ vector),\n    the coordinates of points on the line are given by:\n\n    $$\n    \\mathbf{P}=\\mathbf{1}\\,\\mathbf{c}^T + \\mathbf{w}\\mathbf{d}^T\n    $$\n\n    where $\\mathbf{P}$ is the $p \\times n$ matrix of point coordinates on the\n    line, and $\\mathbf{1}$ is a $p \\times 1$ vector with all entries equal to 1.\n\n    Examples:\n        >>> from pyclugen import points_on_line\n        >>> from numpy import array, linspace\n        >>> points_on_line(array([5., 5.]),\n        ...                array([1., 0.]),\n        ...                linspace(-4, 4, 5)) # 2D, 5 points\n        array([[1., 5.],\n               [3., 5.],\n               [5., 5.],\n               [7., 5.],\n               [9., 5.]])\n        >>> points_on_line(array([-2, 0, 0., 2]),\n        ...                array([0., 0, -1, 0]),\n        ...                
array([10, -10])) # 4D, 2 points\n        array([[ -2.,   0., -10.,   2.],\n               [ -2.,   0.,  10.,   2.]])\n\n    Args:\n      center: Center of the line ( $n$-component vector).\n      direction: Line direction ( $n$-component unit vector).\n      dist_center: Distance of each point to the center of the line\n        ( $p$-component vector, where $p$ is the number of points).\n\n    Returns:\n      Coordinates of points on the specified line ( $p \\times n$ matrix).\n    \"\"\"\n    return center.reshape(1, -1) + dist_center.reshape(-1, 1) @ direction.reshape(\n        (1, -1)\n    )\n
"},{"location":"reference/#pyclugen.rand_ortho_vector","title":"rand_ortho_vector","text":"
rand_ortho_vector(u: NDArray, rng: Generator = _default_rng) -> NDArray\n

Get a random unit vector orthogonal to u.

Note that u is expected to be a unit vector itself.

Examples:

>>> from pyclugen import rand_ortho_vector\n>>> from numpy import isclose, dot\n>>> from numpy.linalg import norm\n>>> from numpy.random import Generator, PCG64\n>>> rng = Generator(PCG64(123))\n>>> r = rng.random(3) # Get a random vector with 3 components (3D)\n>>> r = r / norm(r) # Normalize it\n>>> r_ort = rand_ortho_vector(r, rng=rng) # Get random unit vector orth. to r\n>>> r_ort\narray([-0.1982903 , -0.61401512,  0.76398062])\n>>> isclose(dot(r, r_ort), 0) # Check that vectors are indeed orthogonal\nTrue\n

Parameters:

Name Type Description Default u NDArray

Unit vector with \\(n\\) components.

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

A random unit vector with \\(n\\) components orthogonal to u.

Source code in pyclugen/core.py
def rand_ortho_vector(u: NDArray, rng: Generator = _default_rng) -> NDArray:\n    r\"\"\"Get a random unit vector orthogonal to `u`.\n\n    Note that `u` is expected to be a unit vector itself.\n\n    Examples:\n        >>> from pyclugen import rand_ortho_vector\n        >>> from numpy import isclose, dot\n        >>> from numpy.linalg import norm\n        >>> from numpy.random import Generator, PCG64\n        >>> rng = Generator(PCG64(123))\n        >>> r = rng.random(3) # Get a random vector with 3 components (3D)\n        >>> r = r / norm(r) # Normalize it\n        >>> r_ort = rand_ortho_vector(r, rng=rng) # Get random unit vector orth. to r\n        >>> r_ort\n        array([-0.1982903 , -0.61401512,  0.76398062])\n        >>> isclose(dot(r, r_ort), 0) # Check that vectors are indeed orthogonal\n        True\n\n    Args:\n      u: Unit vector with $n$ components.\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      A random unit vector with $n$ components orthogonal to `u`.\n    \"\"\"\n    # If 1D, just return a random unit vector\n    if u.size == 1:\n        return rand_unit_vector(1, rng=rng)\n\n    # Find a random, non-parallel vector to u\n    while True:\n        # Find normalized random vector\n        r = rand_unit_vector(u.size, rng=rng)\n\n        # If not parallel to u we can keep it and break the loop\n        if not isclose(abs(dot(u, r)), 1):\n            break\n\n    # Get vector orthogonal to u using 1st iteration of Gram-Schmidt process\n    v = r - dot(u, r) / dot(u, u) * u\n\n    # Normalize it\n    v = v / norm(v)\n\n    # And return it\n    return v\n
"},{"location":"reference/#pyclugen.rand_unit_vector","title":"rand_unit_vector","text":"
rand_unit_vector(num_dims: int, rng: Generator = _default_rng) -> NDArray\n

Get a random unit vector with num_dims components.

Examples:

>>> from pyclugen import rand_unit_vector\n>>> rand_unit_vector(4)\narray([ 0.48653889,  0.50753862,  0.05711487, -0.70881757])\n
>>> from pyclugen import rand_unit_vector\n>>> from numpy.random import Generator, PCG64\n>>> rng = Generator(PCG64(123))\n>>> rand_unit_vector(2, rng=rng) # Reproducible\narray([ 0.3783202 , -0.92567479])\n

Parameters:

Name Type Description Default num_dims int

Number of components in vector (i.e. vector size).

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

A random unit vector with num_dims components.

Source code in pyclugen/core.py
def rand_unit_vector(num_dims: int, rng: Generator = _default_rng) -> NDArray:\n    r\"\"\"Get a random unit vector with `num_dims` components.\n\n    Examples:\n        >>> from pyclugen import rand_unit_vector\n        >>> rand_unit_vector(4) # doctest: +SKIP\n        array([ 0.48653889,  0.50753862,  0.05711487, -0.70881757])\n\n        >>> from pyclugen import rand_unit_vector\n        >>> from numpy.random import Generator, PCG64\n        >>> rng = Generator(PCG64(123))\n        >>> rand_unit_vector(2, rng=rng) # Reproducible\n        array([ 0.3783202 , -0.92567479])\n\n    Args:\n      num_dims: Number of components in vector (i.e. vector size).\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      A random unit vector with `num_dims` components.\n    \"\"\"\n    r = rng.random(num_dims) - 0.5\n    r = r / norm(r)\n    return r\n
"},{"location":"reference/#pyclugen.rand_vector_at_angle","title":"rand_vector_at_angle","text":"
rand_vector_at_angle(\n    u: NDArray, angle: float, rng: Generator = _default_rng\n) -> NDArray\n

Get a random unit vector which is at angle radians of vector u.

Note that u is expected to be a unit vector itself.

Examples:

>>> from pyclugen import rand_vector_at_angle\n>>> from numpy import arccos, array, degrees, pi, dot\n>>> from numpy.linalg import norm\n>>> from numpy.random import Generator, PCG64\n>>> rng = Generator(PCG64(123))\n>>> u = array([ 1.0, 0, 0.5, -0.5 ]) # Define a 4D vector\n>>> u = u / norm(u) # Normalize the vector\n>>> v = rand_vector_at_angle(u, pi/4, rng=rng) # Get a vector at 45 degrees\n>>> v\narray([ 0.633066  , -0.50953554, -0.10693823, -0.57285705])\n>>> degrees(arccos(dot(u, v) / norm(u) * norm(v))) # Angle between u and v\n45.0\n

Parameters:

Name Type Description Default u NDArray

Unit vector with \\(n\\) components.

required angle float

Angle in radians.

required rng Generator

Optional pseudo-random number generator.

_default_rng

Returns:

Type Description NDArray

Random unit vector with \\(n\\) components which is at angle radians with vector u.

Source code in pyclugen/core.py
def rand_vector_at_angle(\n    u: NDArray, angle: float, rng: Generator = _default_rng\n) -> NDArray:\n    r\"\"\"Get a random unit vector which is at `angle` radians of vector `u`.\n\n    Note that `u` is expected to be a unit vector itself.\n\n    Examples:\n        >>> from pyclugen import rand_vector_at_angle\n        >>> from numpy import arccos, array, degrees, pi, dot\n        >>> from numpy.linalg import norm\n        >>> from numpy.random import Generator, PCG64\n        >>> rng = Generator(PCG64(123))\n        >>> u = array([ 1.0, 0, 0.5, -0.5 ]) # Define a 4D vector\n        >>> u = u / norm(u) # Normalize the vector\n        >>> v = rand_vector_at_angle(u, pi/4, rng=rng) # Get a vector at 45 degrees\n        >>> v\n        array([ 0.633066  , -0.50953554, -0.10693823, -0.57285705])\n        >>> degrees(arccos(dot(u, v) / norm(u) * norm(v))) # Angle between u and v\n        45.0\n\n    Args:\n      u: Unit vector with $n$ components.\n      angle: Angle in radians.\n      rng: Optional pseudo-random number generator.\n\n    Returns:\n      Random unit vector with $n$ components which is at `angle` radians\n        with vector `u`.\n    \"\"\"\n    if isclose(abs(angle), pi / 2) and u.size > 1:\n        return rand_ortho_vector(u, rng=rng)\n    elif -pi / 2 < angle < pi / 2 and u.size > 1:\n        v = u + rand_ortho_vector(u, rng=rng) * tan(angle)\n        return v / norm(v)\n    else:\n        # For |\u03b8| > \u03c0/2 or the 1D case, simply return a random vector\n        return rand_unit_vector(u.size, rng=rng)\n
"},{"location":"theory/","title":"Theory","text":"

This section presents a general overview of the clugen algorithm. A complete description of the algorithm's theoretical framework is available in the article \"Generating multidimensional clusters with support lines\" (an open version is available on arXiv).

Clugen is an algorithm for generating multidimensional clusters. Each cluster is supported by a line segment, the position, orientation and length of which guide where the respective points are placed. For brevity, line segments will be referred to as lines.

Given an \\(n\\)-dimensional direction vector \\(\\mathbf{d}\\) (and a number of additional parameters, which will be discussed shortly), the clugen algorithm works as follows (\\(^*\\) means the algorithm step is stochastic):

  1. Normalize \\(\\mathbf{d}\\).
  2. \\(^*\\)Determine cluster sizes.
  3. \\(^*\\)Determine cluster centers.
  4. \\(^*\\)Determine lengths of cluster-supporting lines.
  5. \\(^*\\)Determine angles between \\(\\mathbf{d}\\) and cluster-supporting lines.
  6. For each cluster:
     1. \\(^*\\)Determine direction of the cluster-supporting line.
     2. \\(^*\\)Determine distance of point projections from the center of the cluster-supporting line.
     3. Determine coordinates of point projections on the cluster-supporting line.
     4. \\(^*\\)Determine points from their projections on the cluster-supporting line.

Figure 1 provides a stylized overview of the algorithm's steps.

The example in Figure 1 was generated with the following parameters, the exact meaning of which will be discussed shortly:

Parameter values Description \\(n=2\\) Number of dimensions. \\(c=4\\) Number of clusters. \\(p=200\\) Total number of points. \\(\\mathbf{d}=\\begin{bmatrix}1 & 1\\end{bmatrix}^T\\) Average direction. \\(\\theta_\\sigma=\\pi/16=11.25^{\\circ}\\) Angle dispersion. \\(\\mathbf{s}=\\begin{bmatrix}10 & 10\\end{bmatrix}^T\\) Average cluster separation. \\(l=10\\) Average line length. \\(l_\\sigma=1.5\\) Line length dispersion. \\(f_\\sigma=1\\) Cluster lateral dispersion.

Additionally, all optional parameters (not listed above) were left to their default values. The complete list of parameters is presented in the clugen() function documentation.

"},{"location":"generated/gallery/","title":"Examples","text":""},{"location":"generated/gallery/#examples","title":"Examples","text":"

Examples in 1D

Examples in 2D

Examples in 3D

Examples in nD

Merging and hierarchical cluster examples

Plot functions

Download all examples in Python source code: gallery_python.zip

Download all examples in Jupyter notebooks: gallery_jupyter.zip

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/mg_execution_times/","title":"Computation times","text":"

00:29.296 total execution time for generated_gallery files:

+----------------------------------------------------------------------------------------+-----------+--------+ | plot_2_2d_examples (docs/examples/plot_2_2d_examples.py) | 00:11.459 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_4_nd_examples (docs/examples/plot_4_nd_examples.py) | 00:07.035 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_3_3d_examples (docs/examples/plot_3_3d_examples.py) | 00:05.724 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_5_mrg_examples (docs/examples/plot_5_mrg_examples.py) | 00:03.588 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_1_1d_examples (docs/examples/plot_1_1d_examples.py) | 00:01.484 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+ | plot_functions (docs/examples/plot_functions.py) | 00:00.006 | 0.0 MB | +----------------------------------------------------------------------------------------+-----------+--------+

"},{"location":"generated/gallery/plot_1_1d_examples/","title":"Examples in 1D","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_1_1d_examples/#examples-in-1d","title":"Examples in 1D","text":"

This section contains several examples on how to generate 1D data with pyclugen. To run the examples we first need to import the clugen() function:

from pyclugen import clugen\n

To plot these examples we use the plot_examples_1d function:

from plot_functions import plot_examples_1d\n

Out:

/home/runner/work/pyclugen/pyclugen/docs/docs/examples/plot_functions.py:15: DeprecationWarning: \nPyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\nbut was not found to be installed on your system.\nIf this would cause problems for you,\nplease provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n\n  import pandas as pd\n
"},{"location":"generated/gallery/plot_1_1d_examples/#basic-1d-example-with-density-plot","title":"Basic 1D example with density plot","text":"
seed = 23456\n
# Custom proj_dist_fn: point projections placed using the Weibull distribution\ndef proj_weibull(len, n, rng):\n    return len / 2 * rng.weibull(1.5, size=n)\n
e082 = clugen(1, 3, 1000, [1], 0, [10], 6, 1.5, 0, rng=seed)\ne083 = clugen(1, 3, 1000, [1], 0, [10], 6, 1.5, 0, rng=seed, proj_dist_fn=\"unif\")\ne084 = clugen(1, 3, 1000, [1], 0, [10], 6, 1.5, 0, rng=seed, proj_dist_fn=proj_weibull)\n
plot_examples_1d(\n    e082, \"e082: proj_dist_fn = 'norm' (default)\",\n    e083, \"e083: proj_dist_fn = 'unif'\",\n    e084, \"e084: custom proj_dist_fn (Weibull)\")\n

Total running time of the script: ( 0 minutes 1.484 seconds)

Download Python source code: plot_1_1d_examples.py

Download Jupyter notebook: plot_1_1d_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_2_2d_examples/","title":"Examples in 2D","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_2_2d_examples/#examples-in-2d","title":"Examples in 2D","text":"

This section contains several examples on how to generate 2D data with pyclugen. To run the examples we first need to import the clugen() function:

import numpy as np\nfrom pyclugen import clugen\n

To plot these examples we use the plot_examples_2d function:

from plot_functions import plot_examples_2d\n
"},{"location":"generated/gallery/plot_2_2d_examples/#manipulating-the-direction-of-cluster-supporting-lines","title":"Manipulating the direction of cluster-supporting lines","text":""},{"location":"generated/gallery/plot_2_2d_examples/#using-the-direction-parameter","title":"Using the direction parameter","text":"
seed = 123\n
e001 = clugen(2, 4, 2000, [1, 0], 0, [10, 10], 10, 1.5, 0.5, rng=seed)\ne002 = clugen(2, 4, 200, [1, 1], 0, [10, 10], 10, 1.5, 0.5, rng=seed)\ne003 = clugen(2, 4, 200, [0, 1], 0, [10, 10], 10, 1.5, 0.5, rng=seed)\n
plot_examples_2d(\n    e001, \"e001: direction = [1, 0]\",\n    e002, \"e002: direction = [1, 1]\",\n    e003, \"e003: direction = [0, 1]\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#changing-the-angle_disp-parameter-and-using-a-custom-angle_deltas_fn-function","title":"Changing the angle_disp parameter and using a custom angle_deltas_fn function","text":"
seed = 321\n
# Custom angle_deltas function: arbitrarily rotate some clusters by 90 degrees\ndef angdel_90_fn(nclu, astd, rng):\n    return rng.choice([0, np.pi / 2], size=nclu)\n
e004 = clugen(2, 6, 500, [1, 0], 0, [10, 10], 10, 1.5, 0.5, rng=seed)\ne005 = clugen(2, 6, 500, [1, 0], np.pi / 8, [10, 10], 10, 1.5, 0.5, rng=seed)\ne006 = clugen(2, 6, 500, [1, 0], 0, [10, 10], 10, 1.5, 0.5, rng=seed,\n    angle_deltas_fn=angdel_90_fn)\n
plot_examples_2d(\n    e004, \"e004: angle_disp = 0\",\n    e005, \"e005: angle_disp = \u03c0/8\",\n    e006, \"e006: custom angle_deltas function\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#manipulating-the-length-of-cluster-supporting-lines","title":"Manipulating the length of cluster-supporting lines","text":""},{"location":"generated/gallery/plot_2_2d_examples/#using-the-llength-parameter","title":"Using the llength parameter","text":"
seed = 567\n
e007 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10],  0, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne008 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 10, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne009 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 30, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\n
plot_examples_2d(\n    e007, \"e007: llength = 0\",\n    e008, \"e008: llength = 10\",\n    e009, \"e009: llength = 30\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#changing-the-llength_disp-parameter-and-using-a-custom-llengths_fn-function","title":"Changing the llength_disp parameter and using a custom llengths_fn function","text":"
seed = 567\n
# Custom llengths function: line lengths grow for each new cluster\ndef llen_grow_fn(nclu, llen, llenstd, rng):\n    return llen * np.arange(nclu) + rng.normal(scale=llenstd, size=nclu)\n
e010 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 15,  0.0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne011 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 15, 10.0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne012 = clugen(2, 5, 800, [1, 0], np.pi / 10, [10, 10], 10,  0.1, 0.5, rng=seed,\n    llengths_fn=llen_grow_fn, point_dist_fn=\"n\")\n
plot_examples_2d(\n    e010, \"e010: llength_disp = 0.0\",\n    e011, \"e011: llength_disp = 10.0\",\n    e012, \"e012: custom llengths function\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#manipulating-relative-cluster-positions","title":"Manipulating relative cluster positions","text":""},{"location":"generated/gallery/plot_2_2d_examples/#using-the-cluster_sep-parameter","title":"Using the cluster_sep parameter","text":"
seed = 21\n
e013 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 10], 10, 2, 2.5, rng=seed)\ne014 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [30, 10], 10, 2, 2.5, rng=seed)\ne015 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 30], 10, 2, 2.5, rng=seed)\n
plt = plot_examples_2d(\n    e013, \"e013: cluster_sep = [10, 10]\",\n    e014, \"e014: cluster_sep = [30, 10]\",\n    e015, \"e015: cluster_sep = [10, 30]\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#changing-the-cluster_offset-parameter-and-using-a-custom-clucenters_fn-function","title":"Changing the cluster_offset parameter and using a custom clucenters_fn function","text":"
seed = 21\n
# Custom clucenters function: places clusters in a diagonal\ndef centers_diag_fn(nclu, csep, coff, rng):\n    return np.ones((nclu, len(csep))) * np.arange(1, nclu + 1)[:, None] * np.max(csep) + coff\n
e016 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 10], 10, 2, 2.5, rng=seed)\ne017 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 10], 10, 2, 2.5, rng=seed,\n    cluster_offset=[20, -20])\ne018 = clugen(2, 8, 1000, [1, 1], np.pi / 4, [10, 10], 10, 2, 2.5, rng=seed,\n    cluster_offset=[-50, -50], clucenters_fn=centers_diag_fn)\n
plt = plot_examples_2d(\n    e016, \"e016: default\",\n    e017, \"e017: cluster_offset = [20, -20]\",\n    e018, \"e018: custom clucenters function\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#lateral-dispersion-and-placement-of-point-projections-on-the-line","title":"Lateral dispersion and placement of point projections on the line","text":""},{"location":"generated/gallery/plot_2_2d_examples/#normal-projection-placement-default-proj_dist_fn-norm","title":"Normal projection placement (default): proj_dist_fn = \"norm\"","text":"
seed = 654\n
e019 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 0.0, rng=seed)\ne020 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 1.0, rng=seed)\ne021 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 3.0, rng=seed)\n
plt = plot_examples_2d(\n    e019, \"e019: lateral_disp = 0\",\n    e020, \"e020: lateral_disp = 1\",\n    e021, \"e021: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#uniform-projection-placement-proj_dist_fn-unif","title":"Uniform projection placement: proj_dist_fn = \"unif\"","text":"
seed = 654\n
e022 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 0.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne023 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 1.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne024 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 3.0, rng=seed,\n    proj_dist_fn=\"unif\")\n
plt = plot_examples_2d(\n    e022, \"e022: lateral_disp = 0\",\n    e023, \"e023: lateral_disp = 1\",\n    e024, \"e024: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#custom-projection-placement-using-the-laplace-distribution","title":"Custom projection placement using the Laplace distribution","text":"
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e025 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 0.0, rng=seed,\n    proj_dist_fn=proj_laplace)\ne026 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 1.0, rng=seed,\n    proj_dist_fn=proj_laplace)\ne027 = clugen(2, 4, 1000, [1, 0], np.pi / 2, [20, 20], 13, 2, 3.0, rng=seed,\n    proj_dist_fn=proj_laplace)\n
plt = plot_examples_2d(\n    e025, \"e025: lateral_disp = 0\",\n    e026, \"e026: lateral_disp = 1\",\n    e027, \"e027: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#controlling-final-point-positions-from-their-projections-on-the-cluster-supporting-line","title":"Controlling final point positions from their projections on the cluster-supporting line","text":""},{"location":"generated/gallery/plot_2_2d_examples/#points-on-hyperplane-orthogonal-to-cluster-supporting-line-default-point_dist_fn-n-1","title":"Points on hyperplane orthogonal to cluster-supporting line (default): point_dist_fn = \"n-1\"","text":"
seed = 1357\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e028 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed)\ne029 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne030 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    proj_dist_fn=proj_laplace)\n
plt = plot_examples_2d(\n    e028, \"e028: proj_dist_fn=\\\"norm\\\" (default)\",\n    e029, \"e029: proj_dist_fn=\\\"unif\\\"\",\n    e030, \"e030: custom proj_dist_fn (Laplace)\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#points-around-projection-on-cluster-supporting-line-point_dist_fn-n","title":"Points around projection on cluster-supporting line: point_dist_fn = \"n\"","text":"
seed = 1357\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e031 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=\"n\")\ne032 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=\"n\", proj_dist_fn=\"unif\")\ne033 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=\"n\", proj_dist_fn=proj_laplace)\n
plt = plot_examples_2d(\n    e031, \"e031: proj_dist_fn=\\\"norm\\\" (default)\",\n    e032, \"e032: proj_dist_fn=\\\"unif\\\"\",\n    e033, \"e033: custom proj_dist_fn (Laplace)\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#custom-point-placement-using-the-exponential-distribution","title":"Custom point placement using the exponential distribution","text":"

For this example we require the clupoints_n_1_template() helper function:

from pyclugen import clupoints_n_1_template\n
seed = 1357\n
# Custom point_dist_fn: final points placed using the Exponential distribution\ndef clupoints_n_1_exp(projs, lat_std, len, clu_dir, clu_ctr, rng):\n    def dist_exp(npts, lstd, rg):\n        return lstd * rg.exponential(scale=2 / lstd, size=npts)\n    return clupoints_n_1_template(projs, lat_std, clu_dir, dist_exp, rng=rng)\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e034 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=clupoints_n_1_exp)\ne035 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=clupoints_n_1_exp, proj_dist_fn=\"unif\")\ne036 = clugen(2, 5, 1500, [1, 0], np.pi / 3, [20, 20], 12, 3, 1.0, rng=seed,\n    point_dist_fn=clupoints_n_1_exp, proj_dist_fn=proj_laplace)\n
plt = plot_examples_2d(\n    e034, \"e034: proj_dist_fn=\\\"norm\\\" (default)\",\n    e035, \"e035: proj_dist_fn=\\\"unif\\\"\",\n    e036, \"e036: custom proj_dist_fn (Laplace)\")\n

"},{"location":"generated/gallery/plot_2_2d_examples/#manipulating-cluster-sizes","title":"Manipulating cluster sizes","text":"
seed = 963\n
# Custom clusizes_fn (e038): cluster sizes determined via the uniform distribution,\n# no correction for total points\ndef clusizes_unif(nclu, npts, ae, rng):\n    return rng.integers(low=1, high=2 * npts / nclu + 1, size=nclu)\n
# Custom clusizes_fn (e039): clusters all have the same size, no correction for total points\ndef clusizes_equal(nclu, npts, ae, rng):\n    return (npts // nclu) * np.ones(nclu, dtype=int)\n
# Custom clucenters_fn (all): yields fixed positions for the clusters\ndef centers_fixed(nclu, csep, coff, rng):\n    return np.array([[-csep[0], -csep[1]], [csep[0], -csep[1]], [-csep[0], csep[1]], [csep[0], csep[1]]])\n
e037 = clugen(2, 4, 1500, [1, 1], np.pi, [20, 20], 0, 0, 5, rng=seed,\n    point_dist_fn=\"n\", clucenters_fn=centers_fixed)\ne038 = clugen(2, 4, 1500, [1, 1], np.pi, [20, 20], 0, 0, 5, rng=seed,\n    point_dist_fn=\"n\", clucenters_fn=centers_fixed, clusizes_fn=clusizes_unif)\ne039 = clugen(2, 4, 1500, [1, 1], np.pi, [20, 20], 0, 0, 5, rng=seed,\n    point_dist_fn=\"n\", clucenters_fn=centers_fixed, clusizes_fn=clusizes_equal)\n
plt = plot_examples_2d(\n    e037, \"e037: normal dist. (default)\",\n    e038, \"e038: unif. dist. (custom)\",\n    e039, \"e039: equal size (custom)\")\n
"},{"location":"generated/gallery/plot_2_2d_examples/#direct-specification-of-optional-parameters","title":"Direct specification of optional parameters","text":"
seed = 123\n
e040 = clugen(2, 4, 1000, [-1, 1], 0, [0, 0], 0, 0, 0.2, rng=seed,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\", clusizes_fn=[50, 200, 500, 2000],\n    llengths_fn=[0, 2, 4, 6], clucenters_fn=[[-5, -5], [-2.5, -2.5], [0, 0], [2.5, 2.5]])\n\ne041 = clugen(2, 5, 1000, [[1, 1], [1, 0], [1, 0], [0, 1], [0, 1]],\n    0, [0, 0], 0, 0, 0.2, rng=seed,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\",\n    clusizes_fn=[200, 500, 500, 500, 500], llengths_fn=[0, 5, 5, 5, 5],\n    clucenters_fn=[[0, 0], [0, 5], [0, -5], [5, 0], [-5, 0]])\n\ne042 = clugen(2, 5, 1000, [[0, 1], [0.25, 0.75], [0.5, 0.5], [0.75, 0.25], [1, 0]],\n    0, [0, 0], 5, 0, 0.2, rng=seed,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\", clusizes_fn=[500, 500, 500, 500, 500],\n    clucenters_fn=[[-5, 0], [-3, -0.3], [-1, -0.8], [1, -1.6], [3, -2.5]])\n
plt = plot_examples_2d(\n    e040, \"e040: direct params 1\",\n    e041, \"e041: direct params 2\",\n    e042, \"e042: direct params 3\")\n

Total running time of the script: ( 0 minutes 11.459 seconds)

Download Python source code: plot_2_2d_examples.py

Download Jupyter notebook: plot_2_2d_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_3_3d_examples/","title":"Examples in 3D","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_3_3d_examples/#examples-in-3d","title":"Examples in 3D","text":"

This section contains several examples on how to generate 3D data with pyclugen. To run the examples we first need to import the clugen() function:

import numpy as np\nfrom pyclugen import clugen\n

To plot these examples we use the plot_examples_3d function:

from plot_functions import plot_examples_3d\n
"},{"location":"generated/gallery/plot_3_3d_examples/#manipulating-the-direction-of-cluster-supporting-lines","title":"Manipulating the direction of cluster-supporting lines","text":""},{"location":"generated/gallery/plot_3_3d_examples/#using-the-direction-parameter","title":"Using the direction parameter","text":"
seed = 321\n
e043 = clugen(3, 4, 500, [1, 0, 0], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\ne044 = clugen(3, 4, 500, [1, 1, 1], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\ne045 = clugen(3, 4, 500, [0, 0, 1], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\n
plt = plot_examples_3d(\n    e043, \"e043: direction = [1, 0, 0]\",\n    e044, \"e044: direction = [1, 1, 1]\",\n    e045, \"e045: direction = [0, 0, 1]\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#changing-the-angle_disp-parameter-and-using-a-custom-angle_deltas_fn-function","title":"Changing the angle_disp parameter and using a custom angle_deltas_fn function","text":"
seed = 321\n\n# Custom angle_deltas function: arbitrarily rotate some clusters by 90 degrees\ndef angdel_90_fn(nclu, astd, rng):\n    return rng.choice([0, np.pi / 2], size=nclu)\n
e046 = clugen(3, 6, 1000, [1, 0, 0], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\ne047 = clugen(3, 6, 1000, [1, 0, 0], np.pi / 8, [10, 10, 10], 15, 1.5, 0.5, rng=seed)\ne048 = clugen(3, 6, 1000, [1, 0, 0], 0, [10, 10, 10], 15, 1.5, 0.5, rng=seed,\n    angle_deltas_fn=angdel_90_fn)\n
plt = plot_examples_3d(\n    e046, \"e046: angle_disp = 0\",\n    e047, \"e047: angle_disp = \u03c0 / 8\",\n    e048, \"e048: custom angle_deltas function\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#specifying-a-main-direction-for-each-cluster-and-changing-angle_disp","title":"Specifying a main direction for each cluster and changing angle_disp","text":"
seed = 123\n\n# Define a main direction for each cluster\ndirs = [[1, 1, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0], [-1, 1, 1]]\n
e049 = clugen(3, 5, 1000, dirs, 0, np.zeros(3), 20, 0, 0.2, proj_dist_fn=\"unif\", rng=seed)\ne050 = clugen(3, 5, 1000, dirs, np.pi / 12, np.zeros(3), 20, 0, 0.2, proj_dist_fn=\"unif\", rng=seed)\ne051 = clugen(3, 5, 1000, dirs, np.pi / 4, np.zeros(3), 20, 0, 0.2, proj_dist_fn=\"unif\", rng=seed)\n
plot_examples_3d(\n    e049, \"e049: angle_disp = 0\",\n    e050, \"e050: angle_disp = \u03c0 / 12\",\n    e051, \"e051: angle_disp = \u03c0 / 4\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#manipulating-the-length-of-cluster-supporting-lines","title":"Manipulating the length of cluster-supporting lines","text":""},{"location":"generated/gallery/plot_3_3d_examples/#using-the-llength-parameter","title":"Using the llength parameter","text":"
seed = 789\n
e052 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 0, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne053 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 10, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne054 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 30, 0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\n
plt = plot_examples_3d(\n    e052, \"e052: llength = 0\",\n    e053, \"e053: llength = 10\",\n    e054, \"e054: llength = 30\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#changing-the-llength_disp-parameter-and-using-a-custom-llengths_fn-function","title":"Changing the llength_disp parameter and using a custom llengths_fn function","text":"
seed = 765\n
# Custom llengths function: line lengths tend to grow for each new cluster\ndef llen_grow_fn(nclu, llen, llenstd, rng):\n    return llen * np.arange(nclu) + rng.normal(scale=llenstd, size=nclu)\n\ne055 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 15,  0.0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne056 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 15, 10.0, 0.5, rng=seed,\n    point_dist_fn=\"n\")\ne057 = clugen(3, 5, 800, [1, 0, 0], np.pi / 10, [10, 10, 10], 10,  0.1, 0.5, rng=seed,\n    point_dist_fn=\"n\", llengths_fn=llen_grow_fn)\n
plt = plot_examples_3d(\n    e055, \"e055: llength_disp = 0.0\",\n    e056, \"e056: llength_disp = 10.0\",\n    e057, \"e057: custom llengths function\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#manipulating-relative-cluster-positions","title":"Manipulating relative cluster positions","text":""},{"location":"generated/gallery/plot_3_3d_examples/#using-the-cluster_sep-parameter","title":"Using the cluster_sep parameter","text":"
seed = 765\n
e058 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [30, 10, 10], 25, 4, 3, rng=seed)\ne059 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 30, 10], 25, 4, 3, rng=seed)\ne060 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 10, 30], 25, 4, 3, rng=seed)\n
plt = plot_examples_3d(\n    e058, \"e058: cluster_sep = [30, 10, 10]\",\n    e059, \"e059: cluster_sep = [10, 30, 10]\",\n    e060, \"e060: cluster_sep = [10, 10, 30]\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#changing-the-cluster_offset-parameter-and-using-a-custom-clucenters_fn-function","title":"Changing the cluster_offset parameter and using a custom clucenters_fn function","text":"
# Custom clucenters function: places clusters in a diagonal\ndef centers_diag_fn(nclu, csep, coff, rng):\n    return np.ones((nclu, len(csep))) * np.arange(1, nclu + 1)[:, None] * np.max(csep) + coff\n\ne061 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 10, 10], 12, 3, 2.5, rng=seed)\ne062 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 10, 10], 12, 3, 2.5, rng=seed,\n    cluster_offset=[30, -30, 30])\ne063 = clugen(3, 8, 1000, [1, 1, 1], np.pi / 4, [10, 10, 10], 12, 3, 2.5, rng=seed,\n    cluster_offset=[-40, -40, -40], clucenters_fn=centers_diag_fn)\n
plt = plot_examples_3d(\n    e061, \"e061: default\",\n    e062, \"e062: cluster_offset=[30, -30, 30]\",\n    e063, \"e063: custom clucenters function\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#lateral-dispersion-and-placement-of-point-projections-on-the-line","title":"Lateral dispersion and placement of point projections on the line","text":""},{"location":"generated/gallery/plot_3_3d_examples/#normal-projection-placement-default-proj_dist_fnnorm","title":"Normal projection placement (default): proj_dist_fn=\"norm\"","text":"
seed = 246\n
e064 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 0.0, rng=seed)\ne065 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 1.0, rng=seed)\ne066 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 3.0, rng=seed)\n
plt = plot_examples_3d(\n    e064, \"e064: lateral_disp = 0\",\n    e065, \"e065: lateral_disp = 1\",\n    e066, \"e066: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#uniform-projection-placement-proj_dist_fnunif","title":"Uniform projection placement: proj_dist_fn=\"unif\"","text":"
seed = 246\n
e067 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 0.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne068 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 1.0, rng=seed,\n    proj_dist_fn=\"unif\")\ne069 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 3.0, rng=seed,\n    proj_dist_fn=\"unif\")\n
plt = plot_examples_3d(\n    e067, \"e067: lateral_disp = 0\",\n    e068, \"e068: lateral_disp = 1\",\n    e069, \"e069: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#custom-projection-placement-using-the-laplace-distribution","title":"Custom projection placement using the Laplace distribution","text":"
seed = 246\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e070 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 0.0, rng=seed,\n    proj_dist_fn=proj_laplace)\ne071 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 1.0, rng=seed,\n    proj_dist_fn=proj_laplace)\ne072 = clugen(3, 4, 1000, [1, 0, 0], np.pi / 2, [20, 20, 20], 13, 2, 3.0, rng=seed,\n    proj_dist_fn=proj_laplace)\n
plt = plot_examples_3d(\n    e070, \"e070: lateral_disp = 0\",\n    e071, \"e071: lateral_disp = 1\",\n    e072, \"e072: lateral_disp = 3\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#controlling-final-point-positions-from-their-projections-on-the-cluster-supporting-line","title":"Controlling final point positions from their projections on the cluster-supporting line","text":""},{"location":"generated/gallery/plot_3_3d_examples/#points-on-hyperplane-orthogonal-to-cluster-supporting-line-default-point_dist_fnn-1","title":"Points on hyperplane orthogonal to cluster-supporting line (default): point_dist_fn=\"n-1\"","text":"
seed = 840\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e073 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed)\ne074 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    proj_dist_fn=\"unif\")\ne075 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    proj_dist_fn=proj_laplace)\n
plt = plot_examples_3d(\n    e073, \"e073: proj_dist_fn=\\\"norm\\\" (default)\",\n    e074, \"e074: proj_dist_fn=\\\"unif\\\"\",\n    e075, \"e075: custom proj_dist_fn (Laplace)\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#points-around-projection-on-cluster-supporting-line-point_dist_fnn","title":"Points around projection on cluster-supporting line: point_dist_fn=\"n\"","text":"
seed = 840\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n\ne076 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=\"n\")\ne077 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=\"n\", proj_dist_fn=\"unif\")\ne078 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=\"n\", proj_dist_fn=proj_laplace)\n
plt = plot_examples_3d(\n    e076, \"e076: proj_dist_fn=\\\"norm\\\" (default)\",\n    e077, \"e077: proj_dist_fn=\\\"unif\\\"\",\n    e078, \"e078: custom proj_dist_fn (Laplace)\")\n
"},{"location":"generated/gallery/plot_3_3d_examples/#custom-point-placement-using-the-exponential-distribution","title":"Custom point placement using the exponential distribution","text":"

For this example we require the clupoints_n_1_template() helper function:

from pyclugen import clupoints_n_1_template\n
seed = 840\n
# Custom point_dist_fn: final points placed using the Exponential distribution\ndef clupoints_n_1_exp(projs, lat_std, len, clu_dir, clu_ctr, rng):\n    def dist_exp(npts, lstd, rg):\n        return lstd * rg.exponential(scale=2 / lstd, size=npts)\n    return clupoints_n_1_template(projs, lat_std, clu_dir, dist_exp, rng=rng)\n
# Custom proj_dist_fn: point projections placed using the Laplace distribution\ndef proj_laplace(len, n, rng):\n    return rng.laplace(scale=len / 6, size=n)\n
e079 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=clupoints_n_1_exp)\ne080 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=clupoints_n_1_exp, proj_dist_fn=\"unif\")\ne081 = clugen(3, 5, 1500, [1, 0, 0], np.pi / 3, [20, 20, 20], 22, 3, 2, rng=seed,\n    point_dist_fn=clupoints_n_1_exp, proj_dist_fn=proj_laplace)\n
plt = plot_examples_3d(\n    e079, \"e079: proj_dist_fn=\\\"norm\\\" (default)\",\n    e080, \"e080: proj_dist_fn=\\\"unif\\\"\",\n    e081, \"e081: custom proj_dist_fn (Laplace)\")\n

"},{"location":"generated/gallery/plot_3_3d_examples/#manipulating-cluster-sizes","title":"Manipulating cluster sizes","text":"
seed = 555\n
# Custom clusizes_fn (e083): cluster sizes determined via the uniform distribution,\n# no correction for total points\ndef clusizes_unif(nclu, npts, ae, rng):\n    return rng.integers(low=1, high=2 * npts / nclu + 1, size=nclu)\n
# Custom clusizes_fn (e084): clusters all have the same size, no correction for total points\ndef clusizes_equal(nclu, npts, ae, rng):\n    return (npts // nclu) * np.ones(nclu, dtype=int)\n
# Custom clucenters_fn (all): yields fixed positions for the clusters\ndef centers_fixed(nclu, csep, coff, rng):\n    return np.array([\n        [-csep[0], -csep[1], -csep[2]],\n        [csep[0], -csep[1], -csep[2]],\n        [-csep[0], csep[1], csep[2]],\n        [csep[0], csep[1], csep[2]]])\n
e082 = clugen(3, 4, 1500, [1, 1, 1], np.pi, [20, 20, 20], 0, 0, 5, rng=seed,\n    clucenters_fn=centers_fixed, point_dist_fn=\"n\")\ne083 = clugen(3, 4, 1500, [1, 1, 1], np.pi, [20, 20, 20], 0, 0, 5, rng=seed,\n    clucenters_fn=centers_fixed, clusizes_fn=clusizes_unif, point_dist_fn=\"n\")\ne084 = clugen(3, 4, 1500, [1, 1, 1], np.pi, [20, 20, 20], 0, 0, 5, rng=seed,\n    clucenters_fn=centers_fixed, clusizes_fn=clusizes_equal, point_dist_fn=\"n\")\n
plt = plot_examples_3d(\n    e082, \"e082: normal dist. (default)\",\n    e083, \"e083: unif. dist. (custom)\",\n    e084, \"e084: equal size (custom)\")\n

Total running time of the script: ( 0 minutes 5.724 seconds)

Download Python source code: plot_3_3d_examples.py

Download Jupyter notebook: plot_3_3d_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_4_nd_examples/","title":"Examples in nD","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_4_nd_examples/#examples-in-nd","title":"Examples in nD","text":"

This section contains several examples on how to generate nD (n > 3) data with pyclugen. To run the examples we first need to import the clugen() function:

import numpy as np\nfrom pyclugen import clugen\n

To plot these examples we use the plot_examples_nd function:

from plot_functions import plot_examples_nd\n
"},{"location":"generated/gallery/plot_4_nd_examples/#5d-example-with-default-optional-arguments","title":"5D example with default optional arguments","text":"
seed = 123\n
# Number of dimensions\nnd = 5\n
e085 = clugen(nd, 6, 1500, [1, 1, 0.5, 0, 0], np.pi / 16, 30 * np.ones(nd), 30, 4, 3, rng=seed)\n
plot_examples_nd(e085, \"e085: 5D with optional parameters set to defaults\")\n
"},{"location":"generated/gallery/plot_4_nd_examples/#5d-example-with-proj_dist_fn-unif-and-point_dist_fn-n","title":"5D example with proj_dist_fn = \"unif\" and point_dist_fn = \"n\"","text":"
seed = 579\n
# Number of dimensions\nnd = 5\n
e086 = clugen(nd, 6, 1500, [0.1, 0.3, 0.5, 0.3, 0.1], np.pi / 12, 30 * np.ones(nd), 35, 5, 3.5,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\", rng=seed)\n
plot_examples_nd(e086, \"e086: 5D with proj_dist_fn=\\\"unif\\\" and point_dist_fn=\\\"n\\\"\")\n
"},{"location":"generated/gallery/plot_4_nd_examples/#4d-example-with-custom-projection-placement-using-the-beta-distribution","title":"4D example with custom projection placement using the Beta distribution","text":"
seed = 963\n
# Number of dimensions\nnd = 4\n
# Custom proj_dist_fn: point projections placed using the Beta distribution\ndef proj_beta(len, n, rng):\n    return len * rng.beta(0.1, 0.1, size=n) - len / 2\n
e087 = clugen(nd, 5, 1500, np.ones(nd), np.pi / 6, 30 * np.ones(nd), 60, 15, 6, rng=seed,\n    proj_dist_fn=proj_beta)\n
plot_examples_nd(e087, \"e087: 4D with custom proj_dist_fn (Beta)\")\n

Total running time of the script: ( 0 minutes 7.035 seconds)

Download Python source code: plot_4_nd_examples.py

Download Jupyter notebook: plot_4_nd_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_5_mrg_examples/","title":"Merging and hierarchical cluster examples","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_5_mrg_examples/#merging-and-hierarchical-cluster-examples","title":"Merging and hierarchical cluster examples","text":"

This section contains several examples on how to merge cluster data, either generated with pyclugen or from other sources. To run the examples we first need to import the clugen() and clumerge() functions:

import numpy as np\nfrom pyclugen import clugen, clumerge\n

Although it is possible to merge data in any dimension, these examples will focus on merging 2D data. Therefore, we'll use the same plot_examples_2d function used for the 2D examples:

from plot_functions import plot_examples_2d\n
"},{"location":"generated/gallery/plot_5_mrg_examples/#merging-two-data-sets-generated-with-clugen","title":"Merging two data sets generated with clugen()","text":"
seed1 = 444\nseed2 = 555\n
e088 = clugen(2, 5, 1000, [1, 1], np.pi / 12, [20, 20], 14, 1.2, 1.5, rng=seed1,\n    proj_dist_fn=\"unif\", point_dist_fn=\"n\")\ne089 = clugen(2, 3, 1500, [1, 0], 0.05, [20, 20], 0, 0, 4, rng=seed2,\n    point_dist_fn=\"n\", cluster_offset = [20, 0])\ne090 = clumerge(e088, e089)\n
plot_examples_2d(\n    e088, \"e088: data set 1\",\n    e089, \"e089: data set 2\",\n    e090, \"e090: merged data sets\")\n

In the previous example, clusters from individual data sets remain as separate clusters in the merged data set. It's also possible to maintain the original cluster labels by setting the clusters_field parameter to None:

e091 = clumerge(e088, e089, clusters_field=None)\n
plot_examples_2d(\n    e088, \"e088: data set 1\",\n    e089, \"e089: data set 2\",\n    e091, \"e091: merged data sets\")\n

"},{"location":"generated/gallery/plot_5_mrg_examples/#adding-noise-to-a-clugen-generated-data-set","title":"Adding noise to a clugen()-generated data set","text":"
seed = 333\n
prng = np.random.default_rng(seed)\ne092 = {\"points\": 120 * prng.random((500, 2)) - 60, \"clusters\": np.ones(500, dtype=np.int32)}\ne093 = clumerge(e092, e090) # clumerge(e092, e088, e089) would also work\n
plot_examples_2d(\n    e090, \"e090: original merged data sets\",\n    e092, \"e092: random uniform noise\",\n    e093, \"e093: data sets with noise\",\n    pmargin=0)\n
"},{"location":"generated/gallery/plot_5_mrg_examples/#merging-with-data-not-generated-with-clugen","title":"Merging with data not generated with clugen()","text":"

Data generated with clugen() can be merged with other data sets, for example data created with one of scikit-learn's generators:

seed = 321\n
from sklearn.datasets import make_moons\n\nX, y = make_moons(100, noise=0.05, random_state=seed)\n\ne094 = {\"points\": X, \"clusters\": y}\ne095 = clugen(2, 4, 200, [1, 1], np.pi / 12, [1, 1], 0.1, 0.01, 0.25, rng=seed,\n    proj_dist_fn = \"unif\", point_dist_fn = \"n\")\ne096 = clumerge(e094, e095)\n
plt = plot_examples_2d(\n    e094, \"e094: generated w/ make_moons()\",\n    e095, \"e095: generated w/ clugen()\",\n    e096, \"e096: merged data\")\n

We can also hierarchize clusters from different sources:

e097 = {**e094, \"hclusters\": np.ones(100, dtype=np.int32)}\ne098 = {**e095._asdict(), \"hclusters\": 2 * np.ones(200, np.int32)}\ne099 = clumerge(e097, e098, clusters_field=\"hclusters\")\n
plt = plot_examples_2d(\n    e097, \"e097: generated w/ make_moons()\",\n    e098, \"e098: generated w/ clugen()\",\n    e099, \"e099: merged data\",\n    clusters_field=\"hclusters\")\n

Total running time of the script: ( 0 minutes 3.588 seconds)

Download Python source code: plot_5_mrg_examples.py

Download Jupyter notebook: plot_5_mrg_examples.ipynb

Gallery generated by mkdocs-gallery

"},{"location":"generated/gallery/plot_functions/","title":"Plot functions","text":"

Note

Click here to download the full example code

"},{"location":"generated/gallery/plot_functions/#plot-functions","title":"Plot functions","text":"

Several auxiliary functions for plotting the examples in this documentation.

"},{"location":"generated/gallery/plot_functions/#import-the-required-libraries","title":"Import the required libraries","text":"
import os\nimport warnings\n\nimport matplotlib.pyplot as plt  # type: ignore\nimport numpy as np\nimport numpy.typing as npt\nimport pandas as pd\nimport seaborn as sns  # type: ignore\n\nfrom pyclugen import Clusters\n\n# Hide annoying warnings when building docs in CI\nif os.getenv(\"CI\") != None:\n    warnings.filterwarnings(\"ignore\")\n
"},{"location":"generated/gallery/plot_functions/#clusters2df","title":"clusters2df","text":"
def clusters2df(\n    *exs: Clusters | dict[str, npt.ArrayLike], clusters_field: str = \"clusters\"\n) -> pd.DataFrame:\n    \"\"\"Convert a sequence of clusters to a Pandas dataframe.\"\"\"\n\n    dfs = []\n    iex = 1\n\n    for ex in exs:\n        if isinstance(ex, dict):\n            points = ex[\"points\"]\n            clusters = ex[clusters_field]\n        else:\n            points = ex.points\n            clusters = ex.clusters\n\n        df = pd.DataFrame(\n            data=points, columns=[f\"x{i}\" for i in range(np.size(points, 1))]\n        )\n        df[\"cluster\"] = clusters.tolist()\n        df[\"example\"] = [iex] * clusters.size\n        dfs.append(df)\n        iex += 1\n\n    return pd.concat(dfs, ignore_index=True)\n
"},{"location":"generated/gallery/plot_functions/#get_plot_lims","title":"get_plot_lims","text":"
def get_plot_lims(df: pd.DataFrame, pmargin: float = 0.1):\n    \"\"\"Determine the plot limits for the cluster data given in `df`.\"\"\"\n\n    # Get maximum and minimum points in each dimension\n    xmaxs = df.iloc[:, :-2].max()\n    xmins = df.iloc[:, :-2].min()\n\n    # Determine plot centers in each dimension\n    xcenters = (xmaxs + xmins) / 2\n\n    # Determine plots span for all dimensions\n    sidespan = (1 + pmargin) * np.max(np.abs(xmaxs - xmins)) / 2\n\n    # Determine final plots limits\n    xmaxs = xcenters + sidespan\n    xmins = xcenters - sidespan\n\n    return xmaxs, xmins\n
"},{"location":"generated/gallery/plot_functions/#plot_examples_1d","title":"plot_examples_1d","text":"
def plot_examples_1d(*ets, ncols: int = 3, clusters_field: str = \"clusters\"):\n    \"\"\"Plot the 1D examples given in the ets parameter.\"\"\"\n\n    # Get examples\n    ex = ets[0::2]\n    # Get titles\n    et = ets[1::2]\n\n    df = clusters2df(*ex, clusters_field=clusters_field)\n\n    # Set seaborn's dark grid style\n    sns.set_theme(style=\"darkgrid\")\n\n    # Use seaborn to create the plots\n    g = sns.FacetGrid(df, col=\"example\", hue=\"cluster\", col_wrap=ncols)\n\n    # Plot the kernel density estimation plots\n    g.map(sns.kdeplot, \"x0\", multiple=\"layer\", fill=True)\n\n    # Get a flattened view of the axes array\n    g_axes = g.axes.reshape(-1)\n\n    # Determine the height of the rugs in the rug plot to 5% of total height\n    rug_height = g_axes[0].get_ylim()[1] * 0.05\n\n    # Plot the rug markers below the kde plots\n    g.map(sns.rugplot, \"x0\", height=rug_height)\n\n    # Set titles\n    for ax, t in zip(g_axes, et):\n        ax.set_title(t)\n
"},{"location":"generated/gallery/plot_functions/#plot_examples_2d","title":"plot_examples_2d","text":"
def plot_examples_2d(\n    *ets, pmargin: float = 0.1, ncols: int = 3, clusters_field: str = \"clusters\"\n):\n    \"\"\"Plot the 2D examples given in the ets parameter.\"\"\"\n\n    # Get examples\n    ex = ets[0::2]\n    # Get titles\n    et = ets[1::2]\n\n    df = clusters2df(*ex, clusters_field=clusters_field)\n\n    # Get limits in each dimension\n    xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)\n\n    # Set seaborn's dark grid style\n    sns.set_theme(style=\"darkgrid\")\n\n    # Use seaborn to create the plots\n    g = sns.FacetGrid(\n        df,\n        col=\"example\",\n        hue=\"cluster\",\n        xlim=(xmins.iloc[0], xmaxs.iloc[0]),\n        ylim=(xmins.iloc[1], xmaxs.iloc[1]),\n        aspect=1,\n        col_wrap=ncols,\n    )\n\n    g.map(sns.scatterplot, \"x0\", \"x1\", s=10)\n\n    # Set the plot titles and x, y labels\n    for ax, t in zip(g.axes, et):\n        ax.set_title(t)\n        ax.set_xlabel(\"x\")\n        ax.set_ylabel(\"y\")\n
"},{"location":"generated/gallery/plot_functions/#plot_examples_3d","title":"plot_examples_3d","text":"
def plot_examples_3d(\n    *ets,\n    pmargin: float = 0.1,\n    ncols: int = 3,\n    side=350,\n    clusters_field: str = \"clusters\",\n):\n    \"\"\"Plot the 3D examples given in the ets parameter.\"\"\"\n\n    # Get examples\n    ex = ets[0::2]\n    # Get titles\n    et = ets[1::2]\n\n    # Number of plots and number of rows in combined plot\n    num_plots = len(ex)\n    nrows = max(1, int(np.ceil(num_plots / ncols)))\n    blank_plots = nrows * ncols - num_plots\n\n    df = clusters2df(*ex, clusters_field=clusters_field)\n\n    # Get limits in each dimension\n    xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)\n\n    # Reset to default Matplotlib style, to avoid seaborn interference\n    sns.reset_orig()\n\n    # To convert inches to pixels afterwards\n    px = 1 / plt.rcParams[\"figure.dpi\"]  # pixel in inches\n\n    # Use Matplotlib to create the plots\n    _, axs = plt.subplots(\n        nrows,\n        ncols,\n        figsize=(side * px * ncols, side * px * nrows),\n        subplot_kw=dict(projection=\"3d\"),\n    )\n    axs = axs.reshape(-1)\n    for ax, e, t in zip(axs, ex, et):\n        ax.set_title(t, fontsize=10)\n        ax.set_xlim(xmins.iloc[0], xmaxs.iloc[0])\n        ax.set_ylim(xmins.iloc[1], xmaxs.iloc[1])\n        ax.set_zlim(xmins.iloc[2], xmaxs.iloc[2])\n        ax.set_xlabel(\"$x$\", labelpad=-2)\n        ax.set_ylabel(\"$y$\", labelpad=-2)\n        ax.set_zlabel(\"$z$\", labelpad=-2)\n        ax.tick_params(labelsize=8, pad=-2)\n        ax.scatter(\n            e.points[:, 0],\n            e.points[:, 1],\n            e.points[:, 2],\n            c=e.clusters,\n            depthshade=False,\n            edgecolor=\"black\",\n            linewidths=0.2,\n        )\n\n    # Remaining plots are left blank\n    for ax in axs[len(ex) : len(ex) + blank_plots]:\n        ax.set_axis_off()\n        ax.set_facecolor(color=\"white\")\n        ax.patch.set_alpha(0)\n
"},{"location":"generated/gallery/plot_functions/#plot_examples_nd","title":"plot_examples_nd","text":"
def plot_examples_nd(\n    ex: Clusters, t: str, pmargin: float = 0.1, clusters_field: str = \"clusters\"\n):\n    \"\"\"Plot the nD example given in the ex parameter.\"\"\"\n\n    # How many dimensions?\n    nd = ex.points.shape[1]\n\n    df = clusters2df(ex, clusters_field=clusters_field)\n\n    # Get limits in each dimension\n    xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)\n\n    # Set seaborn's dark grid style\n    sns.set_theme(style=\"darkgrid\")\n\n    # Create pairwise plots with nothing on the diagonal\n    g = sns.PairGrid(df.iloc[:, :-1], hue=\"cluster\", palette=\"deep\")\n    g.map_offdiag(sns.scatterplot, s=10)\n    g.figure.suptitle(t, y=1)\n\n    # Decorate plot\n    for i in range(nd):\n        for j in range(nd):\n            if i == j:\n                # Set the x labels in the diagonal plots\n                xycoord = (xmaxs.iloc[i] + xmins.iloc[i]) / 2\n                g.axes[i, i].text(\n                    xycoord, xycoord, f\"$x{i}$\", fontsize=20, ha=\"center\", va=\"center\"\n                )\n            else:\n                # Set appropriate plot intervals and aspect ratio\n                g.axes[i, j].set_xlim([xmins.iloc[j], xmaxs.iloc[j]])\n                g.axes[i, j].set_ylim([xmins.iloc[i], xmaxs.iloc[i]])\n                g.axes[i, j].set_aspect(1)\n

Total running time of the script: ( 0 minutes 0.006 seconds)

Download Python source code: plot_functions.py

Download Jupyter notebook: plot_functions.ipynb

Gallery generated by mkdocs-gallery

"}]} \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 0ea5bd1036eae6ba7109d9371e5364cae492ffaf..d876657471573744cec15426c7dcc0463b473a3a 100644 GIT binary patch delta 15 WcmdnYw3&%bzMF$XXVXSDT}A*RT?6g_ delta 15 WcmdnYw3&%bzMF%?eB(woT}A*RngjR%