From e29135613d079d72961e5838dcf35888fe023a48 Mon Sep 17 00:00:00 2001 From: Tim <16023856+bayesfactor@users.noreply.github.com> Date: Sun, 21 May 2023 21:48:43 -0700 Subject: [PATCH 1/4] includes RMST, difference in RMST and confidence intervals Implements RMST, difference in RMST from the R package: https://cran.r-project.org/package=survRM2 Includes RMST of one population (point estimate and confidence interval), difference in RMST of 2 populations (point estimate, p-value, and confidence intervals). Addresses #821 --- lifelines/filename.joblib | Bin 0 -> 29484 bytes lifelines/statistics.py | 137 +++++++++++++++++++++++++++-- lifelines/tests/test_statistics.py | 25 ++++++ 3 files changed, 155 insertions(+), 7 deletions(-) create mode 100644 lifelines/filename.joblib diff --git a/lifelines/filename.joblib b/lifelines/filename.joblib new file mode 100644 index 0000000000000000000000000000000000000000..143def57845384b95f7bed9cd48853523ee43e91 GIT binary patch literal 29484 zcmeI42V7J~*Z=8V1*9r%8%;sPf-Q0s5d{TBDb~#b3wPJ;MeH3AT^lYcmZ%teFEMto z8&fp;7$q^5L}Q9E#%^ry+}$&P?TN{o|NnVEpJ&&Pd~d&V=gyruzcX`Yf?fGmTPxGg z5G<4Ai)9jpSR1X^No3k+?J${Gq)`Y(c_NigtT7sV`wi>GZpji$a`ZA;m+t+$OLXj> zxyFcz#<9k5gQHL-(um|*(~4W3UL(>;lnP<4Qf6cv4bI}>VujAMX0T2)>J5!*Zz+^3 z^TaZvfnU2NI^T3{ZFHgeZbDJMPH5g{G`PP#3HBnHiV+!;;@4KNzl8=@VXjQ1)e03N zx!B0K861Q%Q4YCRs-clESFF$~H4;UEP&ZN~HX59A^b(m)qR<)*js;?!NT*{Tu3kFL zSz|ObNT6r-XHO)@*r)eQ>e1g=Y)m&AoK<3lNG2IYvL?x#yH-{PCs|9ODL?9M^EE$F zo;lmZBri?L)EbAdnJE*=bMiz&qrpzYD1}C`p#gb*p+rFr%N471#;Jxzrfd_66uAnUZ{Psc0zdpDFH$C4>l!)Jf}8VE>egJ#2WT${YYlY#9d{|++n@VTg{~-e2CHD z_ww0!;t`~*<%qN-<6VRbaiOrMf^1_m)BjCJYD9_x_Rw^e^ia0c*_mE#s=;2X6KQnD z6f1+BR;N@MQ%BJV~;hxcP%3#B_QQQ5!tWQp0`302mw{bxWm;1f}+=reC8F z_+eC1+PNh>7&)(V?<)=M;la~W#)?&bAmOUG0dD?~nyufIG0YRv=Px?1iVuZ+_}9`U z{rzD7J^iuADlY6Ae%aG@bO4O{)#pP9h=gB$N_`UfMPn%VdVAMN%ErLl9pQ4qh6An@ zc5W9#1AyV{R_$6ZfTQxbnhKH+1@kw5y(-chuJ!%xc*iHLp@Hw<#IEo8f`iwDeb}; z7}6XLF6}3)yxj)2eeSwq?_mxUj+;Am@cD~LHHWWuTo3_{HUH{U<;8)6?T*Z9JIM{~ zeh56bGmHnE1An}T>BWQlTUV`oJ~s@$zc68=V!kI#JU*eLwqJAT6LP%q#D{@!>(`@? z)T0FO;92)Y<41+S(7Z9%J&J>X*Z26U=Sjhk*ste}(dWZq{)~2Ek`MuG@jPJ_-P|8` zek4wscq$zBZVVcKc)kGE{Cd4?gkMWwuI^Z3_iHq44+{3()t(1G`CmF!@nv&ZoEFk# zN^?Hg#LRdk?(PfM2d^_WycZ1X&JFsdf;?xZdjW$QRfR!F(7DIybKODeeQj@zF$6vy zyZ>xtZx0yilCWj<*yixvk(7J(YklC<@E_Yu+zCMYr&0p`j2n;Hb~U znN8t9mU{DoEg`VxaYU>4uY15JV~^dKJB|xWy6}Wo}QslFpnP1g#&~4%xFe z1fEqCUetX~(tlxESj1T#eBv|gLi)-OXu9{?wk5xZ!S6-?`lW`H>z*fE`hT^=4{9a} z0@t45ze!x`63p9MvR!ijR8<6YYjFz%liJbxAs+#~hrDdc$@ zy_nMTx|INa+me>M=QkcKmdm%PvKm8kL*&8SQ^8Pm+BWI=aSvFSCfD!zxGAhz{o>R8 zx5HrD2ZJgXmPWwQwwdR1$o=E(J52nb1sBeLkvyXv!-Ma)KKpgYIX=kFR%~4}Bobt8 zJC5GI&<^%&ys26|Bmfk>7o7;Y?*rRryJkk!M8LeVgR47MMZge4Zpy_Qo)9yt&1|a; zJec?5f#Gtz9}MOSqE5F60naC=p3m|LfkC#bm&{ghV3+Rm4}K|&gaVJ~;-4G@aC)Yd zo5N0jxcx=@8|?xbz@vGyy|zE|gd@ZCgDMAxf$Zs)7xJJm*eIQl)qV;G_N^Y6lf60u zcIe8k(j zwfL+%lv=NyA#iE~TiP$mdP2Vc=@c1#_JE7qf_BG_;Xv~B zlaD>Gg~H)qz7P9gd=T9AZ2IHPZUQ*ie6h;Ug#&^2MoKk5dqBAg%@fizGHsZ4q%~za zq%s{-te8$ICz$tW!#S3i1ToPqTSvDvS$1|J(sCP_Sc5|j+t9P-3fm4l3>V4t?2ZJp zImmQpdNRpu%gFR%dNZj^I+HP#$z&TZ2e$3Av|pC?%hG;X+AmA{Wof_uX8UDhjou3< zqECgSF(MkYo`|>C$|Sk0JWsJoIgw(`>My-WuVvMi9V_`svqmMavrtY1f1XGuBCQ$I zL1eIJ)j#XWARdE!nW(_D=xfqwT4A1EMXoH;iOtRgt#PWB>A*PZa|~WWokT8Xb-l1q zL`Y7bL8hT6l_j-YlQwR5SxKy?+h`>DR8 zb{n;~s2xf56tzpKy+idD)vHubQT<2tC)GLBKBsmowI8YOqV^Zn0n~1!x{umh)Lx}_ zGu4gME~omD>RYPQs6L@~8nrX2?xl7c)z?(#Qr$r9XR42>{YLdAwTr1PrMj2udTRGk z-9~jK)mzlgqdJS~eyS^|9;bFBwNI%Yqq>LM{nY-YdWz}?s=uinPIV`>&#CUEx{vB1 zs=KI;p*o7{9jgDS?xuQ_+N)HLQu~|Q$5anfJwWYWYTr|Poa$++lc?^dx`p}$sGg&G zo9bSwJE{JqI)~~h>W`qhh5AXTexv>q>OY`*i|Tu7=TqHCbuHEL)E`2171eFD-JpI2 zs;8+xhuZzr|3LjNRG(9wP5m3xpFs60)xT6nQhy57pH%l!{|@y(P(KLuA5oo4{Z3S$ zP(4QdKvbtu{}uH!P`?G$^;DlzeNXi=)o)a%QoT(5AXLv$KLhncP(KLOYt&yxbuRUb zP`ya?D%G#lk3{t@^;c1SN_8Lgmr;KS_4`mi1l5Vuzd`*6)K5qKebgU9bu0DnP`?88 zn^1j7{d&|dK>b!!zf!+b`Bc<{hK6Qey)uWikHbj=#zdl~i(+EP5^?ALJH1HuzD)zB z8EkS|*7t>ud4n&sRwp)OTeZ5Mb^V`J{gv8fcI%t>m=7WQ>UP&H{mEIimw(5Fx(l0@ ztm@s9y+V2ByXtOcFssRjGLsmgNn|rQZ_QalZLT@d9D6CXb!n=))&9QaE)@Yn5t6@P9A)U{FW)DEEhONp}_Y{>yj4p?%) zk^`0;z#J$y;KEVbEy z2mZPqtNI7@nC1Pv>-(`BXQ|IE^|__Ku+#(QdZ7HYIVY_CiXPib^q491Nv9FXgg9o` z;88#1CZ7axk+2x&Ze%!AchfSB1Y)9?XoK4;@jV6~JXuIWYqZg|fk7m4N~l+ugKUJR zp~7S+)+_y&)+yG-Zyg^TvAO*8ko524%iGiEWPAU}DeOqD6dU$0bM>7rXm1jU9lAQ) zxv9DF()FxUZ2tE3iaSke8u885_;Ouc`fO5c>tC;gjpZVJ<*f|%FU2gGMw-2nblcjI z(xjI>tzN_sGvnoWp?ETajmaoAkENbqlWU3T=X}*$#v<62YfvskSdxJEW@} zOVm#3YUgnB25gJeE_gvxpp%Q*HQR<9pVm(8MgpDG4bs&ONvMHiY#&ofj+Doicd*?)#$C1O>9~qr)3gsdh`Ul%F{`8d`w5B;I zWk&}7XS2xqAIYL@C-T^AkQ_S@Q0*zwsT-60=*i|A`O*9kv+-?=+~CX>EA!XOD9I>{ zdOhOgt9Q|e3p8Ra33()g^nDFY5|kwCC1ez1A2w=}4L|Hng5O94pO$>57}DuP!qLo| z*!<>@!fdc(Ul@r>B#1Sm6{#zF*2X1@8Lq)2U#THQQ6$NY#w8Lq*WJ9$eDPFrJ#)Y) z8Dc^PNYMpHc2acLTv&)lq6hFq*H20an~wk!^u7=pS*m7 zFP@i%`_t)eJT=GQAyMSY^m*d?F_k2n8j@f{5;tgy%4BaP4JDT|XNNR`EJ~Zv^&Dw5 zSu7>zI15d&ez3{79l2{w{Y8%hX9Go34KBDYCW&P$mK^p! z3FLGYnsZG!Ogz$Px)&SWDP%-i#yHKX!@1vUo>unmsVAQ$qcopqXhfbO_ob3#@HB5y zDC?Ckwyb!wVYphMPAOC};wk*~-#jPbw9W+(=$CsV?8rhS>cd z$|1XdD2JR$7SvYIb!~0Sy0$h}?N!(8dXt>uXX5c`s}?Qa^bEUh3(`gh-X-Kd#*{fS5HzOtD1~2~^MhNmE>}Y_oN+rAi=0{r^vuP&;1vKT#hXf2aDO?h~Z- z!8?C{oBwW=A(IIeWG*8z$&U9MohWRCAc^4x|zdkJy3 z$b>2-@p=jKB)K|_hG#ghm=v{UB~$mDjq2Gh>>@tJYY4OT7_WD*g>nIXPVOsLuXTLX zySjDl^~%SR7r_4XB7=aa+XCXK8%8_|j8d5=ER^UNvkK9T6z1y{x$K-xM9`ZSl|-o! z6-Ufw_A%K^@L?JnnhSF^VmA8moi4>1+ePGmxYUujh$2|s?^1Vv^A&?h@a9XY(y$X2 z)lJZ3&TmJPSkjF&pW{ZRrXnJpook6~V(+3)WkWj~y^<`W)Q;SjO}NyaEYu~($DACg zE1|R@`J*+T|4LGf(tB+AC2wDqP!qVXWIVooXCXtJDH4%F_?r2cV*duGruB>K0V{Gl zOV@*A_(AStemNxFzZep8wSvs!&}v#_mjw{**C4ebM^a7qKUeNP^>N-tQ2l(S%at*U z;ar8*vAo?9D85m3Q1Y$1nx0ShH>+9RV14&>p#Of_S59FS;L)h?Q^g5!HN8GPpYAU{ zP>{L1=R(*yZAzfWSy?r`KfOLZpYGqkM^aePr4Y*Lv{t5)K_>hnIq+J5&|heB>G!@~ zpj|%K^P{RF(7W8n=>XKI* z&xUvXQLk#Z3x*|0+@yy9ZI*numsi?lLjbyy^TIZ3yx#BihcQ?LizM77QCw6+a zy=%N|)klk44dTSFFVky(T;^7_AhLbQgF9WTKG1WfX7@PUS&jP}etXZY^`|}J*Clyu zztHUIfdIv}V_$xlQgt`8>Y_iVQTz%#e+%yK+wn|#>GXtnp-1kGTQ}pYCPZ!7F?*L+ z?e*#TLvg>&ok1RXzIpMQW;^l+K*#uFc>kw({c(7{68AS4CmB~3&BT|N(f7ypr{UW( z_oo%H{Z*!_+w=!J7NiZ?7Emc|#Ny>n1$W|1W&4!!gkp_`c%uK8O@Ui|lc?nSil5F8 zUl89XIXLe^ZiREY%1b6>^bWnJZNqF7wbKS_Wg3br9G(aPqXMRtAE$s_+Q!{ zNdIPg@Xp`g1X`b%<8$AoJuujii9gxi5Yk7TFUcd_&)9oH+6A@}`FGW{jk%hZ4q~6H z{=8zZ#EVBN&T`k3R9D$u8u!P>W|dX3mupm)4sWgE1dSZrb>oWo4|%N|T9lOi7Jtpj!ESNOyYc1S>2tF6=1b!SO!ek} zlk>L`$%^)E1R-S3Qn4l~N2yojk&XqrRnJbMid$(vOeM`c?8JY^2F63E(ui4QO?qVI zWI?2OgV=am&eu&o$M!|kA5zipxkw(8kECu1+H(_z+ryC&$Vg-}WOHN`G8)+e8G~$zj77FWwnnx= zwnft6BJFUU4j<}(>(svQgzN7iX-^?Kc&z}pcR_YVCLn1~P$I6=>Ve8*YKHd2{mIA_ zWG`fIWGXTZ*$3GdnU3s-?2pVq(rG2Ka6KD207*UhgK&K?atLxLQiyyXDMIETbCG#S zF)|-nfMk#oq!c*}DMQMU3ZxRLLaLD(q!vksUFmUsIFj}tjKKAg$Wh4A$T7&V$Z^Q= z$O*`a$Vte_$SKGoWHGV?ITcxooQ9l^G$3anXCh}IXCvnz%aC)CMr1j19&$c%0kQ(Q z5V;7s7`X)b0dgsF8FD$Y61f8TA#x>h6>>Fl4RS4V9dbQ#19BsB6LK?h3vw%R8*)2x z2l6B2PUOePUC7JQ^41IIY)vpz0c^ zH|X7Owc*=x(zp=b_4_v5<^nKEl7c>Q<3k3wRp#Ttd|2kWWLf005a{<{tYE-e0rW|@ z^sws|9=tf5c44(^Fg&qy{C?Fj0ogx-<1OAIcVcKO5rfOXf#Tn{#&qAFiGpQ&AehgJRBsKGT*6fO~M4R6aAC@-vapZ#B{m_wMl783A#}3Wu7y>&7&T6XI z9}N6b)rtMq0$AH~>GJW}d=MCi-&q^Xg*Lm|W@TOrhK65fD{g@RJ~_%Q5B-4)7k;Qt zmEI=vBOh_}($$Mo?o!-5lFpBlFSRWq^R9<|)4t8Vg%-S$f^?B{y;j_xCX?vqYtDoA?fmM>C&WyK@qu`*>9DQ|hbTZ9Iw_)uQ-vwQFb z4oD}@9L>z)!h;5qb2CryK(I^g;M9@}Kkt69?fXypaA;kSS51$mFlUKi`X6OnSbKQ< zf_>Ej2>fK%>OI9gxD?T9_R}mrObj2!jItAeouMLd(RL23?Xq)bixC`B)9H5G_6*>|4tL&--P;AAc0R36c^&{M!PXfIqxo>Q$A+VOrwbq}+@mP- zIUiPD&01d*!H3!F zrx~A+^7B);z>7!b$&Pzg{p}hK%pUPeN_-3-D)(QIU5w~c9KaK;=)0Z@RFoMiqT@>fn*_{vb&UGs4x|;_+%kD(2 zAI}BH`4i3ze#VD`B?XlmBz*Wi4vy-94^5=g-(TH104kl5%Z^>+L%;a5E61g9A*%dQ z;`WDR9`bp6Pt5Zn<@>XfGm1V8ffrThv)k?xz{m|A-x=QLg01{Q@e!{eXr7Ul=+~7G zlb2S7^p4}gfdj$M0m&iIvhR#(OSODhBz-T= ztD4L+?*6n>N0QE~*ES~Z$>GAopA)ZW?*zb5V@bKQO}?Jpmj_W*u{*<7b7A`&hs&Pr`Ovzrpd#Lz2WNXvDZ2lZ52G9P`_AnL zK78Ua*LxKy58p&zUhwl+E=UXRh9r{tw^xmClJ=Di`QB>$+c#Um1;5=7lBYZk1n(k2 z6YWk8oIcjmuZGOC-+or->h_1Yu=?YB`c-7!`rl8+9$Fs4hrycai#y427e0);u_G=R z4z{dX-7p{sHZAzQ%NO0a&}Va=R@8(KBM!)?@y-ZfYl}0BZx;l?dTq^xc(xqhf7s^$ znU8a`Ce6LZMByLaH`vb zJ)4FE!*>DtX7-6(2y15ND$3--%+Q#n;J^ouw4f-}Ek4|i|Gex7sh?{`Wp7QoNb>Et zhG{!91il`B_RPIY9B7-*?11cu?1X#|*%=v+1Y{Rv zS7ZXR8!{2u9hrpef$WJ)My4QpA$ucJk!i?2$iB#QWItqoWCk)5nT54nht_ z4nYn@3X$(4MaUdvE;0`(M&=_6kPK3Slp=>AWk@+vfm9+@NHtP})FO3AJ#sj*5IF)l z5;+Pv8aW0z7C8<%9ytLy5jhDt894=6ge*puAg3Zrk<*aVkp|=p|WEpZU z(ugca&O^>eEr03tVSM09zq^Q9zh;O9zz~S zmM7pst^TPSBe>ayv^IY_sb!rgV#fzt^pHglS@e)a4_Wk(MGyTgJ=BV}NAEt5Fd16T H{)_$}8x2hm literal 0 HcmV?d00001 diff --git a/lifelines/statistics.py b/lifelines/statistics.py index e7b227876..53effec39 100644 --- a/lifelines/statistics.py +++ b/lifelines/statistics.py @@ -341,6 +341,129 @@ def z(p): ) +def restricted_mean_survival_time(point_in_time, fitterA) -> pd.Series: + """ + Implements Restricted Mean Survival Time analysis on the population described in fitterA + Returns the RMST value for the population described by fitterA + https://cran.r-project.org/package=survRM2 + + Parameters + ---------- + point_in_time: float, + the point in time to analyze the survival curves at. + + fitterA: + A lifelines univariate model fitted to the data. This can be a ``KaplanMeierFitter``, ``WeibullFitter``, etc. + + Returns + ------- + + pd.Series + a pandas Series with the properties 'RMST', 'RMST_SE', 'RMST_VAR', + 'RMST_LCI', 'RMST_UCI' + + Examples + -------- + .. code:: python + T1 = [4, 5, 7, 11, 14, 20, 8, 8] + E1 = [1, 1, 1, 1, 1, 1, 1, 1] + kmf1 = KaplanMeierFitter().fit(T1, E1) + + from lifelines.statistics import restricted_mean_survival_time + results = restricted_mean_survival_time(12.0, kmf1) + + results + """ + ft = pd.DataFrame({"time": fitterA.timeline}) + data = pd.DataFrame({"time": np.array(fitterA.durations), "event": np.array(fitterA.event_observed)}) + # print(data) + ft.index = ft.time + ft["n_risk"] = [sum(data.time >= i) for i in ft.index] + ft["surv"] = fitterA.survival_function_ + + n_event = data[data.event == 1].groupby("time").count() + n_event = n_event.join(ft["time"], how="right").drop("time", axis=1) + n_event.event.fillna(0, inplace=True) + # print(n_event) + idx = ft.time <= point_in_time + + wk_time = sorted(ft.time[idx].index.tolist() + [point_in_time]) + wk_surv = ft.surv[idx] + wk_n_risk = ft.n_risk[idx] + wk_n_event = n_event[ft.time <= point_in_time] + time_diff = np.diff([0] + wk_time) + areas = time_diff * ([1] + wk_surv.tolist()) + rmst = sum(areas) + + wk_var = wk_n_event.event / (wk_n_risk * (wk_n_risk - wk_n_event.event)) + wk_var = wk_var.tolist() + [0] + rmst_var = sum((np.flip(areas[1:])).cumsum() ** 2 * np.flip(wk_var)[1:]) + rmst_se = np.sqrt(rmst_var) + z = stats.norm.ppf(1 - fitterA.alpha / 2) + out = pd.Series( + {"RMST": rmst, "RMST_SE": rmst_se, "RMST_VAR": rmst_var, "RMST_LCI": rmst - z * rmst_se, "RMST_UCI": rmst + z * rmst_se} + ) + return out + + +def difference_in_restricted_mean_survival_time(point_in_time, fitterA, fitterB) -> pd.Series: + """ + Returns difference in Restricted Mean Survival Time analysis on the populations described in fitterA and fitterB + https://cran.r-project.org/package=survRM2 + + Parameters + ---------- + point_in_time: float, + the point in time to analyze the survival curves at. + + fitterA: + A lifelines univariate model fitted to the data from one population. This can be a ``KaplanMeierFitter``, ``WeibullFitter``, etc. + + fitterB: + A lifelines univariate model fitted to the data from a comparison population. This can be a ``KaplanMeierFitter``, ``WeibullFitter``, etc. + + Returns + ------- + + pd.Series + a pandas Series with the properties 'RMST_DIFF_A_B', 'RMST_DIFF_A_B_LCI', + 'RMST_DIFF_A_B_UCI', 'RMST_DIFF_pval' + + Examples + -------- + .. code:: python + df = load_waltons() + ix = df["group"] == "miR-137" + kmf1 = KaplanMeierFitter().fit(df.loc[ix]["T"], df.loc[ix]["E"]) + kmf2 = KaplanMeierFitter().fit(df.loc[~ix]["T"], df.loc[~ix]["E"]) + + from lifelines.statistics import difference_in_restricted_mean_survival_time + results = difference_in_restricted_mean_survival_time(12.0, kmf1, kmf2) + + results + """ + wk0 = restricted_mean_survival_time(point_in_time, fitterA) + wk1 = restricted_mean_survival_time(point_in_time, fitterB) + alpha = fitterA.alpha + + z = stats.norm.ppf(1 - alpha / 2) + rmst_diff_10 = wk1.RMST - wk0.RMST + rmst_diff_10_se = np.sqrt(wk1.RMST_VAR + wk0.RMST_VAR) + rmst_diff_10_lci = rmst_diff_10 - z * rmst_diff_10_se + rmst_diff_10_uci = rmst_diff_10 + z * rmst_diff_10_se + rmst_diff_pval = stats.norm.cdf(-np.abs(rmst_diff_10) / rmst_diff_10_se) * 2 + string = "RMST_DIFF_A_B" + rmst_diff_result = pd.Series( + { + string: rmst_diff_10, + f"{string}_LCI": rmst_diff_10_lci, + f"{string}_UCI": rmst_diff_10_uci, + "RMST_DIFF_pval": rmst_diff_pval, + } + ) + return rmst_diff_result + + def survival_difference_at_fixed_point_in_time_test(point_in_time, fitterA, fitterB, **result_kwargs) -> StatisticalResult: """ Often analysts want to compare the survival-ness of groups at specific times, rather than comparing the entire survival curves against each other. @@ -438,7 +561,7 @@ def survival_difference_at_fixed_point_in_time_test(point_in_time, fitterA, fitt test_name="survival_difference_at_fixed_point_in_time_test", fitterA=fitterA, fitterB=fitterB, - **result_kwargs + **result_kwargs, ) @@ -451,7 +574,7 @@ def logrank_test( weights_A=None, weights_B=None, weightings=None, - **kwargs + **kwargs, ) -> StatisticalResult: r""" Measures and reports on whether two intensity processes are different. That is, given two @@ -667,7 +790,7 @@ def pairwise_logrank_test( t_0=t_0, name=[(g1, g2)], weightings=weightings, - **kwargs + **kwargs, ) return result @@ -835,7 +958,7 @@ def multivariate_logrank_test( assert abs(Z_j.sum()) < 10e-8, "Sum is not zero." # this should move to a test eventually. # compute covariance matrix - factor = (((n_i - d_i) / (n_i - 1)).replace([np.inf, np.nan], 1)) * d_i / n_i ** 2 + factor = (((n_i - d_i) / (n_i - 1)).replace([np.inf, np.nan], 1)) * d_i / n_i**2 n_ij["_"] = n_i.values V_ = (n_ij.mul(w_i, axis=0)).mul(np.sqrt(factor), axis="index").fillna(0) # weighted V_ V = -np.dot(V_.T, V_) @@ -923,7 +1046,7 @@ def proportional_hazard_test( def compute_statistic(times, resids, n_deaths): demeaned_times = times - times.mean() T = (demeaned_times.values[:, None] * resids.values).sum(0) ** 2 / ( - n_deaths * (fitted_cox_model.standard_errors_ ** 2) * (demeaned_times ** 2).sum() + n_deaths * (fitted_cox_model.standard_errors_**2) * (demeaned_times**2).sum() ) return T @@ -947,7 +1070,7 @@ def compute_statistic(times, resids, n_deaths): null_distribution="chi squared", degrees_of_freedom=1, model=str(fitted_cox_model), - **kwargs + **kwargs, ) else: @@ -970,6 +1093,6 @@ def compute_statistic(times, resids, n_deaths): null_distribution="chi squared", degrees_of_freedom=1, model=str(fitted_cox_model), - **kwargs + **kwargs, ) return result diff --git a/lifelines/tests/test_statistics.py b/lifelines/tests/test_statistics.py index e82bad6e2..9e16385ba 100644 --- a/lifelines/tests/test_statistics.py +++ b/lifelines/tests/test_statistics.py @@ -523,6 +523,31 @@ def test_proportional_hazard_test_with_list(): assert results.summary.shape[0] == 2 * 2 +def test_restricted_mean_survival_time_nonparametric(): + print("testing RMST") + df = load_waltons() + ix = df["group"] == "miR-137" + kmf1 = KaplanMeierFitter().fit(df.loc[ix]["T"], df.loc[ix]["E"]) + result = stats.restricted_mean_survival_time(10, kmf1) + assert np.isclose(result.RMST, 9.794, rtol=1e-2, atol=1e-3) + assert np.isclose(result.RMST_SE, 0.123, rtol=1e-2, atol=1e-3) + assert np.isclose(result.RMST_LCI, 9.553, rtol=1e-2, atol=1e-3) + assert np.isclose(result.RMST_UCI, 10.036, rtol=1e-2, atol=1e-3) + + +def test_difference_in_restricted_mean_survival_time_nonparametric(): + print("testing diff in RMST") + df = load_waltons() + ix = df["group"] == "miR-137" + kmf1 = KaplanMeierFitter().fit(df.loc[ix]["T"], df.loc[ix]["E"]) + kmf2 = KaplanMeierFitter().fit(df.loc[~ix]["T"], df.loc[~ix]["E"]) + result = stats.difference_in_restricted_mean_survival_time(10, kmf1, kmf2) + assert np.isclose(result.RMST_DIFF_A_B, 0.183, rtol=1e-2, atol=1e-3) + assert np.isclose(result.RMST_DIFF_A_B_LCI, -0.063, rtol=1e-2, atol=1e-3) + assert np.isclose(result.RMST_DIFF_A_B_UCI, 0.428, rtol=1e-2, atol=1e-3) + assert np.isclose(result.RMST_DIFF_pval, 0.145, rtol=1e-2, atol=1e-3) + + def test_survival_difference_at_fixed_point_in_time_test_nonparametric(): df = load_waltons() ix = df["group"] == "miR-137" From f447b266549ee666eef37df69469473ba5073076 Mon Sep 17 00:00:00 2001 From: Tim <16023856+bayesfactor@users.noreply.github.com> Date: Mon, 22 May 2023 09:53:50 -0700 Subject: [PATCH 2/4] Update statistics.py add check on point_in_time argument for difference in RMST --- lifelines/statistics.py | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/lifelines/statistics.py b/lifelines/statistics.py index 53effec39..25681c645 100644 --- a/lifelines/statistics.py +++ b/lifelines/statistics.py @@ -442,6 +442,43 @@ def difference_in_restricted_mean_survival_time(point_in_time, fitterA, fitterB) results """ + #check point_in_time argument validity + time_max = max(fitterA.durations.max(), fitterB.durations.max()) + time_lesser_max = min(fitterA.durations.max(), fitterB.durations.max()) + statusA_max = fitterA.event_table.censored.iloc[-1] == 0 + statusB_max = fitterB.event_table.censored.iloc[-1] == 0 + #print(statusA_max, statusB_max) + #case 1: last event in both groups is not censored + if statusA_max and statusB_max: + if point_in_time is not None: + if point_in_time > time_max: + raise ValueError(f'the point in time needs to be shorter than or equal to the largest observed time on each of the two groups: {time_max}') + else: + point_in_time = time_max + #case 2: the last observed event in the shorter arm is observed, the last observed event in the longer arm is censored + if (statusA_max==0 and statusB_max == 1 and fitterA.durations.max() >= fitterB.durations.max()) or \ + (statusA_max==1 and statusB_max == 0 and fitterB.durations.max() > fitterA.durations.max()): + if point_in_time is not None: + if point_in_time > time_max: + raise ValueError(f'The point_in_time needs to be shorter than or equal to the largest observed time on each of the two groups: {time_max}') + else: + point_in_time = time_max + #case 3: the last observed event in the shorter arm is censored, the last observed event in the longer arm is observed + if (statusA_max == 1 and statusB_max == 0 and fitterA.durations.max() >= fitterB.durations.max()) or \ + (statusA_max == 0 and statusB_max == 1 and fitterB.durations.max() > fitterA.durations.max()): + if point_in_time is not None: + if point_in_time > time_lesser_max: + raise ValueError(f'The point in time needs to be shorter than or equal to the minimum of the largest observed time on each of the two groups: {time_lesser_max}') + else: + point_in_time = time_lesser_max + #case 4: the last event in both groups is censored + if (not statusA_max) and (not statusB_max): + if point_in_time is not None: + if point_in_time > time_lesser_max: + raise ValueError(f'the point in time needs to be shorter than or equal to the minimum of the largest observed time on each of the two groups: {time_lesser_max}') + else: + point_in_time = time_lesser_max + wk0 = restricted_mean_survival_time(point_in_time, fitterA) wk1 = restricted_mean_survival_time(point_in_time, fitterB) alpha = fitterA.alpha @@ -796,10 +833,6 @@ def pairwise_logrank_test( return result -def difference_of_restricted_mean_survival_time_test(model1, model2, t): - pass - - def multivariate_logrank_test( event_durations, groups, event_observed=None, weights=None, t_0=-1, weightings=None, **kwargs ) -> StatisticalResult: # pylint: disable=too-many-locals From 107da27a637cc2ae4125666b8ea2b3e217f2c699 Mon Sep 17 00:00:00 2001 From: Tim Date: Sun, 12 Nov 2023 14:23:49 -0800 Subject: [PATCH 3/4] update rmst calculation Changes: 1. fixed an issue of returning NaN confidence intervals if the followup time `point_in_time` is greater than the last observed event. Using `.replace(np.inf, 0)`, we correct the issue and return correct confidence intervals for that edge case. 2. cosmetic refactoring to do less recalculation and rely more on pre-calculated values stored in `fitterA` --- lifelines/statistics.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/lifelines/statistics.py b/lifelines/statistics.py index 25681c645..fdf532461 100644 --- a/lifelines/statistics.py +++ b/lifelines/statistics.py @@ -375,29 +375,25 @@ def restricted_mean_survival_time(point_in_time, fitterA) -> pd.Series: results """ ft = pd.DataFrame({"time": fitterA.timeline}) - data = pd.DataFrame({"time": np.array(fitterA.durations), "event": np.array(fitterA.event_observed)}) - # print(data) ft.index = ft.time - ft["n_risk"] = [sum(data.time >= i) for i in ft.index] + ft["n_risk"] = fitterA.event_table.at_risk ft["surv"] = fitterA.survival_function_ - n_event = data[data.event == 1].groupby("time").count() - n_event = n_event.join(ft["time"], how="right").drop("time", axis=1) - n_event.event.fillna(0, inplace=True) - # print(n_event) + n_event = pd.merge(fitterA.event_table.observed, ft["time"], how='right', left_index=True, right_index=True).drop('time', axis=1) + idx = ft.time <= point_in_time wk_time = sorted(ft.time[idx].index.tolist() + [point_in_time]) wk_surv = ft.surv[idx] wk_n_risk = ft.n_risk[idx] wk_n_event = n_event[ft.time <= point_in_time] - time_diff = np.diff([0] + wk_time) - areas = time_diff * ([1] + wk_surv.tolist()) + time_diff = np.diff(wk_time) + areas = time_diff * wk_surv rmst = sum(areas) - wk_var = wk_n_event.event / (wk_n_risk * (wk_n_risk - wk_n_event.event)) - wk_var = wk_var.tolist() + [0] - rmst_var = sum((np.flip(areas[1:])).cumsum() ** 2 * np.flip(wk_var)[1:]) + wk_var = wk_n_event.observed / (wk_n_risk * (wk_n_risk - wk_n_event.observed)) + wk_var = wk_var.replace(np.inf, 0).tolist()[1:] + [0] + rmst_var = sum((np.flip(areas.values[1:])).cumsum() ** 2 * np.flip(wk_var)[1:]) rmst_se = np.sqrt(rmst_var) z = stats.norm.ppf(1 - fitterA.alpha / 2) out = pd.Series( From 514e89833e334e47d7c808da5390da9a43b954bc Mon Sep 17 00:00:00 2001 From: Tim <16023856+bayesfactor@users.noreply.github.com> Date: Thu, 23 Nov 2023 11:25:49 -0800 Subject: [PATCH 4/4] moved the event_table property into the base fitter class so all fitters get an event_table --- lifelines/fitters/__init__.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py index bb810d4b1..e076c4151 100644 --- a/lifelines/fitters/__init__.py +++ b/lifelines/fitters/__init__.py @@ -84,6 +84,23 @@ def fit_right_censoring(self, *args, **kwargs): """ return self.fit(*args, **kwargs) + @property + def event_table(self) -> t.Union[pd.DataFrame, None]: + if hasattr(self, "_event_table"): + return self._event_table + else: + if utils.CensoringType.is_right_censoring(self): + self._event_table = utils.survival_table_from_events( + self.durations, self.event_observed, self.entry, weights=self.weights + ) + else: + self._event_table = None + return self.event_table + + @event_table.setter + def event_table(self, et): + self._event_table = et + class UnivariateFitter(BaseFitter): @@ -1033,19 +1050,6 @@ def _check_bounds_initial_point_names_shape(self): "_bounds must be the same shape as _fitted_parameter_names must be the same shape as _initial_values.\n" ) - @property - def event_table(self) -> t.Union[pd.DataFrame, None]: - if hasattr(self, "_event_table"): - return self._event_table - else: - if utils.CensoringType.is_right_censoring(self): - self._event_table = utils.survival_table_from_events( - self.durations, self.event_observed, self.entry, weights=self.weights - ) - else: - self._event_table = None - return self.event_table - def survival_function_at_times(self, times, label: t.Optional[str] = None) -> pd.Series: """ Return a Pandas series of the predicted survival value at specific times.