-
Notifications
You must be signed in to change notification settings - Fork 4
/
chep09tmva.tex
980 lines (817 loc) · 40.2 KB
/
chep09tmva.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
% don't use 'we' form; instead use passive voice
\documentclass[a4paper]{jpconf}
%\bibliographystyle{iopart-num}
%\usepackage{citesort}
\usepackage{listings}
\usepackage{graphicx}
%basicstyle=\ttfamily \scriptsize, % the size of the fonts that are used for the code \footnotsize
\lstset{ % General settings
language=, % choose the language of the code
basicstyle= \sffamily \scriptsize, % the size of the fonts that are used for the code \footnotsize
showspaces=false, % show spaces adding particular underscores
showstringspaces=false, % underline spaces within strings
showtabs=false, % show tabs within strings adding particular underscores
frame=, % adds a frame around the code (single)
tabsize=2, % sets default tabsize to 2 spaces
captionpos=t, % sets the caption-position: top (t), bottom (b)
breaklines=true, % sets automatic line breaking
breakatwhitespace=false, % sets if automatic breaks should only happen at whitespace
escapeinside={\%*}{*)}, % if you want to add a comment within your code
caption=footnote,
label=listing:relRef
}
\newcommand{\Hplustaunu}{\mbox{${\rm H}^{\pm} \to \tau^{\pm}\nu_{\tau}$}}
\newcommand{\pT}{\mbox{${\rm p}_{\rm T}$}}
\begin{document}
\title{Ideal $\tau$ tagging with TMVA multivariate data-analysis toolkit}
\author{A Heikkinen, P Kaitaniemi, V Karim\"{a}ki,
M~J Kortelainen, T~Lamp\'{e}n, S Lehti, T Lind\'{e}n and L Wendland}
\address{Helsinki Institute of Physics, P.O. Box 64, FIN-00014 University of Helsinki, Finland}
\ead{[email protected]}
\begin{abstract}
The experience on using ROOT package TMVA for
multivariate data analysis is reported for a problem of $\tau$ tagging in the
framework of heavy charged MSSM Higgs boson searches at the LHC.
We investigate with a generator level analysis
how the $\tau$ tagging could be performed in an ideal case,
and hadronic $\tau$ decays separated from the
hadronic jets of QCD multi-jet background present in LHC experiments.
A successful separation of the Higgs signal from the background
requires a rejection factor of $10^5$ or better against the QCD background.
The $\tau$ tagging efficiency and background rejection are studied with various MVA classifiers.
\end{abstract}
\section{Introduction}
Multivariate analysis methods have become increasingly important in high energy physics.
The rare and subtle signals are hidden within voluminous data,
and their analysis can benefit from multivariate algorithms,
since taking full correlations into account can greatly increase the ability to separate signal
from background~\cite{statlearn}.
The ROOT~\cite{root} package TMVA~\cite{tmvaguide} for multivariate data analysis (MVA)
was demonstrated to be applicable to b-tagging in Ref.~\cite{chep07tmva}.
In this study, the TMVA package is applied to $\tau$ identification in the heavy charged MSSM Higgs
boson decay \Hplustaunu~$\to {\rm hadrons}$. In previous studies
conducted in the CMS experiment, this channel has been found to
provide an interesting possibility to discover the charged Higgs
boson~\cite{ptdrII}, should it exist.
One of the main challenges of finding the heavy charged Higgs boson is
that the cross-section of the largest background,
i.e.~QCD multi-jet events,
which could fake hadronically decaying $\tau$'s,
is up to 10$^7$ times greater than the signal cross-section at the 14~TeV center of mass
collision energy of the LHC.
Since the production of such large Monte Carlo samples is not
currently feasible with full detector simulation, generator level
simulation was used to obtain an estimate for the
benefit of the use of multivariate methods to separate the signal and
background.
It is estimated that a rejection factor of 10$^{5}$ is needed with
$\tau$ identification in order to make the charged Higgs boson signal
visible~\cite{ptdrII}. Therefore, the performance of the selected MVA
classifiers was evaluated for background rejection of 10$^{5}$ and 10$^{6}$,
and some programming was required to map the study into the TMVA framework
and to analyse the Monte Carlo samples.
%This article is organised as follows.
%Section~\ref{sec:tmva} introduces the TMVA toolkit for parallel multivariate data analysis,
%Section~\ref{sec:data} describes the Monte Carlo data prepared for this study,
%Section~\ref{sec:code} outlines the analysis code for TMVA,
%and Section~\ref{sec:results} gives results using various TMVA discriminators.
\section{TMVA - Toolkit for Multivariate Data Analysis with ROOT}\label{sec:tmva}
ROOT-integrated TMVA is a framework for training, testing and performance evaluation
of multivariate classification techniques.
TMVA works in transparent factory mode
to guarantee an unbiased performance comparison between the classifiers, such as:
\begin{itemize}
\item Rectangular cut optimisation
\item Projective likelihood estimation (PDE approach)
\item Multidimensional probability density estimation (PDE - range-search approach, PDERS)
\item Multidimensional k-nearest neighbour classifier
\item Function discriminant analysis (FDA)
\item Predictive learning via rule ensembles (RuleFit)
\end{itemize}
Main characteristics of different TMVA classifiers are summarised in Table~\ref{tab:characteristics}.
\begin{center}
\begin{table}[h]
\footnotesize
\caption{\label{tab:characteristics} Main characteristics of different TMVA
classifiers~\cite{tmvaguide,tmvaPhystat}.}
%\footnotesize\rm
\centering
%\begin{tabular}{@{}*{7}{l}}
\begin{tabular}{lll}
\br
Method & Pros & Cons\\
\mr
Cuts & Easy to understand & Possibly inefficient \\
Likelihood methods & Fast to train and evaluate & Non-linear
correlations treated badly \\
HMatrix, Fisher & Very fast and transparent & fail if PDFs have same
mean,\\
& & and if non-linear correlations\\
PDERS, kNN & Handles well complex class boundaries & Impractical with more
than 10 variables\\
ANN & Very good with non-linear correlations & Black box, needs tuning\\
BDT & Very good out-of-the-box performance & Needs tuning to avoid
overtraining \\
RuleFit & Like BDT but simpler, fast evaluation & Often needs some
tuning\\
SVM & Good with non-linear problems, & Not
transparent\\
& insensitive to overtraining & \\
FDA & Very good classification if boundary is known & Classification
boundary function needed\\
\br
\end{tabular}
\end{table}
\normalsize
\end{center}
\vspace{-0.5cm}
Of the available classifiers,
the following MVA methods were selected for evaluating the TMVA performance:
\begin{itemize}
\item Linear discriminant analysis (LDA) based on Fisher discriminants
\item Boosted/Bagged decision trees (BDT)
\item Support Vector Machine (SVM)
\item Artificial neural networks (ANN)
\end{itemize}
\section{Data}\label{sec:data}
The signal was generated with Pythia~\cite{pythia} version 6.4.19
through the process ${\rm gg} \to {\rm tbH^\pm}$,
\Hplustaunu\ in the maximal
${\rm m}_{\rm H}$ SUSY scenario~\cite{maxsusy} with
${\rm m}_{{\rm H}^{\pm}}=217~{\rm GeV/c}^2$ and $\tan\beta = 30$.
%(m_A = 200 GeV/c², mu=-300Gev/c², M2 = 200Gev/c², m_gluino=800Gev/c²) % comment from Sami not included
The $\tau$ leptons were forced to decay hadronically. The decay of the
$\tau$ leptons was simulated with the Tauola program~\cite{tauola}
version 2.6 to obtain correct polarization for the $\tau$ lepton and
its decay products~\cite{taupolarization}.
A total of $10^5$ and $2\times 10^5$ signal events were produced for training and evaluation of the
multivariate analyzers, respectively.
The dominating background for this physics channel is the QCD
multi-jet background. This background was generated with
Pythia version 8.108. The transverse momentum of the hadronic jets was
limited to the bin $120 < \hat{\rm p}_{\rm T} < 170$~GeV/$c$, which
has been found to be the most difficult $\hat{\rm p}_{\rm T}$
range~\cite{ptdrII}. Training and evaluation samples of $5\times 10^6$
and $10^8$ QCD multi-jet events were produced, respectively.
Another independent sample of $5\times 10^6$ events was used as a second
training sample to estimate the bias caused by the training.
The events were generated with p-p collisions at a center of mass energy
of 14~TeV. The jets were reconstructed with the PYCELL-method in a
cone of 0.5. For the signal, only the jets
corresponding to the $\tau$ decay, i.e.~$\tau$ jets,
were taken as $\tau$-jet candidates.
For the background events, all jets obtained with the PYCELL-method were taken as $\tau$-jet candidates.
In order to save CPU time and disk space, a set of preselection cuts
was applied to the $\tau$-jet candidates. These are standard cuts,
which are used for $\tau$ identification~\cite{tautagging}. Care was
taken that the preselection cuts were loose enough in order not to
bias the MVA performance. The following preselections were used:
\begin{itemize}
\item jet ${\rm E}_{\rm T} >$ 100 GeV
\item jet $|\eta|<$~2.2
\item matching of the leading track, i.e. the track with the highest
\pT, to the jet direction within a cone of 0.1
% in $(\eta$,$\phi)$ space % AH:::
% around the cone the $(\eta$,$\phi)$ axis of the leading
\item cut on the \pT\ of the leading track, \pT~$>$~20~GeV/$c$
\item charged track isolation, where at most one track was allowed in the
isolation annulus between the cones 0.04 and 0.50
around the leading track direction; tracks, which fulfilled the
following criteria were counted:
\begin{itemize}
\item $\eta$ of tracks, $|\eta|<$~2.5
\item minimum \pT\ of charged tracks,
${\rm p}_{\rm T}>{\rm p}_{\rm T}^{\rm min} =$~0.5~GeV/$c$
\item track matching to primary vertex along the beam axis, $|{\rm
IP}_{\rm z}^{\rm track}-z|<2$~mm
\end{itemize}
One or two tracks were allowed in a signal cone size of 0.04
around the leading track.
\end{itemize}
If at least one of the jets in the generated event fulfilled these
conditions, all the jets in the event were saved. For training and
evaluation of the MVA methods, the jets were required to fulfill
${\rm E}_{\rm T} >$~100~GeV, $|\eta|<$~2.4. Only the leading track was
allowed to be within the signal cone in order to select the
one-prong final state of $\tau$ decays, which dominates the hadronic
$\tau$ decay final states. In the signal samples, the jet was required
to be matched to the $\tau$ jet coming from the \Hplustaunu\ decay.
The total preselection efficiencies were found to be 17.3~\% and 0.7~\% for the
signal and background samples used for the evaluation of the MVA
methods, respectively.
%-----------------------
%In order to save CPU time and disk space, a set of preselection cuts
%was applied to the $\tau$-jet candidates. These are standard cuts,
%which are used for $\tau$ identification~\cite{tautagging}. Care was
%taken, that the preselection cuts were loose enough in order not to
%bias the MVA performance. The following preselections were used:
%\begin{itemize}
%\item jet ${\rm E}_{\rm T} >$ 100
%\item jet $|\eta|<$~2.2
%\item the leading track, i.e.~the track with the highest \pT, within a
% cone of 0.1 of the jet direction in $(\eta$,$\phi)$ space
%\item cut on the \pT\ of the leading track, \pT~$>$~20~GeV/$c$
%\item $\eta$ of tracks, $|\eta|<$~2.5
%\item minimum \pT\ of charged tracks,
% ${\rm p}_{\rm T}^{\rm min} >$~0.5~GeV/$c$
%\item track matching to primary vertex along the beam axis, $|{\rm
% IP}_{\rm z}^{\rm track}-z|<2$~mm
%\item one or two tracks in a signal cone size of 0.04 in $(\eta$,$\phi)$
% space around the leading track
%\item zero or one tracks in an isolation annulus between the signal
% cone and a cone of 0.5 in $(\eta$,$\phi)$ space around the leading track
%\end{itemize}
%The preselection efficiencies were found to be 17.3~\% and 0.7~\% for the
%signal and background samples used for the evaluation of the MVA
%methods, respectively.
%The number of tracks in the signal cone of 0.04 in $(\eta$,$\phi)$ space
%around the leading track was required to be one in order to select the
%one-prong final state of $\tau$ decays, which is dominant of the
%hadronic $\tau$ decay final states.
Most of the generally used variables for the $\tau$ identification
in the \Hplustaunu\ decay~\cite{tautagging} were used as input to the MVA methods.
%Of variables, which are standardly used for the $\tau$ identification in the \Hplustaunu\ decay~\cite{tautagging},
%the most important ones were used as input to the MVA methods.
These variables include a cut on the transverse energy and
pseudo-rapidity of the $\tau$-jet candidate, maximum track \pT\ in the
isolation annulus between cones of 0.04 and 0.50 around the leading
track direction to impose charged track isolation,
the electromagnetic energy sum in the region between cones of 0.10 and 0.50 around the jet axis,
and matching of the hadronic energy deposition to the leading track
momentum to reject electrons.
Furthermore, the ${\rm R}_\tau = {\rm p}_{\rm track} / {\rm E}_{\tau}$ variable, where
${\rm E}_{\tau}$ is the reconstructed $\tau$-jet candidate energy
(excluding neutrinos), was used to take advantage of the boost of the
$\tau$ due to polarization~\cite{taupolarization}.
The variables are summarized in Table~\ref{tab:variables}.
Figure~\ref{fig:variables} shows
the distributions of the jet ${\rm E}_{\rm T}$ and ${\rm R}_\tau$
variables, which were found to have the best separation power.
The input variables were used without transformations as well
as with decorrelation or principal component analysis applied.
\begin{table}[h]
\begin{center}
\footnotesize
\caption{\label{tab:variables}Variables used in the analysis.}
\begin{tabular}{l*{2}{l}r}
%\hline
\br
ID & Variable \\
%\hline
\mr
0 & Jet ${\rm E}_{\rm T}$ \\
1 & Jet $\eta$ \\
2 & Charged track isolation: no tracks with ${\rm p_T} > {\rm p_T^{max}}$ \\
& in isolation annulus of 0.04-0.50 \\
3 & Isolation of electromagnetic energy ($\Delta$R=0.10 - 0.50) \\
4 & Neutral hadron rejection \\
& (i.e.~track p matching to hadronic energy deposition) \\
5 & ${\rm R}_{\tau}$ = p(leading track) / E(jet) \\
%\hline
\br
\end{tabular}
\normalsize
\end{center}
\end{table}
\begin{figure}[h]
\begin{minipage}{7.8cm}
\includegraphics[width=0.9\textwidth]{images/jetet.png}
\end{minipage}
\hfill
\begin{minipage}{7.8cm}
\includegraphics[width=1.0\textwidth]{images/rtau.png}
\end{minipage}
%\begin{minipage}{3.0cm}
\caption{Example of data used in $\tau$ tagging.
Distributions of jet ${\rm E}_{\rm T}$ (left) and
${\rm R}_{\tau}$ (right) variables are shown after all preselections.}
%\end{minipage}
\label{fig:variables}
\end{figure}
%\begin{center}
%\begin{table}[h]
%\caption{\label{opt}Summary of {\sf QGSP\_\-INCL\_ABLA} physics list.}
%\footnotesize\rm
%\centering
%\begin{tabular}{@{}*{7}{l}}
%\br
%Option&Description\\
%\mr
%\verb"Al-"&Targets heavier than Aluminium.\\
%\verb"150~MeV -"&Projectile energies from $\sim$ 150 MeV up to 2.5 GeV $\sim$ 3 GeV.\\
%\br
%\end{tabular}
%\label{tab:}
%\end{table}
%\end{center}
%{\sf QGSP\_\-INCL\_ABLA} with
%\begin{figure}[h]
% \begin{minipage}{7.0cm}
%\includegraphics[width=1.0\textwidth]{images/ahCorrelationMatrixS.png}
%\end{minipage}
% \hfill
%\begin{minipage}{7.0cm}
%\includegraphics[width=1.0\textwidth]{images/ahCorrelationMatrixB.png}
%\end{minipage}
%\begin{minipage}{3.0cm}
%\caption{Right: Variable correlation matrix for signal Right: Variable correlation matrix for background}
%\end{minipage}
%\label{fig:ahCorrelationMatrix}
%\end{figure}
\section{$\tau$ tagging analysis code for TMVA }\label{sec:code}
%Since the performance of the MVA methods was to be evaluated at a
%background rejection 10$^{5}$ and 10$^{6}$,
%some analysis code was developed for the analysis.
%Firstly, since the root trees contained jets, and since a single event
%could contain several jets, a functionality to evaluate the $\tau$
%identification efficiency calculated per event was added to {\tt TMVA::Reader}.
%The algorithm is described in pseudocode in
%Appendix~A. %Listing~\ref{listing:eventcode}.
%The algorithm had to take into account also the preselection
%efficiencies of the event samples used for evaluation. Additionally,
%timing profiling with {\tt TStopwatch} was added.
%Also the evaluation of the signal efficiency at high background rejection
%level required some tuning.
%[describe here in more detail what was done]
% what is the point of the following sentence? please clarify
%Example Listing~\ref{listing:log} in Appendix A from analysis run demonstrates this analysis code.
For $\tau$-tagging it is natural to train and use the MVA methods with jets.
However, in order to obtain results which can be compared with other studies,
the evaluation of the methods should be done with respect to events.
An analysis program was therefore developed to train and evaluate the methods with TMVA in a standard way,
and in addition to re-evaluate the methods with events.
This analysis code was made for TMVA distributed with ROOT 5.22.
The event based evaluation algorithm is described in Appendix~A Listing~\ref{listing:eventcode}.
The algorithm is very similar to the evaluation algorithm in TMVA
with the exception of bookkeeping of events.
The preselection efficiencies are taken into account,
and the signal efficiencies are printed at background efficiency levels $10^{-5}$ and $10^{-6}$.
Example Listing~\ref{listing:log} in Appendix A demonstrates the output from the analysis program.
\section{Results}\label{sec:results}
In the following the usage and results of selected TMVA classifiers are presented.
The amount of data used for training varied depending on the classifier,
but for testing all available data was used.
The systematic uncertainty was estimated by repeating the full analysis with independent background data.
\subsection{Classifying with Fisher discriminant}
The method of Fisher discriminants is a computationally easy method,
which determines the discriminating function analytically in the
multivariate space represented by the input variables.
It has been used in analyses of several HEP experiments,
for instance in BaBar~\cite{fisherbabar} and Belle~\cite{fisherbelle}.
The Fisher method works in a transformed variable space with zero
linear correlations, by distinguishing the mean values of the signal
and background distributions. An axis in the (correlated) hyperspace
of the input variables is chosen so that when projecting the output
classes (signal and background) upon this axis, they are pushed as far
as possible away from each other compared to the average mutual
distance of events belonging to the same class~\cite{tmvasite}.
Fisher discriminants are optimal for Gaussian distributed variables
with linear correlations.
However, when a variable has the same sample mean for signal and background,
no discrimination is achieved.
If the shapes of the distributions are different,
a suitable transformation can then be applied~\cite{tmvasite}.
The Fisher classifier was used as an example of linear discriminant
analysis in the case of ideal $\tau$ tagging.
The following settings were used:
%Perusasetus, ei VarTransformia
%\scriptsize
\begin{verbatim}
Fisher H:!V:!Normalise:CreateMVAPdfs:Fisher:NbinsMVAPdf=50:NsmoothMVAPdf=1
\end{verbatim}
%\normalsize
%HMatrix H:!V:CreateMVAPdfs
The response of TMVA to Fisher classifier is shown in Fig.~\ref{fig:fishersvm}.
%and \ref{fig:hmatrix},
The signal efficiency was found to be $3.5\pm 0.1\;\%$ at background rejection of 10$^5$.
Signal efficiencies are also shown in Table~\ref{table:eff} together with other
discrimination methods tested.
\begin{figure}[h]
\begin{minipage}{8.0cm}
\includegraphics[width=1.0\textwidth]{images/mva_Fisher.png}
\end{minipage}
\hfill
\begin{minipage}{8.0cm}
\includegraphics[width=1.0\textwidth]{images/mk_svm_gauss2.png}
% \label{fig:mkSvmGauss2}
%\end{minipage}
\end{minipage}
%\begin{minipage}{3.0cm}
\caption{\label{fig:fishersvm}TMVA response to Fisher discriminant (left) and
the output of the SVM classifier with Gaussian kernel (right).}
%\end{minipage}
\end{figure}
%\begin{figure}[h]
%\begin{center}
%\caption{\label{fig:fisher}TMVA response to Fisher discriminant.}
%\caption{\label{fig:fisher}TMVA response to Fisher discriminant.}
%\end{center}
%\end{figure}
%\begin{figure}[h]
%\begin{center}
%\includegraphics[width=0.8\textwidth]{images/mva_HMatrix.png}
%\caption{\label{fig:hmatrix}TMVA response to H-Matrix discriminant}
%\end{center}
%\end{figure}
\subsection{Boosted Decision Trees}
Recently, Boosted Decision Trees (BDTs) have been advocated as an alternative to
artificial neural networks for particle identification~\cite{bdt}.
The BDT method is based on binary decision trees visualized in Fig.~\ref{fig:bdt}.
Repeated yes/no decisions are made on the variable with the
best separation power until the subsamples become small,
or until the subsamples are declared as signal or background.
The variable phase-space is hence divided into a large number of hypercubes,
which is why BDT is effective with both linear and non-linear samples.
%Because the splitting is always based on the variable with
%best separation, variables with small separation power can be input to
%the method without risking loss of performance.
In order to make the decision trees robust against statistical
fluctuations of the training sample, boosting, i.e. reweighting, is
applied to the training sample. After each reweighting, a new decision
tree is constructed.
The boosting thus combines iteratively many weak classifiers or hypotheses
into a single stronger rule called the combined hypothesis \cite{bdt}.
The outcome of a tested event is determined
by evaluating the decisions of typically hundreds of
trees. Overtraining is countered by pruning nodes with insignificant
separation.
The BDT was evaluated with the following setup:
\begin{verbatim}
BDT V:NTrees=400:BoostType=AdaBoost:SeparationType=GiniIndex:nEventsMin=20:
nCuts=20:PruneMethod=CostComplexity:PruneStrength=4.0
\end{verbatim}
Increasing the number of trees was found not to yield significant
improvement. Different pruning strengths were tried out in order to
determine a level at which overtraining is tolerable.
The signal efficiency was found to be 7.3$\pm$1.3~\% at the background rejection of 10$^5$.
Decorrelation and principal component analysis were tried out for the input variables,
but they were found to yield only minimal changes in the signal efficiency.
The full training samples were used to obtain the results.
\begin{figure}[h]
\begin{minipage}{9.0cm}
\includegraphics[width=0.9\textwidth]{images/bdt.png}
\end{minipage}
% \hfill
%\begin{minipage}{7.0cm}
%\end{minipage}
\begin{minipage}{6.0cm}
\caption{A visualization of the yes/no chain of decisions of a boosted
decision tree. Some of the nodes have been declared as signal or
background.}
%Data sample of 24193 events is first separated to intermediate nodes and finally to leaf nodes.}
\label{fig:bdt}
\end{minipage}
\end{figure}
%---------------------
%Recently, the Boosted Decision Trees (BDTs) have been advocated as an alternative to
%artificial neural networks for particle identification~\cite{bdt}.
%The BDT method is based on binary decision trees.
%Repeated yes/no decisions are made on the variable with the
%best separation power until the subsamples become small or until the
%subsamples are declared as signal or background. The variable
%phase-space is hence divided into a large number of hypercubes, which is
%why BDT is effective for both with linear and non-linear
%samples.
%Because the splitting is always based on the variable with
%best separation, variables with small separation power can be input to
%the method without risking loss of performance.
%In order to make the decision trees robust against statistical
%fluctuations of the training sample, boosting, i.e.~reweighting, is
%applied to the training sample. After each reweighting, a new decision
%tree is constructed.
%The boosting thus combines iteratively many weak classifiers or hypotheses
%into a single stronger rule called the combined hypothesis \cite{bdt}.
%The outcome of a tested event is carried out
%by evaluating the decisions of typically hundreds of
%trees. Overtraining is countered by pruning nodes with insignificant
%separation.
%The BDT method was evaluated with a set of 400 trees. Increasing the
%tree number was found not to yield significant improvement. The number
%of cuts was set to 20 and the pruning strength parameter of 4.0 was chosen.
%An example of a decision tree generated
%with these parameters is shown in Fig.~\ref{fig:bdt}. The signal efficiency was
%found to be 7.3$\pm$1.3\% at background rejection of 10$^5$.
%Decorrelation and principal component analysis were tried out for the
%input variables, but they were found to yield only minimal changes in
%the signal efficiency for the evaluated background rejection points 10$^5$ and 10$^6$.
% Figure of a decision tree to be added here
%AH::: add TMVA parameter rows for all classifiers
%\begin{figure}[h]
% \begin{minipage}{9.0cm}
%\begin{center}
%\includegraphics[width=0.9\textwidth]{images/bdt.png}
%\end{minipage}
% \hfill
%\begin{minipage}{7.0cm}
%\end{minipage}
%\begin{minipage}{6.0cm}
%\caption{An example of boosted decision tree.
%Data sample of 24193 events is first separated to intermediate nodes and finally to leaf nodes.}
% \label{fig:bdt}
%\end{minipage}
%\end{center}
%\end{figure}
\subsection{Support Vector Machine}
The Support Vector Machine (SVM) learning algorithm is a recent addition to MVA methods.
One of the first applications in HEP was the classification problem
of signal/background discrimination in the $t\bar{t}$ dilepton channel~\cite{svmtt}.
The SVM maps the input vectors into the feature space through some
non-linear mapping.
In this space an optimal hyperplane is constructed and evaluated by a kernel function.
%${\rm K(u,v)}$~\cite{svmintro}.
Potential advantages of the SVM method compared to the ANN method
include the existence of only a few user-chosen parameters, ability to find the global minimum, and
correspondence to a linear method which makes the SVM theoretically easy to analyse.
For $\tau$ tagging the SVM was trained with Gaussian kernel
with $4\times 10^3$ signal jets and $3.2\times 10^4$ background jets.
The SVM training time scales as
O(${\rm n}^2$), where ${\rm n}$ is the size of the training sample.
Therefore the training sample size was kept small for this method.
The Gaussian kernel has two parameters (\texttt{Sigma}, \texttt{C}) and the optimisation was done in
this parameter space with grid scan.
The training for individual points was run in parallel on a Linux cluster.
The best signal efficiency was found to be $5.6\pm 0.1\;\%$
at the background rejection of $10^5$ with the following parameters:
\begin{verbatim}
SVM_Gauss Sigma=0.5:C=17:Tol=0.001:MaxIter=20000:Kernel=Gauss
\end{verbatim}
Figures \ref{fig:fishersvm} and \ref{fig:mkSvmParallels} demonstrate the TMVA output of the SVM
classifier with the Gaussian kernel.
%The signal efficiency was found to be 5.6$\pm$0.1\% at background rejection of 10$^5$
%with Gaussian kernel, with optimised parameters $\sigma$=0.5 and C$=$17.
%Figures \ref{fig:fishersvm} and \ref{fig:mkSvmParallels} demonstrate the TMVA output of the SVM classifier.
%and in practice training took roughly one hour.
%\begin{itemize}
%\item Training 4000 signal jets, 32000 background jets
% \end{itemize}
%\item Results
% \begin{itemize}
% \item Background 1
% \begin{itemize}
% \item $10^{-5}$ bkg eff: $5.68\pm 0.12\;\%$
% \item $10^{-6}$ bkg eff: $1.66\pm 0.06\;\%$
% \end{itemize}
% \item Background 2
% \begin{itemize}
% \item $10^{-5}$ bkg eff: $5.49\pm 0.12\;\%$
% \item $10^{-6}$ bkg eff: $1.80\pm 0.07\;\%$
% \end{itemize}
%
% \end{itemize}
%\end{itemize}
%\begin{figure}[h]
% \begin{minipage}{7.0cm}
%\includegraphics[width=0.8\textwidth]{images/mk_svm_gauss2.png}
%\end{minipage}
% \hfill
%\begin{minipage}{7.0cm}
%\end{minipage}
%\begin{minipage}{3.0cm}
%\caption{The output of the SVM classifier with Gaussian kernel.}
% \label{fig:mkSvmGauss2}
%\end{minipage}
%\end{figure}
\begin{figure}[h]
\begin{center}
%\includegraphics[width=21pc]{poster/images/.png}\hspace{2pc}%
\includegraphics[width=0.9\textwidth]{images/svm_parallels2.png}
%\begin{minipage}[b]{14pc}
\caption{TMVA plot for parallel coordinates.
In this kind of plot each line, going through the values of the variables
explained in Table~\ref{tab:variables}, represents one event.
Poorly classified background events with value 0.9--1.0 are
selected from the vertical histogram on the left.
}
%Variables {\tt jetEt, jeteta, isolMaxPt, ecalIsolEt, hcalRatio}, and {\tt rtau}
%are represented with Var00-Var05 in the figure.
\label{fig:mkSvmParallels}
%\end{minipage}
\end{center}
\end{figure}
\subsection{Neural Networks}
An interesting study related to our $\tau$ tagging is presented in~\cite{tauneural},
where an artificial neural network (ANN) was trained
to choose the polarity of $\tau$ particles from the decay angles.
It was shown that the $\tau$ helicity found by the ANN
approximated well the optimal Bayesian classifier.
%obtain correct polarization for the $\tau$ lepton and
%its decay products~
%Results for MLP discriminator are shown for {\tt jetEt} transformation $\log({\rm E}_{\rm T})$ and
%Principal Component Analysis ({\tt VarTransform=PCA}).
Of the three Multilayer Perceptron (MLP) implementations
supported by TMVA,
{\tt TMVA::Types::kMLP} was selected.
A 6-15-1 MLP configuration with neurons of sigmoid type
was trained for $10^3$ cycles (see Fig.~\ref{fig:nn}) with
$10^4$ signal jets and $4\times 10^4$ background jets.
%Rest of the data was used for testing (30k signal and 2230k background jets).
A ROOT TMVA configuration for these settings can be written as follows:
\begin{verbatim}
NSigTrain=10000:NBkgTrain=40000:SplitMode=Random:NormMode=NumEvents:!V
MLP_v0 H:!V:!Normalise:NeuronType=sigmoid:NCycles=1000:HiddenLayers=N+9,N:
TestRate=5:VarTransform=PCA
\end{verbatim}
In addition to variable transformation $\log({\rm E}_{\rm T})$,
Principal Component Analysis, PCA,
(see {\tt VarTransform=PCA} above) was found to improve classification power.
In our $\tau$ tagging problem the PCA method simply performs a rotation
in the 6-dimensional orthogonal parameter space to a new coordinate system whose unit vectors
are the eigenvectors of the system.
Convergence of the neural network training and
%the background rejection vs. signal efficiency
for test data is shown in Fig.~\ref{fig:nn}.
The signal efficiency for the MLP discriminator was found to be $6.5\pm 0.1\;\%$ at
the background rejection of $10^5$.
\begin{figure}[h]
\begin{minipage}{8.5cm}
\includegraphics[width=1.0\textwidth]{images/MLPConvergenceTest.png}
\end{minipage}
% \hfill
\begin{minipage}{9.0cm}
%\includegraphics[width=1.0\textwidth]{images/roc.png}
\end{minipage}
\begin{minipage}{7.0cm}
\caption{Evolution of training and validation errors of TMVA MLP classifier during 1000 training cycles.}
\label{fig:nn}
\end{minipage}
\end{figure}
\subsection{Summary of results}
The performance of the various TMVA discriminators for the ideal $\tau$ tagging problem
is summarised in Table~\ref{table:eff}.
The overall discrimination performance of selected TMVA classifiers is
demonstrated with Receiver Operating Characteristics (ROC) curves in
Fig.~\ref{fig:roc}.
In order to take the preselection efficiencies into account,
the signal efficiency shown in Fig.~\ref{fig:roc} should be multiplied by 0.17 and
the background efficiency correspondingly by 0.007.
For example, the required $10^{-5}$ background efficiency
corresponds to 0.9986 background rejection after preselections.
\begin{table}[h]
\caption{\label{table:eff}Summary of performance of various
TMVA discriminators for the ideal $\tau$ tagging problem.}
\begin{center}
\footnotesize
%\begin{tabular}{l*{2}{l}r}
\begin{tabular}{l*{2}{l}{l}r}
\br
Discriminator & Signal efficiency (\%) & \\
& for background efficiency & \\
& $10^{-5}$ & $10^{-6}$ \\
\mr
Fisher & 3.5 $\pm$ 0.1 (stat) $\pm$ 0.0 (syst) & 1.6 $\pm$ 0.1 $\pm$ 0.1 \\
BDT & 7.3 $\pm$ 1.3 $\pm$ 0.1 & 2.6 $\pm$ 0.8 $\pm$ 0.1\\
SVM & 5.6 $\pm$ 0.1 $\pm$ 0.1 & 1.7 $\pm$ 0.1 $\pm$ 0.1 \\
MLP & 6.5 $\pm$ 0.1 $\pm$ 0.2 & 2.2 $\pm$ 0.1 $\pm$ 0.2 \\
\br
\end{tabular}
\normalsize
\end{center}
\end{table}
% BDT7.3+-1.3+-0.1, e-6 bkg: 2.6+-0.8+-0.1
\begin{figure}[h]
\begin{minipage}{8.0cm}
\includegraphics[width=1.0\textwidth]{images/mk_roc.png}
\end{minipage}
\hfill
\begin{minipage}{8.0cm}
\includegraphics[width=1.0\textwidth]{images/mk_roc_zoomed.png}
\end{minipage}
%\begin{minipage}{3.0cm}
\caption{Overall discrimination performance of selected TMVA classifiers.
On the right a closeup image of background rejection vs. signal efficiency curves is shown.}
%\caption{Overall discrimination performance of selected TMVA classifiers.
%On the right closeup image of background rejection vs. signal efficiency curves is shown.}
%Receiver Operating Characteristics (ROC) Curve
%\end{minipage}
\label{fig:roc}
\end{figure}
\section{Conclusion}
The usage of the TMVA package in ROOT for $\tau$ identification in the
framework of a charged Higgs boson study was discussed from the user's point of view.
It was observed that the TMVA package has
matured since CHEP'07 and it is now fully integrated into the ROOT toolkit.
It also provides an interface for adding new classifiers.
Some analysis code was prepared to evaluate the study case in the TMVA framework.
The multivariate data-analysis techniques were found to be promising in $\tau$ identification.
At $10^{-5}$ background efficiency,
TMVA classifiers were found to yield signal efficiencies in the range 3.5--7.3~\%.
Several methods gave comparable results,
which suggests that they are close to the Bayesian limit of achievable ideal separation.
Areas where the study can be improved have been identified.
%(e.g. using additional variables, and yet unused classifiers available in TMVA),
One possibility would be to use a more fundamental set of variables,
instead of those chosen in this study,
such as the three-momentum components ${\rm p_x}$, ${\rm p_y}$, ${\rm p_z}$ of the final state particles.
This kind of jet analysis based on neural networks has been shown to simulate the sophisticated
${\rm k_\bot}$ jet algorithm~\cite{jetanalysis}.
%Additional research on $\tau$ identification is planned.
\ack %command \ack sets the acknowledgments heading as an unnumbered section.
The work was funded partly by the
Vilho, Yrj\"o and Kalle V\"ais\"al\"a fund of the Finnish Academy of Science and Letters.
\section*{References}
\begin{thebibliography}{9}
%\bibitem{incl} A. Boudard et al., \emph{Intranuclear cascade model for
% a comprehensive description of spallation reaction data}, Phys.
% Rev. C66 (2002) 044615
%\bibitem{g4} \emph{Geant4 collaboration website} \\ {\tt http://\-cern.ch/\-geant4}
%\bibitem{pk08bProceedings}
%A. Heikkinen, P. Kaitaniemi, and A. Boudard,
%{\em Implementation of INCL4 cascade and ABLA evaporation codes in Geant4},
%Journal of Physics: Conference Series 119 (2008) 032024,
%{\sf [doi:10.1088/1742-6596/119/3/032024]}
\bibitem{statlearn} J.~Zimmermann and C.~Kiesling,
\emph{Statistical learning methods in high-energy and astrophysics analysis},
Nucl. Instr. and Meth. A 534 (2004) 204-210
\bibitem{root} R.~Brun and F.~Rademakers, \emph{ROOT - An Object Oriented Data Analysis Framework},
Nucl. Inst. and Meth. in Phys. Res. A 389 (1997) 81-86. See also {\tt http://root.cern.ch}
\bibitem{tmvaguide} A.~H\"{o}cker et al.~\emph{TMVA, Toolkit for Multivariate Data Analysis with ROOT},
CERN-OPEN-2007-007, {\tt arXiv:physics/0703039}, June 19, 2007
\bibitem{chep07tmva} T.~Lampen et al., \emph{Testing TMVA software in b-tagging
for the search of MSSM Higgs bosons at the LHC},
CHEP'07 proceedings, Journal of Physics: Conference Series 119 (2008) 032028
\bibitem{ptdrII} {CMS Collaboration}, \emph{{CMS} Physics Technical Design Report, Volume {II}:
Physics Performance}, J. Phys. G: Nucl. Part. Phys. 34 (2007) 995-1579
\bibitem{tmvaPhystat} F.~Tegenfeldt, \emph{TMVA - Toolkit for multivariate
data analysis with ROOT}, presentation at PHYSTAT-LHC Workshop on
Statistical Issues for LHC Physics, June 27--29 2007.
\bibitem{pythia} T. Sj\"ostrand et al., \emph{High-Energy-Physics
Event Generation with {PYTHIA} 6.1}, Comp. Phys. Comm. 135 (2001) 238-259
\bibitem{maxsusy} M.~Carena et al. \emph{Suggestions for Improved Benchmark Scenarios for Higgs-Boson Searches at LEP2},
{\tt arXiv:hep-ph/9912223} %\href{http://arxiv.org/pdf/hep-ph/9912223}{arXiv:hep-ph/9912223}
\bibitem{tauola} S.~Jadach et al., \emph{The $\tau$ decay library
{TAUOLA}: Version 2.4}, Comp. Phys. Comm. 76 (1993) 361-380
\bibitem{taupolarization} D.~P.~Roy, \emph{The hadronic tau decay signature of a heavy charged Higgs
boson at {LHC}}, Phys. Lett. B 459 (1999) 607-614
\bibitem{tautagging} S.~Gennai et al., \emph{Tau jet reconstruction
and tagging with {CMS}}, Eur. Phys. Jour. C 46 (2006) 1-21
\bibitem{fisherbabar} G.~Cavoto,
\emph{Measurements of Charmless Hadronic B Decays Branching Fraction at BABAR},
{\tt arXiv:\-hep-ex\-/0105018v1} (2001) 5p.
\bibitem{fisherbelle} K.~Abe et al.,
\emph{Measurement of the branching fraction for $B\rightarrow \eta' K$
and search for $B \rightarrow \eta' \pi^{+}$ },
Phys. Lett. B 517 (2001) 309-318
\bibitem{tmvasite} TMVA documentation at {\tt http://tmva.sourceforge.net}
\bibitem{bdt} B.~Roe et al., \emph{Boosted Decision Trees as an
Alternative to Artificial Neural Networks for Particle Identification},
{\tt arXiv:\-physics\-/0408124}
\bibitem{svmtt} A.~Vaiciulis, \emph{Support vector machines in analysis of top quark production},
Nucl. Instr. and Meth. A 502 (2003) 492-494
\bibitem{svmintro}N.~Cristianini and J.~Shawe-Taylor, \emph{An Introduction to Support Vector Machines},
Cambridge University Press, UK 2000
\bibitem{tauneural} L.~Garrido and V.~Gaitan,
\emph{Use of Neural Nets to Measure the Tau Polarization and its Bayesian Interpretation},
UAB-LFAE-91-04, Universitat Aut\`onoma de Barcelona, April 1991
\bibitem{jetanalysis} P.~De Felice et al.,
\emph{Jet analysis by neural networks in high energy hadron-hadron collisions},
Physics Letters B 354 (1995) 473-480
%\bibitem{htau} {CMS Collaboration}, \emph{{CMS} Physics Technical Design Report, Volume {II}:
% Physics Performance}, J. Phys. G: Nucl. Part. Phys. 34 (2007) 995-1579
%\bibitem{abla} J. Benlliure et al., \emph{Calculated nuclide
% production yields in relativistic collisions of fissile nuclei},
% Nuc. Phys. A628 (1998) 458
%\bibitem{abla1} J. J. Gaimard et al., \emph{},
% Nuc. Phys. A531 (1991) 709
%\bibitem{abla2} A. R. Junghans et al., \emph{},
% Nuc. Phys. A629 (1998) 635
%\bibitem{gsifragments} T. Enqvist et al. \emph{},
% Nucl. Phys. A686 (2001) 481
%\bibitem{g4incl} \emph{Geant4 Physics Reference Manual: INCL~4.2 Cascade and ABLA~V3 Evaporation with Fission}
%\\ {\tt http://geant4.web.cern.ch/\-geant4/\-UserDocumentation/\-UsersGuides/\-PhysicsReferenceManual/\-html/\-node185.html}
%\bibitem{data} X.Ledoux et al., \emph{Spallation Neutron Production by
% 0.8, 1.2, and 1.6 GeV Protons on Pb Targets} Phys. Rev. Lett. 82
% (1999)
%\item Strite S and Morkoc H 1992 {\it J. Vac. Sci. Technol.} B {\bf 10} 1237
\end{thebibliography}
\appendix % The command \appendix" is used to signify the start of the appendixes.
\section{Analysis code for TMVA event evaluation}
Analysis code for simulated $\tau$ tagging events is demonstrated in
Listing~\ref{listing:eventcode} and corresponding example from TMVA run is
shown in Listing~\ref{listing:log}.
\lstset{%emph={If,For,Else},emphstyle=\underbar,
language=python,
keywordstyle=\underbar,
caption=Pseudocode for the event evaluation.,
breaklines=true,
stepnumber=99999, %trick to remove line numbering
showlines=false,
label=listing:eventcode
}
\lstinputlisting{eventcode.txt}
%\newpage
\lstset{emph={Jog},emphstyle=\underbar,
language=,
caption=Example listing showing analysis code for TMVA.,
breaklines=true,
stepnumber=99999, %trick to remove line numbering
showlines=false,
label=listing:log
}
\lstinputlisting{log.txt}
%\begin{equation}
%time= money
%\end{equation}
%To obtain a simple heading of `Appendix' use the code \verb"\section*{Appendix}".
%If it contains numbered equations, figures or tables the command \verb"\appendix" should
%precede it and \verb"\setcounter{section}{1}" must follow it.
\end{document}