\input{preamble.tex}
\usepackage[right=5cm, marginparwidth=4cm, marginparsep=6mm]{geometry}
\usepackage{pgfplots}
\usepackage{nccmath}
\usepackage{dsfont}
\usepackage{csquotes}
\usepackage{tabularx}
\usepackage{layouts}
\usepackage{emptypage}
\newcommand{\notimplies}{\mathrel{{\ooalign{\hidewidth$\not\phantom{=}$\hidewidth\cr$\implies$}}}}
\newcommand\Cfin{\C_{\text{fin}}}
\newcommand\eps{\varepsilon}
\newcommand\samplecomp{N_{\eps,\delta}}
\newcommand\ra{\rightarrow}
\newcommand\VCdim[1]{\operatorname{VCdim}(#1)}
\newcommand\Ldim[1]{\operatorname{Ldim}(#1)}
\newcommand\Xtwo{\prescript{X}{}{2}}
\newcommand\Ytwo{\prescript{Y}{}{2}}
\newcommand\overlinea{\overline{a}}
\newcommand\shatterfunc{\pi_\C}
\newcommand\Lshatterfunc{\rho_\C}
\newcommand\mis{\operatorname{mis}(H,f,\overline{a})}
\newcommand\err{\operatorname{err}_\mu(H,f,\overline{a})}
\newcommand\errNoA{\operatorname{err}_\mu(H,f,-)}
\newcommand\errNoANoF{\operatorname{err}_\mu(H,-,-)}
\newcommand\errNoF{\operatorname{err}_\mu(H,-,\overline{a})}
\DeclareMathOperator{\EX}{\mathbb{E}}
\DeclareMathOperator{\Var}{Var}
\newcommand\symmdiff{\bigtriangleup}
\newcommand\supp{\operatorname{supp}}
\newcommand\dens{\operatorname{dens}}
\newcommand\shelahrank{R^\phi(p)}
\newcommand\shelah[1]{R^\phi(p \land #1)}
\newcommand\Th{\operatorname{Th}}
\newcommand\Adm[2]{\operatorname{Adm}^{#1}_{#2}}
\newcommand{\tF}{\texttt{F}}
\newcommand{\tx}{\texttt{x}}
\newcommand{\ty}{\texttt{y}}
\let\HH\H
\renewcommand\H{\cH}
\renewcommand{\exp}{\mathrm{e}}
\DeclarePairedDelimiter{\ceil}{\lceil}{\rceil}
\DeclareMathOperator*{\argmax}{arg\,max}
\newcommand\ACF{\operatorname{ACF}}
\newcommand\DCF{\operatorname{DCF}}
\let\emptyset\varnothing
\let\phi\varphi
\newcommand\cFx{\cF_x}
\newcommand\cFnotx{\cF_{\overline{x}}}
\newcommand\rhox{\rho_x}
\newcommand\rhoNotx{\rho_{\overline{x}}}
\newcommand\ops{\operatorname{op}_s}
\def\outlinespacingscalar{0.8}
\newcommand{\fa}{\mathfrak{a}}
\newif\ifshowmarginnotes
%%% \showmarginnotestrue to show
\showmarginnotesfalse
%%% custom margin note for attribution
\newcommand{\contribution}[1]{%
\ifshowmarginnotes
\marginnote{\textcolor{gray}{#1}}
\fi
}
\begin{document}
\maketitle
\pagenumbering{gobble}
\section*{Acknowledgments}
My journey in mathematics began in the summer of 2020, many years after I last had to pick up pen and paper and solve an equation in high school. I had always enjoyed challenges, and the prospect of tackling difficult problems excited me. However, personal circumstances meant that I could never afford the education I wanted. It wasn't until six years after leaving my home country and a year into my new job that I finally decided to pursue what I had long desired.
Now, more than four years later, I've learned to respect and appreciate mathematics for its incredible complexity and its abstract beauty, a perspective only so many people get to experience. This journey has been incredibly challenging, many times pushing me to the absolute limits of my abilities---and beyond---and for that, I'm grateful. I've grown and hardened not only intellectually, but also personally. While I'm proud of what I've accomplished, I'm also relieved that my time in Bonn is coming to an end.
I am grateful to my employer, NRW.BANK, and my manager, Klaus-Martin Karl, for giving me the opportunity to pursue my academic dreams. It was their support that made it possible in the first place.
I am grateful to Lisanne Göbel for guiding me through many of the difficult obstacles during my studies. It is my working alliance with them that has enabled me to complete this thesis. It really means a lot to me.
Most importantly, I would like to thank my wife, Polina. Her emotional support over the past few years has been invaluable to me. She has been my emotional bedrock. Beyond her help with my studies, I am grateful for the wonderful life we share together.
\newpage\phantom{blabla}\newpage
\setcounter{tocdepth}{3}
\tableofcontents
\subsection*{Conventions}
Throughout the text we make use of shorthand notation as common in model theory. We denote
\begin{outline}
\1 the set of integers $\{0,1,\ldots,n-1\}$ as $n$,
\1 a finite number $n\in \N$ as $n<\omega$,
\1 the set of all subsets of $X$ of size $n$ as $\binom{X}{n}$,
\1 the set of all functions $f:X\ra Y$ as $\prescript{X}{}{Y}$.
\0 For example, the set of all functions $f:X \ra \{0,1\}$ is denoted as $\Xtwo$.
\1 $\log$ denotes the logarithm to the base 2,
\end{outline}
For any set $Y$ there exists only one function $f:\emptyset\ra Y$, also known as the \emph{empty function}.
%I should probably find the exact place where I use it and put it as a footnote.
\clearpage
\newpage\null\thispagestyle{empty}\newpage
\pagenumbering{arabic}
\addcontentsline{toc}{section}{\protect\numberline{}Introduction}
\section*{Introduction}
\begin{outline}
\0 This thesis explores the fascinating connections between two seemingly disparate fields of mathematics and computer science: model theory and machine learning. At first glance, these areas may appear to have little in common --- model theory is a branch of mathematical logic concerned with the formal study of mathematical structures, while machine learning focuses on algorithms that can learn and make predictions from data. However, there are deep and surprising links between fundamental concepts in these domains.
Our investigation centers on two key relationships:
\1 The connection between Probably Approximately Correct (PAC) learnability in computational learning theory and the model-theoretic notion of NIP (Non-Independence Property) formulas.
\1 The correspondence between online learnability in computational learning theory and stable formulas in model theory.
\0 These connections allow us to bridge abstract logical properties of theories with concrete learnability guarantees for concept classes. By translating between the languages of logic and learning theory, we gain new insights into the theoretical foundations of machine learning and expand our understanding of the expressiveness of logical theories. This thesis is structured in two main parts:
\0 In the first part, we prove the fundamental theorem of PAC learning, which establishes that a concept class is PAC learnable if and only if it has finite Vapnik-Chervonenkis (VC) dimension. We then introduce NIP theories and demonstrate the equivalence between finite VC dimension and the NIP property.
\0 The second part explores online learning and the Littlestone dimension as a measure of concept class complexity. We present the Standard Optimal Algorithm for online learning and prove its optimality. On the model theory side, we establish the equivalence between finite Littlestone dimension and Shelah's 2-rank. This allows us to show that stable formulas correspond to concept classes with finite Littlestone dimension.
\0 Finally, we illustrate these concepts with several examples of stable theories. These examples demonstrate how model theory provides a rich source of concrete, learnable concept classes.
\0 While a deep understanding of these areas is not required, readers are expected to have some background in key areas. In model theory, familiarity with first-order logic, languages, structures, models and theories is beneficial. Knowledge of key theorems such as the Löwenheim-Skolem theorem and the compactness theorem will be particularly helpful. In probability theory, knowledge of basic concepts including probability spaces, measures, random variables and expectation will be helpful.
\0 All contributions are marked according to the official requirements for theses in the bachelor study program Mathematik at the University of Bonn. We indicate the source material in round brackets after a theorem or lemma. Margin notes indicate the degree of personal contribution:
\1 0 indicates exact copy of the source material,
\1 1 indicates minor contribution: worked out details and improved structure and clarity of the argument,
\1 2 indicates major contribution: there was a significant amount of work done to extend and explain the argument,
\1 3 indicates original contribution.
\0 In some cases, we will provide a brief explanation of our exact contribution in the margin notes.
\end{outline}
\newpage
\section{VC dimension and NIP theories}
\subsection{PAC learning framework}
The Probably Approximately Correct (PAC) learning framework, introduced by Leslie Valiant in 1984, provides a formal foundation for analyzing machine learning problems. It offers a mathematical model to quantify when and how learning is possible, bridging the gap between computational learning theory and practical machine learning algorithms.
In the PAC framework, the learner receives a sample of labeled examples $\{(x_i,f(x_i)) : i\in n\}$, where each $x_i$ is drawn independently from $X$ according to the unknown distribution $\mu$, and $f \in \C$ is the ground truth they aim to learn. The learner's goal is to output a hypothesis $h\in\H$ that, with high probability, closely approximates the target concept $f$ on future examples drawn from the same distribution.
At its core, PAC learning addresses a fundamental question: Under what conditions can a learning algorithm reliably generalize from a finite set of examples to accurately predict outcomes on unseen data?
\begin{definition}
\contribution{2 --- Synthesized from multiple sources, independently worded.}
\begin{outline}
\0 The framework formalizes this idea by introducing several key components:
\1 An \emph{input space} $X$ is a set of all possible instances,
\1 A \emph{concept} $f$ is a binary-valued function $X\rightarrow\{0,1\}$,
\1 A \emph{concept class} $\C\subseteq \Xtwo$ is a class of concepts,
\1 A \emph{target concept} $f\in\C$ is the true function to be learned,
\1 A \emph{hypothesis} $h$ is a function, representing the learner's prediction,
\1 A \emph{hypothesis class} $\H\subseteq \Xtwo$ is a set of hypotheses $h$,
\1 A \emph{sample} $S=\{(x_1,f(x_1)),\ldots,(x_n,f(x_n))\}\subseteq X\times 2$, $n<\omega$, corresponding to the restriction of the target function $f$ to $\{x_1,\ldots,x_n\}$.
\1 A \emph{hypothesis function} or a \emph{learning function} $H:\Cfin \rightarrow \H$. It represents an algorithm or deterministic procedure which, given a sample $S$ corresponding to the restriction $f|_S\in\Cfin$, outputs a hypothesis $H(f|_S)$.
\2 $\Cfin = \{\C|_Y : Y\subseteq X, Y\text{ finite}\}$ represents all possible labeled samples from $X$.
\end{outline}
\end{definition}
\begin{remark}[Restrictions]
\label{remark:consistentRealizable}
\contribution{2 --- Synthesized from multiple sources, independently worded.}
\begin{outline}
\0 In this thesis, we focus on PAC learning with two important properties:
\1 Consistency: A hypothesis $h$ is \emph{consistent} with a labeled sample if it correctly classifies all instances in that sample. Formally, given a sample $S$ with underlying points $x_1,\ldots,x_n$, $h$ satisfies $h(x_i)=f(x_i)$ for all $i\leq n$. Similarly, a hypothesis function $H$ is \emph{consistent} if for all $f\in\C$ and all finite $S\subseteq X$ it holds that $H(f|_S)(x)=f(x)$ for all $x\in S$.
\1 Realizability: This assumes that there exists a hypothesis $h\in\H$ which perfectly classifies all instances, that is $\forall x\in S : h(x)=f(x)$.
\0 Given our focus on the realizable case, we will assume that the hypothesis class $\H$ is equal to the concept class $\C$. Consequently, we can refine our notation and sometimes write $H:\Cfin \rightarrow \C$ or $H:\Cfin \rightarrow \Xtwo$ in special cases.
\end{outline}
\end{remark}
\begin{definition}[PAC learnability]
\label{def:PAClearnability}
\contribution{2 --- Synthesized from multiple sources, independently worded.}
\begin{outline}
\0 Let $\C$ be a concept class on a set $X$. We say that $\C$ is \emph{probably approximately correct (PAC) learnable} if there exists a hypothesis function $H:\Cfin \rightarrow \Xtwo$ such that:
\1 For all $\eps, \delta \in (0,1)$, there exists a natural number $\samplecomp < \omega$ satisfying the following condition:
\1 For all $n \geq \samplecomp$, all $f \in \C$, and all probability measures $\mu$ on $X$ (with the correct sets being $\mu$-measurable),
$$\mu^n(\{\overline{a} \in X^n : \err > \eps\}) \leq \delta$$
where:
\2 $\eps\in(0,1)$ is the \emph{accuracy parameter}, specifying the acceptable error rate,
\2 $\delta\in(0,1)$ is the \emph{confidence parameter} indicating the desired probability of successful learning,
\2 $\samplecomp: (0,1)^2\rightarrow \N, (\eps,\delta) \mapsto \samplecomp$ is the \emph{sample complexity function}, which determines the minimum number of examples required to guarantee PAC learning.
\2 $\err$ is the error of the hypothesis function $H$ predicting $f$ given sample $\overline{a}=(a_1,\ldots,a_n)$, defined by
$$\mu(\{x\in X : H(f|_{\overline{a}})(x)\neq f(x)\}).$$
\0 This definition ensures that with high probability $(1-\delta)$, the learning function $H$ will produce a hypothesis that is approximately correct (up to an error of $\eps$) when the sample size is at least $\samplecomp$.
\end{outline}
\end{definition}
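To make the quantifiers in this definition concrete, the following standard illustration, worked out here for one specific choice of $\mu$ and of the hypothesis function (it is not needed for the later development), estimates the sample complexity of threshold concepts directly. Let $X=[0,1]$ with the Lebesgue measure $\mu$ and let $\C=\{\mathds{1}_{[a,1]} : a\in[0,1]\}$. Define $H$ to output $\mathds{1}_{[b,1]}$, where $b$ is the smallest sample point labeled $1$ (and $b=1$ if there is none). For a target $\mathds{1}_{[a,1]}$, the prediction disagrees with the target exactly on $[a,b)$, so $\err=b-a$. The error exceeds $\eps$ only if no sample point falls into $[a,a+\eps]$, hence
$$\mu^n\left(\left\{\overline{a}\in X^n : \err > \eps\right\}\right) \leq (1-\eps)^n \leq \exp^{-\eps n} \leq \delta \quad\text{whenever } n\geq \frac{1}{\eps}\ln\frac{1}{\delta}.$$
A similar argument, replacing $[a,a+\eps]$ by an interval of $\mu$-measure at least $\eps$ directly to the right of $a$, handles an arbitrary probability measure $\mu$, so this class is PAC learnable with $\samplecomp\leq\ceil*{\frac{1}{\eps}\ln\frac{1}{\delta}}$.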
\subsection{The fundamental theorem of PAC Learning}
\label{section:fdmThmofPAC}
The fundamental theorem of PAC learning, first proven by Blumer, Ehrenfeucht, Haussler and Warmuth in 1989, establishes that a concept class is PAC learnable if and only if it has finite VC dimension. This section presents a complete proof of this theorem.
\subsubsection{VC dimension and shatter function}
\label{subsection:1}
\begin{definition}[VC dimension]
\contribution{1 --- Synthesized from multiple sources, independently worded.}
Let $X$ be an input space and let $\C\subseteq\Xtwo$ be a concept class on $X$. For any $Y\subseteq X$:
\begin{outline}
\1 the \emph{restriction} of $\C$ to $Y$ is $\C|_Y := \{ f|_Y : f\in \C\}$.
\1 $\C$ \emph{cuts out} $Y$ from $X$ if $\exists f\in\C : f|_X = \mathds{1}_Y$.
\1 $\C$ \emph{shatters} $Y$ if $\C|_Y = \Ytwo$ or, equivalently, if $\C$ cuts out every subset of $Y$.
\0 The Vapnik-Chervonenkis (VC) dimension of $\C$, denoted $\VCdim{\C}$, is defined as $$\sup\{ |Y| : Y\subseteq X \text{ is finite and } \C \text{ shatters } Y\}.$$
If $\C$ shatters arbitrarily large finite sets, then $\VCdim{\C} = \infty$. If $\C$ shatters no set, then $\VCdim{\C} = -\infty$. A concept class $\C$ is called a \emph{VC class} if it has finite VC dimension.
\end{outline}
\end{definition}
\begin{remark}[Learning problems and set systems]
\contribution{3.}
\label{rem:dualitySetSystems}
There exists a natural correspondence between learning problems $(X,\C)$ and set systems $(X,\cF)$. Each $f\in \C$ defines a unique set $A_f\subseteq X$ where $A_f = \{ a\in X :f(a)=1\}$.
The set system perspective often simplifies proofs and combinatorial arguments, while the function-based view aligns more closely with the learning theory framework. We will sometimes use the notation $(X,\cF)$ interchangeably with $(X,\C)$ in our proofs, as this leads to more concise and intuitive arguments.
\end{remark}
Informally, the VC dimension measures the complexity or expressiveness of a concept class in relation to its domain $X$ by looking at how many points it can label arbitrarily. A higher VC dimension indicates that the concept class can represent more complex decision boundaries.
\begin{example}[Easy]
\contribution{1.}
Consider $X=\R$ and the class of threshold functions $\C = \{f_a : f_a(x)=1 \text{ if } x \geq a, f_a(x)=0 \text{ if } x < a\}$. This class has VC dimension $1$: every singleton $\{x\}$ is shattered (take $a\leq x$ for the label $1$ and $a>x$ for the label $0$), but no set of two points $x_1<x_2$ can be shattered, since the labeling $f(x_1)=1, f(x_2)=0$ cannot be realized by any threshold. Note that the definition of VC dimension requires only one set of maximum size to be shattered.
\end{example}
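At the opposite extreme, a concept class may shatter arbitrarily large finite sets; the following small illustration is included only for contrast. Let $X=\N$ and let $\C$ be the class of indicator functions of finite subsets of $\N$. Every finite $Y\subseteq\N$ is shattered, since every subset of $Y$ is itself finite and is therefore cut out by its own indicator function. Hence $\VCdim{\C}=\infty$, and by the fundamental theorem of PAC learning proved in \cref{section:fdmThmofPAC} such a class is not PAC learnable.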
It is important to note that a concept class of VC dimension $n$ need not shatter every set of cardinality $n$; the definition only requires that some set of that size is shattered.
\begin{center}
VC dimension $n$ $\notimplies$ every set of cardinality $n$ can be shattered \\
VC dimension $\geq n$ $\impliedby$ every set of cardinality $n$ can be shattered
\end{center}
\begin{example}[Intermediate]
\contribution{2.}
Consider $X=\R^2$ and the class of half-planes $\C = \{f_{a,b} : f_{a,b}(x) = 1 \text{ if } \langle a,x \rangle \geq b, f_{a,b}(x) = 0 \text{ if } \langle a, x \rangle < b \}$. Using elementary geometry (specifically, Radon's theorem), one can prove that $\C$ has VC dimension 3; a proof sketch follows this example.
\end{example}
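For the reader's convenience, we sketch how such a bound is typically verified; the details are not needed later. Three points in general position (not on a common line) are shattered: each of the $2^3$ labelings is realized by a suitable half-plane. On the other hand, by Radon's theorem any four points in $\R^2$ can be partitioned into two parts whose convex hulls intersect; a half-plane cutting out exactly one of the two parts would contain one convex hull and lie disjoint from the other, which is impossible since the hulls meet. Hence no four points are shattered and $\VCdim{\C}\leq 3$.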
\begin{remark}
\contribution{3.}
Prominent mathematician Terry Tao writes in his \href{https://terrytao.wordpress.com/2007/05/23/soft-analysis-hard-analysis-and-the-finite-convergence-principle/}{blogpost}:
\begin{quote}
In the field of analysis, it is common to make a distinction between \enquote{hard}, \enquote{quantitative}, or \enquote{finitary} analysis on one hand, and \enquote{soft}, \enquote{qualitative}, or \enquote{infinitary} analysis on the other. \enquote{Hard analysis} is mostly concerned with finite quantities (e.g. the cardinality of finite sets, the measure of bounded sets, the value of convergent integrals, the norm of finite-dimensional vectors, etc.) and their quantitative properties (in particular, upper and lower bounds). \enquote{Soft analysis}, on the other hand, tends to deal with more infinitary objects (e.g. sequences, measurable sets and functions, $\sigma$-algebras, Banach spaces, etc.) and their qualitative properties (convergence, boundedness, integrability, completeness, compactness, etc.). To put it more symbolically, hard analysis is the mathematics of $\eps$, $N$, $O()$, and $\leq$; soft analysis is the mathematics of $0$, $\infty$, $\in$, and $\to$.
\end{quote}
This distinction also characterizes the approaches to VC dimension in computer science and model theory.
In computer science, researchers typically employ a quantitative approach to VC dimension. They often seek to determine or estimate the exact VC dimension of concept classes, as this provides explicit bounds on learning complexity. Their proofs typically follow a two-step approach:
First, they show that any set of cardinality $n+1$ cannot be shattered and then they explicitly shatter a set of cardinality $n$.
In contrast, model theorists generally adopt a qualitative approach to VC dimension. Their primary concern is whether the VC dimension is finite or infinite, rather than its exact value. A model theorist might prove that a theory $T$ has finite VC dimension (equivalently, is NIP) without necessarily computing the exact VC dimension of any particular formula in $T$.
\end{remark}
The following example illustrates the \enquote{soft} approach in model theory:
\begin{example}[Hard]
\contribution{3.}
The theory of the real ordered field\footnote{A reader without a background in model theory can safely skip this example until later sections, where we discuss it in detail.} with an exponential function is o-minimal, and thus NIP. This implies that any formula $\phi$ is NIP and that the concept class uniformly defined by $\phi$ is a VC class. Such examples provide only qualitative information about learnability, without specifying the exact VC dimension.
\end{example}
% \begin{remark}[Concept classes vs. definable sets]
% \label{rmk:setsystemduality}
% Since we restrict ourselves to concept classes with values in $\{0,1\}$, we can and will sometimes identify functions with their support, since any function is defined by its support and vice versa. A model theorist would say we identify formulas with sets defined by formulas. \textcolor{red}{Should add about CS input space / concept class $X, \C$ vs logic set system two-sorted structure $X, \cF$? Relevant for Ldim equivalence part later...}
% \end{remark}
The next two theorems establish elementary properties of VC dimension. The first theorem demonstrates the monotonicity properties of VC dimension with respect to both the input space and the concept class.
The second theorem shows how the VC dimension behaves when concept classes are combined, or when their concepts are modified by simple Boolean operations.
\begin{theorem}[Basic properties I]
\label{thm:VCbasic1}
Let $\C \subseteq 2^X$ be a concept class on input space $X$ with $\VCdim{\C}=d<\infty$.
\begin{outline}
\1[(1)] Any subset of a shattered set $A$ is shattered.
\1[(2)] For any set $Y$ with $Y \subseteq X$:
$\VCdim{\C|_Y} \leq \VCdim{\C}$.
\1[(3)] For any concept classes $\C'$ with $\C' \subseteq \C$:
$\VCdim{\C'} \leq \VCdim{\C}$.
\end{outline}
\end{theorem}
\begin{proof}
\begin{outline}
\contribution{3 --- Independent proof of widely known properties, warm-up.}
\0 Let $(X,\cF)$ be the set system corresponding to $(X,\C)$ by $\cref{rem:dualitySetSystems}$.
\1[(1)] Let $B\subseteq A$. To show $B$ is shattered, we need to prove that $\forall S \subseteq B : \exists F \in \cF : F\cap B = S$. Since $B$ is a subset of $A$, $S$ is also a subset of $A$. Since $A$ is shattered by $\cF$, there exists $F\in\cF : F\cap A = S$. Now, $F\cap B = (F\cap A) \cap B = S\cap B = S$. Therefore, we have found $F\in\cF$ such that $F$ cuts out $S$. Since $S$ was arbitrary, this holds for all subsets of $B$. Thus, $B$ is shattered by $\cF$.
\1[(2)] Let $A\subseteq Y$ be any set shattered by $\cF|_Y$. We need to show that $|A| \leq d$.
For every subset $S\subseteq A$, there exists a set $(F\cap Y)\in\cF|_Y : (F\cap Y)\cap A = S$. Since $A\subseteq Y$, this implies $(F\cap Y)\cap A = F\cap A = S$. Therefore $A$ is also shattered by $\cF$. Since $\VCdim{\C} = d$, we must have $|A|\leq d$.
\1[(3)] Let $\cF' \subseteq \cF$ and let $A$ be any set shattered by $\cF'$. Since $\cF'\subseteq \cF$ this automatically implies that $A$ is shattered by $\cF$. Since $\VCdim{\C} = d$, we must have $|A|\leq d$.
\end{outline}
\end{proof}
\begin{theorem}[Basic properties II]
\label{thm:VCbasic2}
Let $X$ be an input space, $f\in\Xtwo$ a function, and $\C_1$ and $\C_2$ concept classes on $X$ of VC dimensions $n_1<\omega$ and $n_2<\omega$, respectively. Then the following classes satisfy the stated bounds on their VC dimensions:
\begin{outline}
\1[(1)] intersection $\C_\cap = \C_1 \cap \C_2$, $\VCdim{\C_\cap} \leq \max \{n_1,n_2\}$,
\1[(2)] union $\C_\cup= \C_1 \cup \C_2$, $\VCdim{\C_\cup} \geq \max \{n_1,n_2\}$,
\1[(3)] negation $\C_\lnot= \{1-f_1 : f_1\in\C_1\}$, $\VCdim{\C_\lnot} =n_1$,
\1[(4)] symmetric difference $\C_1\symmdiff f= \{|g-f| : g\in\C_1\}$, $\VCdim{\C_1\symmdiff f} = n_1$.
\end{outline}
\end{theorem}
\begin{proof}
\begin{outline}
\contribution{3 --- Independent proof of widely known properties, warm-up.}
\0 Let $(X,\cF_1), (X,\cF_2)$ be the set systems corresponding to $(X,\C_1),(X,\C_2)$. Let $(X,\cF_1\symmdiff B)$, where $B=\{x\in X : f(x)=1\}$, be the set system corresponding to $(X,\C_1\symmdiff f)$. The statements to prove correspond to operations on $\cF_1$ and $\cF_2$.
\1[(1)] Since $\C_\cap \subseteq \C_1$, \cref{thm:VCbasic1}(3) gives $\VCdim{\C_\cap} \leq n_1 \leq \max \{n_1,n_2\}$.
\1[(2)] Since $\C_1 \subseteq \C_\cup$ and $\C_2 \subseteq \C_\cup$, \cref{thm:VCbasic1}(3) gives $\VCdim{\C_\cup} \geq \max \{n_1,n_2\}$.
\1[(3)] Let $A\subseteq X$ of cardinality $n_1$ be shattered by $\C_1$. For every subset $S\subseteq A$, $\exists F\in \cF_1 : F\cap A = S$. This implies $\forall (A\setminus S)\subseteq A, \exists F \in \cF_1 : A \setminus (F\cap A) = A\setminus F = A\setminus S$. Since each $S$ is in one-to-one correspondence with $(A\setminus S)$, $\VCdim{\C_\lnot} = n_1$.
\1[(4)] Let $A\subseteq X$ of cardinality $n_1$ be shattered by $\C_1$. The proof is a character-building exercise in basic set theory. We show that two sets $F_1,F_2\in\cF_1$ cut out the same subset from $A$ if and only if $F_1\symmdiff B$ and $F_2\symmdiff B$ cut out the same subset from $A$.
\2 For any $F_1,F_2 \in \cF_1$, we have:
\begin{align*}
F_1 \cap A = F_2 \cap A
\iff & F_1 \cap B \cap A = F_2 \cap B \cap A \text{ and } \\
& F_1\cap (X\setminus B)\cap A = F_2\cap (X\setminus B)\cap A
\end{align*}
This equivalence holds because $A$ can be partitioned into $A\cap B$ and $A\cap (X\setminus B)$.
\begin{align*}
\iff & (X\setminus F_1) \cap B \cap A = (X\setminus F_2) \cap B \cap A \text{ and } \\
& F_1 \cap (X\setminus B) \cap A = F_2 \cap (X\setminus B) \cap A
\end{align*}
This equivalence holds because membership in $F_1$ completely determines membership in $X\setminus F_1$.
\begin{align*}
\iff &
\Bigl[\left((X\setminus F_1)\cap B\right) \cup \left(F_1\cap (X\setminus B)\right)\Bigr] \cap A = \\
& \Bigl[\left((X\setminus F_2)\cap B\right) \cup \left(F_2\cap (X\setminus B)\right)\Bigr] \cap A
\end{align*}
This step combines the two conditions using set union. This equivalence holds because the sets $(X\setminus F_1)\cap B$ and $F_1\cap (X\setminus B)$ are disjoint (and similarly for $F_2$).
\begin{align*}
\iff &
(F_1\symmdiff B)\cap A = (F_2\symmdiff B)\cap A.
\end{align*}
The final step uses the definition of symmetric difference.
\2 Therefore, the map $F\mapsto F\symmdiff B$ induces a bijection between the subsets of $A$ cut out by $\cF_1$ and those cut out by $\cF_1\symmdiff B$. In particular, $(\C_1\symmdiff f)$ shatters $A$ as well, so $\VCdim{\C_1\symmdiff f}\geq n_1$. Applying the same argument to $(\C_1\symmdiff f)$ and $f$ (note that $(\C_1\symmdiff f)\symmdiff f=\C_1$) yields the reverse inequality, hence $\VCdim{\C_1\symmdiff f} = n_1$.
% You can align multiples align* with a table: (looks kinda ugly though)
% \begin{tabularx}{\linewidth}{l@{}c@{}X}
% $F_1 \cap A = F_2 \cap A$ & $\iff$ & $F_1 \cap B \cap A = F_2 \cap B \cap A$ and \\
% & & $F_1\cap (X\setminus B)\cap A = F_2\cap (X\setminus B)\cap A $ \\
% \multicolumn{3}{l}{This equivalence holds because $A$ can be partitioned into $A\cap B$ and} \\
% \multicolumn{3}{l}{$A\cap (X\setminus B)$.} \\
% & $\iff$ & $(X\setminus F_1) \cap B \cap A = (X\setminus F_2) \cap B \cap A$ and\\
% & & $F_1 \cap (X\setminus B) \cap A = F_2 \cap (X\setminus B) \cap A$ \\
% \multicolumn{3}{l}{This step uses the fact that $F\cap B = X\setminus ( (X\setminus F)\cap B)$ for any $F$.} \\
% & $\iff$ &
% \end{tabularx}
% \2 To show that $\VCdim{\C} \leq \VCdim{\C\symmdiff f}$ it suffices to prove that $\pi$ is injective. $S'=(S\setminus B) \cup ((B\setminus S)\cap A)$ satisfies $\pi(S')=S$.
% \setlength\delimitershortfall{-1pt}
% \begin{align*}
% S'\symmdiff B & = \left(S'\setminus B\right) \cup
% \left(B\setminus S'\right) \\
% & = \left(\left(\left(S\setminus B\right) \cup \left(\left(B\setminus S\right)\cap A\right)\right)\setminus B\right) \cup \left((B\setminus \left((S\setminus B) \cup \left(\left(B\setminus S\right)\cap A\right)\right)\right)\\
% & = \left( S \setminus B\right) \cup \left(B \cap \left(B\setminus\left(B\setminus S\right) \cap A \right)\right) \\
% & = \left( S \setminus B\right) \cup B \\
% & = S.
% \end{align*}
% \2 To show $\VCdim{\C\symmdiff f} \leq n_1$, note that for any $G \in \C\symmdiff f$, there exists $F \in \C_1$ such that $G = F \symmdiff B$. If a set $A'$ is shattered by $\C\symmdiff f$, then for any $S' \subseteq A'$, there exists $F \in \C_1$ such that $(F \symmdiff B) \cap A' = S'$. This implies $F \cap A' = S' \symmdiff (B \cap A')$, which means $A'$ is shattered by $\C_1$. Therefore, $|A'| \leq n_1$.
% Combining both inequalities, we conclude $\VCdim{\C\symmdiff f} = n_1$.
\end{outline}
\end{proof}
While VC dimension considers all possible subsets of $X$, it is often useful to analyze how the concept class behaves on subsets of specific sizes. This allows us to examine how the \enquote{shattering power} of the concept class grows as we increase the size of the subsets we consider.
\begin{definition}[Shatter function]
\contribution{0.}
\label{def:shatterfunc}
Define the \emph{shatter function} $\shatterfunc(m):\N \ra \N$ as
$$\shatterfunc(m):= \max \left\{|\C_Y| : Y \in \binom{X}{m}\right\}.$$
\end{definition}
\begin{lemma}[Sauer-Shelah, 1972]
\marginnote[0cm]{The function $\Phi_n(m)$ counts the number of subsets of an $m$-element set that have size less than or equal to $n$. In other words, it represents the number of subsets of $\{1,2,\ldots,m\}$ with cardinality at most $n$.}
\label{lemma:Sauer-Shelah}
Define $\Phi_n(m):=\sum^n_{i=0}\binom{m}{i}$. If $\C$ has VC dimension $n$ and $m>n$, then $\shatterfunc(m) \leq \Phi_n(m)$.
\end{lemma}
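As a quick numeric illustration of the bound: a concept class of VC dimension $2$ can realize at most $\Phi_2(4)=\binom{4}{0}+\binom{4}{1}+\binom{4}{2}=1+4+6=11$ of the $2^4=16$ possible labelings of any four points.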
There exist numerous proofs and versions of this lemma, which has important applications in model theory, graph theory, computational geometry and other disciplines. We give a proof using the \enquote{shifting} technique commonly used in extremal set theory.
% https://cse.buffalo.edu/~hungngo/classes/2010/711/lectures/sauer.pdf
\begin{proof}[Lemma 1.5, \cite{Chernikov}]
\begin{outline}
\contribution{1 --- Improved proof structure and clarity.}
\0 We proceed by contradiction. Fix some $m>n$ and suppose $\shatterfunc(m) > \Phi_n(m)$.
\1 By \cref{def:shatterfunc}, there exists $Y\subseteq X$ with $|Y|=m$ such that $|\C|_Y| = \shatterfunc(m) > \Phi_n(m)$. Since the claim depends only on the restriction $\C|_Y$, we can without loss of generality replace $(X,\C)$ by $(Y,\C|_Y)$ and assume that:
\2 $\C$ is finite with $|\C| = \shatterfunc(m)$,
\2 $X=\{x_1,\ldots,x_m\}$ with $|X|=m$.
\0 We construct a sequence of concept subclasses $\C_0,\ldots,\C_m$ of $\C$ using a \enquote{shifting} operation.
\1 Let $\C_0 := \C$.
%\marginnote[0cm]{Assume $\C_0=\{f\}$ with $f(x_1,x_2,x_3,x_4,x_5)=(1,1,0,1,0)$ or $11010$.\\
% $\C_1 = \{01010\}$.\\
% $\C_2 = \{00010\}$.\\
% $\C_3 = \{00010\}$.\\
% $\C_4 = \{00000\}$.\\
% $\C_5 = \{00000\}$.}
\1 Given $\C_k$, construct $\C_{k+1}$ as follows:
\2 For each $f\in\C_k$, if $f(x_{k+1})=1$ and there exists $g\not\in\C_k$ such that $g(x_i)=f(x_i)$ for all $i\neq k+1$ and $g(x_{k+1})=0$,
% $$\begin{cases}
% g(x)=f(x) & x\in \{x_1,\ldots, x_k\} \\
% g(x)=0 & x\in \{x_{k+1}\}
% \end{cases},$$
then replace $f$ by $g$ in $\C_{k+1}$. Otherwise, keep $f$ in $\C_{k+1}$.
\1 This construction has three key properties:
\2[(1)] For each $k$, $|\C_k|=|\C_{k+1}|$,
\2[(2)] If $A$ is shattered by $\C_{k+1}$, then $A$ is shattered by $\C_k$,
\2[(3)] If $f\in \C_m$, then $\supp(f)$ is shattered by $\C_m$.
\1 Proofs of properties:
\2[(1)] Holds by construction: each replacement preserves cardinality.
\2[(2)] Let $A$ be shattered by $\C_{k+1}$. For any $B\subseteq A$:
\3 If $g$ cuts out $B$ and $g\in\C_k$, then $\C_k$ cuts out $B$.
\3 If $g$ cuts out $B$ but $g\not\in\C_k$, then $g$ must have been added to $\C_{k+1}$ to replace some $f\in\C_k$ during the shifting operation.
\4 If $x_{k+1}\not\in A$, then $f\in\C_k$ that $g$ replaced cuts out $B$, since $g|_A=\mathds{1}_B=f|_A$. Thus $\C_k$ cuts out $B$.
\4 If $x_{k+1}\in A$, then $x_{k+1} \not\in B$ (since $g(x_{k+1})=0$). Since $\C_{k+1}$ shatters $A$, there exists $h\in\C_{k+1}$ cutting out $B\cup \{x_{k+1}\}$. By construction, $h$ must already have been in $\C_k$, because every function added during the shift takes value $0$ at $x_{k+1}$. Since $h$ was not replaced during the shift, its shifted version $h'$ (agreeing with $h$ off $x_{k+1}$ and with $h'(x_{k+1})=0$) was already in $\C_k$; this $h'$ cuts out $B$, thus $\C_k$ cuts out $B$.
%Since it is both in $\C_k$ and $\C_{k+1}$ and it wasn't swapped out, the only possible combination is if condition $g_h \not\in\C_k$ was not satisfied, so $g_h\in \C_k$ and $g_h$ cuts out $B$ from $A$.
\2[(3)] Assume $\exists f\in\C_m$ with $\supp(f)$ not shattered by $\C_m$. Then $\exists x_{i+1}\in\supp(f)$ with no $g\in\C_m$ such that $g(x_{i+1})=0$. But $f$ would have been replaced at step $i$ by construction, removing $x_{i+1}$ from $\supp f$. This contradicts the assumption $f\in\C_m$.
\0 It follows from $(2)$ that $\VCdim{\C_m} \leq \VCdim{\C} = n$. Combining this with $(3)$, no $f\in\C_m$ has $|\supp(f)|>n$; since each $f\in\C_m$ is determined by its support, a subset of $X$ of size at most $n$, we get $|\C_m|\leq\Phi_n(m)$. Therefore
$\Phi_n(m) \geq |\C_m| \mathrel{\overset{\makebox[0pt]{\mbox{\normalfont\tiny\sffamily $(1)$}}}{=}}
|\C| = \shatterfunc(m),$
contradicting our assumption that $\shatterfunc(m) > \Phi_n(m)$.
\end{outline}
\end{proof}
\begin{corollary}[Growth of the shatter function]
\label{cor:shatterfuncGrowth}
\contribution{0.}
Let $\C$ be a concept class of VC dimension $n$. Then,
$$
\shatterfunc(m)
\begin{cases}
= 2^m & m \leq n \\
\leq \Phi_n(m) & m > n
\end{cases}
$$
and, in particular, $\shatterfunc(m)\in O(m^n)$.
\end{corollary}
\begin{figure}
\centering
\begin{tikzpicture}
\begin{semilogyaxis}[
axis lines = left,
anchor=origin,
xlabel = \(m\),
ylabel = {\(\shatterfunc(m)\)},
xlabel style={at={(axis description cs:1,0.05)},anchor=west},
ylabel style={at={(axis description cs:0.1,1)},anchor=south,rotate=270},
xmin= 0, xmax= 15,
ymin= 0, ymax= 500,
xtick=\empty,
ytick=\empty,
extra x ticks={5},
extra y ticks={32},
extra x tick labels={$n$},
extra y tick labels={$2^n$},
extra tick style={grid=major, grid style={dashed, black}},
]
\addplot [
domain=0:5,
samples=20,
color=blue,
]
{2^x};
\addplot [
domain=5:15,
samples=20,
color=blue,
]
{7+x^2};
\end{semilogyaxis}
\end{tikzpicture}
\caption{A schematic depiction of shatter function growth for a concept class $\C$ with $\VCdim{\C}=n$. The y-axis uses a logarithmic scale. For $m \leq n$, the function grows exponentially as $2^m$. At $m = n$, the function transitions to slower growth, bounded by a polynomial function.}
\label{fig:enter-label}
\end{figure}
\begin{remark}
\contribution{0.}
In the special case where $\C = \binom{X}{n}$, this bound is tight for $m>n$.
\end{remark}
\begin{proof}[\cref{cor:shatterfuncGrowth}]
~
\begin{outline}
\contribution{1 --- Added details on growth bound.}
\1 For $m\leq n$, the bound holds by \cref{thm:VCbasic1}(1).
\1 For $m>n$, the bound holds by \cref{lemma:Sauer-Shelah}.
\1 The growth estimate follows from the well-known binomial bound: for $m\geq n\geq 1$,
$$\Phi_n(m) = \sum^n_{i=0}\binom{m}{i} \leq \left(\frac{m\exp}{n}\right)^n,$$
which is in $O(m^n)$. To see this, multiply the sum by $\left(\frac{n}{m}\right)^n \leq \left(\frac{n}{m}\right)^i$ for $i\leq n$ and estimate
\begin{align*}
\left(\frac{n}{m}\right)^n \sum^n_{i=0}\binom{m}{i}
\leq \sum^n_{i=0}\binom{m}{i}\left(\frac{n}{m}\right)^i
\leq \sum^m_{i=0}\binom{m}{i}\left(\frac{n}{m}\right)^i
= \left(1+\frac{n}{m}\right)^m
\leq \exp^n,
\end{align*}
where the last inequality uses $1+x\leq \exp^x$ with $x=\frac{n}{m}$.
\end{outline}
\end{proof}
This result implies that for any given concept class $\C$, the shatter function grows either exponentially (below the VC dimension) or at most polynomially (beyond it) in $m$. As the cardinality $m$ of the set grows beyond $n$, the fraction of the $2^m$ possible labelings realized by $\C$ approaches $0$.
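As a concrete instance of this dichotomy, consider again the threshold class from the first example of this section, which has VC dimension $1$. On any $m$ distinct reals $x_1<\ldots<x_m$, a threshold cuts out exactly the sets $\emptyset,\{x_m\},\{x_{m-1},x_m\},\ldots,\{x_1,\ldots,x_m\}$, so
$$\shatterfunc(m)=m+1=\Phi_1(m),$$
and the Sauer--Shelah bound is attained with equality.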
\subsubsection{$\eps$-nets and the VC theorem}
\label{subsection:2}
The next sections lie at the intersection of statistics, probability theory and computer science. Our primary objective is to examine and prove the Vapnik-Chervonenkis (VC) theorem, a fundamental result in statistical learning theory.
To provide additional context, we present a correspondence between some statistical and computational concepts, adapted from a widely-used textbook \enquote{All of Statistics: A Concise Course in Statistical Inference} by Wasserman:
\begin{center}
\begin{tabularx}{\textwidth}{X X X}
Statistics & Computer Science & Meaning \\
\hline
estimation & learning & using data to estimate an unknown quantity \\
classification & supervised learning & predicting a discrete $Y$ from $X$\\
large deviation bounds & PAC learning & uniform bounds on probability errors
\end{tabularx}
\end{center}
The main goal of this section is the VC theorem, which is a variation on the theme of large deviation bounds. These bounds, which include the well-known Chernoff, Hoeffding, and Markov inequalities, provide estimates for the probability that an average of independent random variables deviates significantly from its expected value.
To build towards the VC theorem, we will introduce the concept of $\epsilon$-nets. These are subsets of our sample space that, in a sense, approximate the entire space well.
\begin{remark}
\contribution{3.}
In this section, we let $(X,\cA,\mu)$ denote a probability space. Let $\C$ be a concept class on $X$ such that each $f\in\C$ has $\mu$-measurable support, i.e. $\supp(f)\in \cA$ for all $f\in\C$. For each $f\in\C$, let $\mu(f)=\mu(\supp(f))=\int_X f \,d\mu$. In other words, $\mu(f)$ is the probability that $f(a)=1$ for an element $a\in X$ sampled according to $\mu$. For any $n<\omega$ we consider the product measure of $\mu$ on $X^n$, which we denote by $\mu^n$.
\end{remark}
% \begin{definition}[Error function, algorithm]
% The error of an algorithm $H:\C_{\operatorname{fin}} \ra \Xtwo$ is defined as
% $$\errNoA = \mu(\{x\in X : H(f)(x)\neq f(x)\}).$$
% Put in words, the error is equal to the measure of a set on which our algorithm $f$ is false. This quantifies the probability of our algorithm\ldots \textcolor{red}{I should focus on this and rewrite this soon. This is algorithm error and not function error.}
% \end{definition}
\begin{lemma}[Lemma 2.3.4, \cite{Guingona}]
\label{lemma:expectedValueOfError}
Let $\C$ be a PAC learnable concept class on $X$ and let $H$ be a learning function for $f\in \C$ with sample complexity $\samplecomp$. Then, for all $\eps\in(0,1), \delta \in (0,1)$, probability measures $\mu$ on $X$, and $n\geq \samplecomp$,
$$\EX(\overline{a}\mapsto \err) \leq \delta + \eps(1-\delta).$$
\end{lemma}
\begin{proof}
\contribution{1 --- Improved proof structure and readability.}
Let $Y_0=\{\overline{a} \in X^n : \err > \eps\}$ and let $Y_1 = X^n\setminus Y_0$. Since $\C$ is PAC learnable and $n\geq\samplecomp$, the probability of sampling $\overline{a} \in Y_0$, i.e. with error $> \eps$, is at most $\delta$, so
\begin{align*}
\EX(\overline{a}\mapsto \err) & = \int_{X^n}\errNoA \,d\mu^n \\
& = \int_{Y_0}\errNoA \,d\mu^n + \int_{Y_1}\errNoA \,d\mu^n \\
& \leq 1\cdot \mu^n(Y_0)+\eps \cdot \mu^n(Y_1) \\
& = \eps + \mu^n(Y_0)(1-\eps) \\
& \leq \eps + \delta(1-\eps) = \delta + \eps (1-\delta).
\end{align*}
\end{proof}
\begin{definition}[Definition 2.2.6, \cite{Guingona}]
For $\eps\in(0,1)$, a subset $N \subseteq X$ is called an $\eps$-net for $\C$ if for every $f\in \C$ with $\mu(f)=\mu(\supp(f))\geq \eps$, there exists $a\in N$ such that $f(a)=1$.
\end{definition}
Intuitively, an $\eps$-net intersects every function $f\in \C$ whose support has $\mu$-measure at least $\eps$. This allows it to serve as an approximation of $X$ with respect to $\C$ capturing all the \enquote{large} sets.
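Two simple instances, included here only to illustrate the definition, show that the size of an $\eps$-net depends heavily on the concept class. Let $X=[0,1]$ with the Lebesgue measure. For the nested class of thresholds $\{\mathds{1}_{[a,1]} : a\in[0,1]\}$, the single point $\{1\}$ is an $\eps$-net for every $\eps$, since $1$ lies in the support of every concept. For the class of indicators of subintervals $[a,b]\subseteq[0,1]$, the grid $N=\{k\eps : 0\leq k\leq \lfloor 1/\eps\rfloor\}$ is an $\eps$-net of size roughly $1/\eps$: any interval of measure at least $\eps$ has length at least $\eps$ and therefore contains a grid point.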
\begin{fact}[Chebyshev's inequality]
\label{fact:Chebyshev}
If $f:X\rightarrow \R$ is a random variable and $\eps >0$, then
$$\mu\left(\{a\in X : |f(a)-\EX(f)|\geq \eps\}\right)\leq \frac{\Var(f)}{\eps^2}.$$
\end{fact}
\begin{lemma}[Lemma 2.2.5, \cite{Guingona}]
\label{lem:suppfUB}
Fix $p\in (0,1]$ and finite $n\geq \frac{8}{p}$. Let $f_1,\ldots,f_n\in \Xtwo$ be such that $\mu(f_i)=p$ for all $i\leq n$. Then,
$$\mu^n\left(\left\{
(a_1,\ldots,a_n) \in X^n : \sum^n_{i=1} f_i(a_i)\leq \frac{1}{2}np
\right\}\right) \leq \frac{1}{2}.$$
\end{lemma}
\begin{proof}
\contribution{1 --- Improved proof structure and readability.}
Define $f:X^n\rightarrow \R$ as $f(a_1,\ldots,a_n)=\sum_{i=1}^n f_i(a_i)$. Since the coordinates are sampled independently, the expected value of $f$ is $\EX(f)=\sum_{i=1}^n \EX(f_i)=np$ and the variance of $f$ is $\Var(f)=\sum_{i=1}^n \Var(f_i)=np(1-p)$. Applying \cref{fact:Chebyshev} with $\eps=np/2$, we get:
\begin{align*}
\mu^n\left(\left\{\overline{a}\in X^n : |f(\overline{a})-np|\geq \frac{np}{2}\right\}\right) & \leq \frac{np(1-p)}{(np/2)^2} \\
& = \frac{4(1-p)}{np} \\
& \leq \frac{4}{np} \\
& \leq \frac{1}{2}.
\end{align*}
Therefore, the $\mu^n$-probability of $f(\overline{a})\not\in \left(\frac{np}{2},\frac{3np}{2}\right)$ is at most $\frac{1}{2}$. In particular, the probability of $f(\overline{a})\leq \frac{np}{2}$ is at most $\frac{1}{2}$, which proves the lemma.
\end{proof}
The goal now is to show that with high probability, a randomly chosen set of points forms an $\eps$-net for $\C$.
\begin{theorem}[VC Theorem 2.2.7, \cite{Guingona}]
\label{thm:VCtheorem}
Let $(X,\cA,\mu)$ be a probability space, $\C$ a concept class on $X$ with each concept having $\mu$-measurable support, $d,n<\omega$ and $\eps\in(0,1)$.
If the VC dimension of $\C$ is $\leq d$, then
$$\mu^n(\{ \overline{a} \in X^n : \{a_1, \ldots, a_n\} \text{ is not an $\eps$-net for } \C \}) \leq 2(2n)^d 2^{-\frac{\eps n}{2}}.$$
\end{theorem}
\begin{proof}
\begin{outline}
\contribution{1 --- Improved proof structure and clarity; added minor explanatory details.}
\0 The proof consists of two main parts: first, we define sets to characterize \enquote{bad} samples and derive an estimate, and second, we compute an upper bound on this estimate.\par
We start with the first part:
\1 Without loss of generality, we can assume $\C = \{f\in \C : \mu(f) \geq \eps\}$, as functions with measure less than $\eps$ do not need to be witnessed by $\overline{a}$.
\1 Define $Y_0 = \{\overline{a}\in X^n : (\exists f \in \C)(\forall i \leq n) (f(a_i)=0)\}$. This set represents all samples $\overline{a}$ that fail to be an $\eps$-net. Our goal is to show that $\mu(Y_0)$ is small.
\0 We will set up a \enquote{coupling} argument to bound $\mu(Y_0)$ in terms of $\mu$-measure of another set:
% Explanation for me: this set is product of Y_0 and some other set where f has empirical measure higher than eps n / 2.
\1 For each $f\in \C$ define
$$Y_f= \left\{
\overline{a} \in X^{2n} : (\forall i \leq n) (f(a_i)=0) \land \sum^{2n}_{i=n+1} f(a_i)\geq \frac{\eps n}{2}
\right\}.$$
This set represents all $2n$-tuples where $f$ assigns $0$ to the first $n$ elements and $1$ to at least $\lceil\eps n / 2\rceil$ of the remaining $n$ elements.
\1 Let $Y_1 = \bigcup_{f\in\C} Y_f$. Informally, $Y_1$ pairs the \enquote{bad} samples from $Y_0$ with samples on which the same concept $f$ has sufficiently large empirical measure.
\1 For any $\overline{a}\in X^n$ define the \enquote{section} of $Y_1$ corresponding to $\overline{a}$, that is $Y_1|_{\overline{a}} = \{\overline{b}\in X^n: (\overline{a}, \overline{b})\in Y_1\}$.
\marginnote[0cm]{A measure-theory inclined reader might remember: If $X=A\times B$ is a measurable rectangle, then every $A$ section is either $B$ or $\emptyset$, according to whether $x\in A$ or not.}
\2 If $\overline{a}\in Y_0$, witnessed by some $f\in\C$ with $f(a_i)=0$ for all $i\leq n$, then a straightforward application of \cref{lem:suppfUB} with $p=\eps$ (using $\mu(f)\geq\eps$) to the constant sequence $f_1=\ldots=f_n=f$ yields:
$$\mu^n\left(\left\{
(b_1,\ldots,b_n) \in X^n :
\sum^n_{i=1} f(b_i) > \frac{1}{2}n\eps
\right\}\right)
\geq \frac{1}{2}.$$
The set on the left-hand side is contained in $Y_1|_{\overline{a}}$, therefore $\mu(Y_1|_{\overline{a}})\geq 1/2$.
\2 If $\overline{a}\not\in Y_0$, then $\mu(Y_1|_{\overline{a}})=0$, as the first condition of $Y_f$ fails.
\2 This implies
% $$
% \begin{cases}
% \overline{a}\in Y_0 & \implies \mu(Y_1 |_{\overline{a}}) \geq \frac{1}{2} \\
% \overline{a}\in Y_0^C & \implies \mu(Y_1 |_{\overline{a}})=0
% \end{cases}
% $$
% and we get
$$\mu(Y_1)=\int_{Y_0} \mu(Y_1|_{\overline{a}})d\mu(\overline{a}) \geq \frac{1}{2}\int_{Y_0}d\mu(\overline{a}) = \frac{1}{2}\mu(Y_0).$$
Therefore, $\mu(Y_0)\leq 2\mu(Y_1)$. So, to bound $\mu(Y_0)$, it suffices to compute an upper bound for $\mu(Y_1)$.
\0 Now the second part:
\1 Consider the product space $X^{2n} \times \binom{2n}{n}$ with the product measure $\mu \otimes \nu$ where $\nu$ is the uniform probability measure on $\binom{2n}{n}$.
\2 $X^{2n}$ is the set of $2n$-tuples $(a_0,\ldots,a_{2n-1})$ with entries in $X$,
\2 $\binom{2n}{n}$ denotes the set of $n$-element subsets of $2n$.
\2 For $I\in \binom{2n}{n}$, let $\sigma_I: 2n \ra 2n$ be a permutation that maps $\{0,1,\ldots,n-1\}$ onto $I$ and $\{n,n+1,\ldots,2n-1\}$ onto its complement.
\2 For $\overline{a}\in X^{2n}$ and $I\in\binom{2n}{n}$, define $\overline{a}_I=(a_{\sigma_I(0)},\ldots, a_{\sigma_I(2n-1)})$, the tuple listing the entries indexed by $I$ first.
\1 Fix $\overline{a}\in X^{2n}$ and $f\in \C$. We compute the probability that $\overline{a}_I\in Y_f$ for some $I\in\binom{2n}{n}$.
\2 If $\sum_{i=0}^{2n-1}f(a_i) < \frac{\eps n}{2}$, then $\overline{a}_I\not\in Y_f$ no matter how we choose $I$ because it is impossible to satisfy the second condition of $Y_f$.
% \2 Case 1: $\sum_{i=0}^{2n-1}f(a_i) < \frac{\eps n}{2}$. Since $\sum_{i=n}^{2n-1}f(a_i) \leq \sum_{i=0}^{2n-1}f(a_i) < \frac{\eps n}{2}$, it follows that $\overline{a}_I\not\in Y_f$ because the second condition in the definition of $Y_f$ fails no matter what.
\2 If $\sum_{i=0}^{2n-1}f(a_i) \geq \frac{\eps n}{2}$, then $I$ must index only elements where $f(a_i) = 0$ and avoid all elements with $f(a_i)=1$, because of the first condition of $Y_f$. By assumption, there are at least $k=\ceil{\frac{\eps n}{2}}$ elements to avoid. Thus, the probability that $\overline{a}_I\in Y_f$ is at most
% $$
% \mu\left(\left\{I\in\binom{2n}{n} : \overline{a}_I\in Y_f\right\}\right)
% $$
\begin{align*}
\frac{\binom{2n-k}{n}}{\binom{2n}{n}}=
\frac{\mfrac{(2n-k)!}{n! (n-k)!}}{\mfrac{(2n)!}{n!n!}} =
\mfrac{(2n-k)!}{(2n)!}\cdot\frac{n!}{(n-k)!} =
\mfrac{(n-k+1)\ldots(n)}{(2n-k+1)\ldots(2n)}.
\end{align*}
Each of the $k$ factors $\frac{n-k+j}{2n-k+j}$, $1\leq j\leq k$, is at most $\frac{1}{2}$, thus
$$
\mfrac{(n-k+1)\ldots(n)}{(2n-k+1)\ldots(2n)}
\leq \left(\frac{1}{2}\right)^k
\leq \left(\frac{1}{2}\right)^{\frac{\eps n}{2}}
= 2^{-\frac{\eps n}{2}}.
$$
Hence $2^{-\frac{\eps n}{2}}$ is an upper bound for the probability that $\overline{a}_I\in Y_f$.
\0 Now, we use the VC dimension to bound $\mu(Y_1)$:
\1 Since $\C$ has VC dimension $\leq d$, by \cref{lemma:Sauer-Shelah}, $\shatterfunc(2n)\leq \Phi_d(2n)$. Therefore, for a fixed $\overline{a}\in X^{2n}$, the number of index sets cut out by $\C$ is bounded:
$$
|\{
I\subseteq 2n : (\exists f \in \C)\ f \text{ cuts out } \{a_i : i\in I\}
\}|
\leq \Phi_d(2n) \leq (2n)^d.
$$
Combining this with the previous estimate by a union bound over the at most $(2n)^d$ distinct behaviors of $\C$ on $\overline{a}$, we get $\nu(\{I : \overline{a}_I \in Y_1\}) \leq (2n)^d 2^{-\frac{\eps n}{2}}$ for every fixed $\overline{a}\in X^{2n}$. Since the product measure on $X^{2n}$ is invariant under permuting coordinates, integrating over $\overline{a}$ yields $\mu(Y_1)\leq (2n)^d 2^{-\frac{\eps n}{2}}$.
\0 Finally, recalling that $\mu(Y_0)\leq 2\mu(Y_1)$, we conclude $\mu(Y_0)\leq 2(2n)^d 2^{-\frac{\eps n}{2}}$, which proves the theorem.
\end{outline}
\end{proof}
\begin{remark}
\begin{outline}
\0 Exact measurability conditions are discussed in Appendices A1 and A2 of \cite{blumer1989learnability}.
\end{outline}
\end{remark}
The VC Theorem provides a bound on the convergence of empirical measures to true measures uniformly over a class of sets (or functions), where the uniformity is controlled by the VC dimension.
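To get a feeling for the quantities involved, consider a concrete (and somewhat arbitrary) instantiation: for $d=3$ and $\eps=\frac{1}{10}$, the bound reads $2(2n)^3 2^{-n/20}$, which drops below $\frac{1}{10}$ already at $n=720$ (where it equals roughly $0.087$). Thus a few hundred random points suffice to form an $\eps$-net with high probability, regardless of the underlying measure $\mu$.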
\subsubsection{Proof of the fundamental theorem of PAC learning}
\label{subsection:3}
In this section, we prove the main theorem:
\begin{theorem}[Theorem 2.1, \cite{blumer1989learnability}]
\label{thm:fundamentalThmPacLearning}
Let $X$ be a set and $\C$ a concept class on $X$. Then, the following are equivalent:
\begin{enumerate}
\item $\C$ is a VC class.
\item $\C$ is PAC learnable.
\end{enumerate}
\end{theorem}
This theorem establishes that if the complexity measure of $\C$ (as measured by its VC dimension) is bounded, we can efficiently learn it by sampling data from $X$, independent of any other assumptions.
We prove this theorem by explicitly calculating a lower and an upper bound on the sample complexity $\samplecomp$ in \cref{thm:VC_lb} and \cref{thm:VC_ub}.
%Let $\C$ be a fixed concept class on an input set $X$.
\begin{theorem}[Lower bound]
\label{thm:VC_lb}
Let $d < \omega$ such that $\VCdim{\C} \geq d$. Then, any hypothesis function $H:\Cfin\ra\Xtwo$ that is a witness to PAC learnability of $\C$ has sample complexity
$$\samplecomp \geq
d(1-2(\eps(1-\delta)+\delta)).$$
\end{theorem}
\begin{theorem}[Upper bound]
\label{thm:VC_ub}
% \marginnote{Note that consistency restricts $H$ from $\Xtwo$ to $\C$.}
% d>2 to ensure en/d < n in the shatter func estimate
Let $d < \omega$ such that $\VCdim{\C} \leq d$. Then, any consistent hypothesis function $H:\Cfin\ra\C$ is a learning function for $\C$ with sample complexity
$$\samplecomp \leq \max
\left(
\frac{4}{\eps} \log \left( \frac{2}{\delta} \right),
\frac{8d}{\eps} \log \left( \frac{13}{\eps} \right)
\right).$$
\end{theorem}
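For a sense of scale (the particular numbers are illustrative only): with $d=3$ and $\eps=\delta=\frac{1}{10}$, the bound evaluates to $\max\left(40\log 20,\ 240\log 130\right)\approx\max(173,\,1685)$, so roughly $1.7\cdot 10^3$ samples suffice for any consistent hypothesis function, independently of the underlying measure $\mu$.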
\begin{fact}
\label{fact:expValue}
Let $(X,\cA, \mu)$ and $(Y, \cB, \gamma)$ be two probability spaces and $f:(X\times Y)\ra \R$ a random variable on the product space. Then there exists $a\in X$ such that the random variable $f|_a:Y\ra \R$, defined by $f|_a(b)=f(a,b)$, satisfies
$$\EX(f|_a)\geq \EX(f).$$
\end{fact}
\begin{proof}[\cref{thm:VC_lb}]
\begin{outline}
\contribution{1 --- Improved proof structure and clarity; added minor explanatory details.}
\0 Let $H:\Cfin \ra \Xtwo$ be any hypothesis function witnessing PAC learnability of $\C$. Our goal is to compute a lower bound on the expected value of hypothesis function error $\err$ in terms of $d$ and $n$, and then leverage \cref{lemma:expectedValueOfError} to extract a lower bound on $\samplecomp$.\\
\1 We begin by analyzing the hypothesis function error on $X^n\times \C$, viewing it as a function in two variables $(\overline{a},f)\mapsto \err$. To find a lower bound on the expected value of $\err$, we consider the integral
$$\EX(\err)=\int_{X^n\times \C} \err d\mu((\overline{a},f)).$$
%$$\EX(\err)=\sum_{(\overline{a},f)\in X^n\times \C} \err p((\overline{a},f)).$$
\2 Since $\C$ has VC dimension $\geq d$, there exists a subset of $X$ of cardinality $d$ that is shattered by $\C$. Restricting attention to this shattered set, we may assume $|X|=d$ and $\C=\Xtwo$.
\2 Since the definition of PAC learnability quantifies over all probability measures on $X$, we are free to choose $\mu$; we take the uniform measure, so $\mu(\{a\}) = 1/d$ for $a\in X$. For the purpose of averaging over target concepts we also equip $\C$ with the uniform measure, giving each $f\in\C$ weight $1/2^d$.
\2 Since $H$ is consistent by \cref{remark:consistentRealizable}, for a sample $\overline{a}$ with set of distinct values $Y\subseteq X$, the prediction $H(f|_{\overline{a}})$ agrees with $f$ on $Y$, so all errors occur on the \enquote{unseen} set $X\setminus Y$.
\2 These reductions allow us to rewrite the expectation over $f$ for a fixed sample $\overline{a}$ as
$$\EX(\err\mid\overline{a})=\frac{1}{d\cdot 2^d} \sum_{f\in \C} \bigl|\bigl\{x\in X\setminus Y : H(f|_{\overline{a}})(x)\neq f(x)\bigr\}\bigr|.$$
\1 Now, we compute a lower bound of the expected value.
% Maximum Uncertainty:
% The uniform distribution maximizes entropy, which means it carries the least amount of information about where the important points might be. This forces the learner to treat all points equally.
\2 Fix $n\leq d$ and consider sequences $\overlinea = (a_1,\ldots,a_n) \in X^n$. Define $Y = \{ a_1, \ldots, a_k\}$, the set of distinct values of $\overline{a}$, noting that $|Y|=k\leq n$.
\2 By \cref{thm:VCbasic1}(1), $\C$ shatters $Y\subseteq X$ so $\C|_Y = \Ytwo$. For any fixed concept $g\in \Ytwo$, it can be extended to a function in $\C$ in $2^{d-k}$ ways.
% By assigning either $0$ or $1$ to the remaining $d-k$ elements
\marginnote{$1 \leq k \leq n < d \leq \VCdim{\C}$}
\2 For a fixed $x\in(X\setminus Y)$, by symmetry, exactly half of the $f\in\C$ which extend $g$ disagree with $H(g)$ on $x$:
$$\Bigl|\Bigl\{ f \in \C : f|_Y=g \text{ and } H(g)(x)\neq f(x)\Bigr\}\Bigr| = \frac{1}{2}\,2^{d-k}.$$
Summing over all elements $x\in X \setminus Y$, we have:
$$\sum_{x\in X\setminus Y}
\Bigl|\Bigl\{ f \in \C : f|_Y=g \text{ and } H(g)(x)\neq f(x)\Bigr\}\Bigr|
=
\frac{1}{2}\,2^{d-k}\left(d-k\right).$$
% error over all of X\Y
\2 Since $|\Ytwo|=2^k$, summing over all possible concepts $g\in \Ytwo$ we get:
$$
\sum_{g\in\Ytwo}\,\sum_{x\in X\setminus Y}
\Bigl|\Bigl\{ f \in \C : f|_Y=g \text{ and } H(g)(x)\neq f(x)\Bigr\}\Bigr|
=
\frac{1}{2}\,2^d\left(d-k\right)
\geq
\frac{1}{2}\,2^d\left(d-n\right).
$$
Note that the double sum on the left equals $\sum_{f\in \C} \bigl|\bigl\{x\in X\setminus Y : H(f|_{\overline{a}})(x)\neq f(x)\bigr\}\bigr|$, since every $f\in\C$ extends exactly one $g=f|_Y\in\Ytwo$, and the sample $f|_{\overline{a}}$ is determined by $g$.
\2 Thus, for every fixed $\overline{a}\in X^n$ with $n\leq d$, we arrive at the lower bound
$$
\EX(\err\mid\overline{a})\geq \frac{1}{d\cdot 2^d}\cdot
\frac{1}{2}\,2^d
\left(d-n\right)
=
\frac{d-n}{2d},
$$
and hence also $\EX(\err)\geq\frac{d-n}{2d}$.
\1 We can now leverage this lower bound to obtain a lower bound on $\samplecomp$.
\2 By \cref{fact:expValue}, there exists $f\in\C$ with
$
\EX(\overline{a}\mapsto\err)\geq\frac{d-n}{2d}.
$
\2 By \cref{lemma:expectedValueOfError},
$
\EX(\overline{a}\mapsto \err) \leq \eps(1-\delta)+\delta.
$
\2 Since both bounds hold for our chosen measure $\mu$ (PAC learnability quantifies over all probability measures), combining them gives $\frac{d-n}{2d}\leq \eps(1-\delta)+\delta$, i.e. $n\geq d(1-2(\eps(1-\delta)+\delta))$, for every $n$ with $\samplecomp\leq n\leq d$. Taking $n=\samplecomp$ (the case $\samplecomp>d$ is trivial, as the right-hand side is at most $d$), we conclude that $\samplecomp \geq d(1-2(\eps(1-\delta)+\delta))$.
\end{outline}
\end{proof}
Now we prove the other direction, i.e. \cref{thm:VC_ub} using $\eps$-nets and the VC Theorem \labelcref{thm:VCtheorem}.
\begin{proof}[\cref{thm:VC_ub}]
\begin{outline}
\contribution{1 --- Improved proof structure and clarity; added minor explanatory details.}
\0 The key idea of the proof is to use $\eps$-nets to show that a consistent hypothesis function $H$ can PAC learn a concept class $\C$ with finite VC dimension. This approach is somewhat consistent with the principle of Occam's Razor --- a short explanation (i.e. a hypothesis function that is as simple as possible) tends to be more valid than a long explanation.
\1 Fix a target concept $f\in\C$ and a sample $\overline{a}\in X^n$.
\2 For any prediction $h=H(f|_{\overline{a}})\in\C$, the error function $|h-f|$ belongs to concept class $(\C\symmdiff f)$.
%As as set system, $\cF \symmdiff f$ represents all elements that are counterexamples to a hypothesis $h\in\C$ for a target concept $f$.
%If $\overline{a}\in X^n$ is an $\eps$-net for $\C \symmdiff f$, then it contains counterexamples to every hypothesis $h$ with error greater than $\eps$.
Therefore we can describe the error in terms of the $\mu$-measure of $|f-h|$, namely
$
\err = \mu\left(\left|
f-h
\right|\right).
$
\2 By \cref{thm:VCbasic2}(4), $(\C\symmdiff f)$ has VC dimension at most $d$.
\1 By the VC Theorem \labelcref{thm:VCtheorem}, we can estimate the probability of $\overline{a}$ failing to be an $\eps$-net, independent of $\mu$:
\setlength\delimitershortfall{-1pt}
$$
\mu^n\left(\left\{
\overline{a} \in X^n : \left\{a_1, \ldots, a_n\right\}
\text{ not an $\eps$-net for }
\left(\C\symmdiff f\right)
\right\}\right)
\leq
2\left(2\frac{\exp n}{d}\right)^d 2^{-\frac{\eps n}{2}}.
$$
Here we use the sharper bound $\Phi_d(2n)\leq\left(2\frac{\exp n}{d}\right)^d$ on the shatter function's growth, as shown in \cref{cor:shatterfuncGrowth}.
\1 We need to choose $n$ large enough so that:
\begin{align*}
& &2\left(2\frac{\exp n}{d}\right)^d2^{-\frac{\eps n}{2}} & \leq \delta \\
\iff& &\log(2)+d \log\left(2\frac{\exp n}{d}\right) & \leq \log(\delta)+\frac{\eps n}{2}\\
\iff& &\frac{\eps n}{2} & \geq d\log\left(2\frac{\exp n}{d}\right) + \log(\frac{2}{\delta}).
\end{align*}
We choose
$
n \geq \max \left\{
\frac{4}{\eps} \log\left(\frac{2}{\delta}\right),
\frac{8d}{\eps} \log \left(\frac{13}{\eps}\right)
\right\}
$
and split the inequality into two parts, each bounding one half of $\frac{\eps n}{2}$:
$$
\frac{\eps n }{4}\geq \log(\frac{2}{\delta})
\text{~~~and~~~}
\frac{\eps n}{4} \geq d\log\left(2\frac{\exp n}{d}\right).
$$
\2 The first inequality holds trivially by our choice of $n\geq \frac{4}{\eps}\log (\frac{2}{\delta})$.
\2 For $n\geq\frac{8d}{\eps}$, the left-hand side $\frac{\eps n}{4}$ grows faster in $n$ than the right-hand side, so it suffices to verify the second inequality at the lower bound $n=\frac{8d}{\eps} \log \left(\frac{13}{\eps}\right)$. Substituting this value gives $\frac{\eps n}{4}= 2d\log\left(\frac{13}{\eps}\right)$ and $d\log\left(2\frac{\exp n}{d}\right)=d\log\left(16\frac{\exp}{\eps}\log\left(\frac{13}{\eps}\right)\right)$,
so the inequality holds if we can prove
\begin{align*}
2\log\left(
\frac{13}{\eps}
\right)
& \geq
\log\left(
16\left(\frac{\exp}{\eps}\right)
\log\left(
\frac{13}{\eps}
\right)
\right)\\
\iff \left(\frac{13}{\eps}\right)^2 & \geq \frac{16\exp}{\eps}\log\left(\frac{13}{\eps}\right) \\
\iff \frac{13^2}{16\exp\eps} & \geq \log\left(\frac{13}{\eps}\right).
\end{align*}
For $\eps = 1$ the left-hand side is $\frac{169}{16\exp}\approx 3.89$ while the right-hand side is $\log(13)\approx 3.70$, and as $\eps$ decreases the left-hand side grows like $\frac{1}{\eps}$ whereas the right-hand side grows only logarithmically in $\frac{1}{\eps}$. Hence the inequality holds for $\eps = 1$ and all smaller values, completing the choice of $n$.
\1 Conclusion:
\2 We have proven that for $n \geq \max \{
\frac{4}{\eps} \log\left(\frac{2}{\delta}\right), \frac{8d}{\eps} \log \left(\frac{13}{\eps}\right)
\}$, $\overline{a}\in X^n$ is an $\eps$-net for $(\C\symmdiff f)$ with probability at least $1-\delta$.
\2 If $\err\geq \eps$, then by definition of an $\eps$-net, $\overline{a}$ \enquote{catches} some $a_i\in X$ such that $f(a_i)\neq h(a_i)$.
\2 This contradicts the consistency of $H$, since the hypothesis $h$ agrees with $f$ on every sample point. Hence, whenever $\overline{a}$ is an $\eps$-net for $(\C\symmdiff f)$, we must have $\err<\eps$, and therefore
$$
\mu\left(\left\{
\overline{a} \in X^n : \err > \eps
\right\}\right)
\leq 2\left(2\frac{\exp n}{d}\right)^d 2^{-\frac{\eps n}{2}} \leq \delta.
$$
\end{outline}
\end{proof}
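For a rough sense of the size of this bound, take $\eps=\delta=\frac{1}{10}$ and $d=2$, with logarithms taken to base $2$, consistent with the bound $2^{-\frac{\eps n}{2}}$ above. Then
$$
\frac{4}{\eps}\log\left(\frac{2}{\delta}\right)=40\log(20)\approx 173
\qquad\text{and}\qquad
\frac{8d}{\eps}\log\left(\frac{13}{\eps}\right)=160\log(130)\approx 1124,
$$
so roughly $1124$ samples suffice for any consistent hypothesis function to achieve error at most $\frac{1}{10}$ with probability at least $\frac{9}{10}$, regardless of the underlying measure $\mu$.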
\begin{proof}[\cref{thm:fundamentalThmPacLearning}]
\begin{outline}
\contribution{1 --- Improved proof structure and clarity.}
\0 We prove both directions of the equivalence:
\1[$\implies:$] Suppose $\C$ is a VC class. By definition, there exists $d<\omega$ such that $\VCdim{\C}=d$. By \cref{thm:VC_ub}, any consistent hypothesis function $H$ is a PAC learning function for $\C$. Therefore, $\C$ is PAC learnable.
\1[$\impliedby:$] Suppose $\C$ is not a VC class. Then $\C$ has infinite VC dimension, so it shatters finite sets of every size. By \cref{thm:VC_lb}, for any hypothesis function $H$ and every $d\in \N$, the sample complexity satisfies $\samplecomp\geq d(1-2(\eps(1-\delta)+\delta))$. For fixed $\eps$ and $\delta$ with $\eps(1-\delta)+\delta<\frac{1}{2}$, this lower bound grows without bound as $d$ increases, so no finite sample size is sufficient for PAC learning $\C$. Therefore, $\C$ is not PAC learnable.
\end{outline}
\end{proof}
\newpage
\subsection{NIP theories}
\label{section:NIPtheories}
Now we move into the realm of model theory.
\begin{remark}[Notation]
\begin{outline}
\contribution{3.}
\0 We will work in a fixed signature $L$.
\1 $\cM$ denotes an $L$-structure with universe $M$.
\1 $x,y,z$ represent tuples of variables,
\1 $|x|$ denotes the length of tuple $x$.
\1 For $A\subseteq M$, $A_x$ represents all tuples from $A$ of length $|x|$.
\0 In a partitioned $L$-formula $\phi(x;y)$:
\1 $x$ represents object variables,
\1 $y$ represents parameter variables,
\1 for $b\in M_y$ and $A\subseteq M_x$, define $\phi(A;b)=\{a\in A : \cM\models \phi(a;b)\}$,
\1 for $a\in M_x$ and $B\subseteq M_y$, define $\phi(a;B)=\{b\in B : \cM\models \phi(a;b)\}$,
\1 for $A\subseteq M_x$ and $B\subseteq M_y$, define $\phi(A;B)=\{\phi(a;B) : a\in A\}$.
\end{outline}
\end{remark}
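For instance, in $(\R,<)$ with $\phi(x;y)\equiv x<y$, $A=\R$ and $b=0$, we have $\phi(A;b)=\{a\in\R : a<0\}$, while $\phi(a;B)$ collects those parameters $b\in B$ whose half-line $(-\infty,b)$ contains $a$.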
% I've tried to incorporate page 16 of Laskowski Johnson paper, but alas... I think its an important definition though, to say that something is "uniformly" definable.
\begin{definition}[Uniformly definable family]
\begin{outline}
\contribution{2 --- Synthesized from multiple sources, independently worded.}
\0 Let $\cM$ be an $L$-structure and $\phi(x;y)$ any fixed $L$-formula. The formula $\phi(x;y)$ generates a \emph{uniformly definable family} on $M_x$ as a collection of definable sets:
$$\C_\phi = \{\phi(M_x;b) : b \in M_y\}.$$
% \1 As a concept class of indicator functions: $\C = \{\mathds{1}_{\phi(M_x;b)} : b \in M_y\}$,
% where $\mathds{1}_{\phi(M_x;b)}(a)=1$ if and only if $M\models \phi(a;b)$.
\0 The VC dimension of $\phi(x;y)$ is defined to be the VC dimension of the induced concept class $\C_\phi$.
\end{outline}
\end{definition}
The term \enquote{uniformly definable} emphasizes that a single formula $\phi$ simultaneously defines all sets within the family, instead of using multiple formulas to define different sets. This uniformity allows us to study properties of the entire family by analyzing the single formula $\phi$.
\begin{example}
\label{example:circle}
\begin{outline}
\contribution{3.}
\0 Consider $RCF$, the theory of ordered real closed fields. Let $\phi(x_1,x_2;y_1,y_2,y_3)$ be the formula:
$$(x_1-y_1)^2+(x_2-y_2)^2 < y_3$$
\0 This formula defines the interior of a circle in $\R^2$. Specifically, $\phi(x_1,x_2;y_1,y_2,y_3)$ generates the family of sets
$$\C_\phi = \{\{(x_1,x_2) \in \R^2 : (x_1-a)^2+(x_2-b)^2 < r\} : a,b,r \in \R\}.$$
For $r > 0$, each set in this family is the interior of a circle with center $(a,b)$ and radius $\sqrt{r}$; for $r \leq 0$ it is empty.
The corresponding concept class consists of indicator functions:
$$\C = \{f_{a,b,r} : \R^2 \to \{0,1\} \mid a,b,r \in \R,\ r > 0\}$$
where
$$f_{a,b,r}(x_1,x_2) = \begin{cases}
1 & \text{if } (x_1-a)^2+(x_2-b)^2 < r \\
0 & \text{otherwise}
\end{cases}$$
Thus, $\phi$ uniformly defines all open circles in $\R^2$, allowing us to study properties of this entire family through the single formula $\phi$.
\end{outline}
\end{example}
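For this family the VC dimension can be computed directly: any three non-collinear points in $\R^2$ are shattered by interiors of circles, while no four points can be, so $\VCdim{\C_\phi}=3$. In particular, by \cref{thm:fundamentalThmPacLearning}, the family of circle interiors is PAC learnable.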
\begin{definition}[Independence property]
\label{def:indFormula}
~
\begin{outline}
\contribution{1 --- Minor structure improvements.}
\1 A formula $\phi(x;y)$ has the \emph{independence property} with respect to $\cM$, if:
\2 For every $n\in\N$, there exists a sequence $(b_0,\ldots,b_{n-1})$ of elements from $M_y$ such that
\2 For every $k\subseteq n$, there exists an $a_k\in M_x$ such that \marginnote[2cm]{$i<n$, $k\subseteq n$}
$$\cM\models \phi(a_k; b_i) \iff i\in k$$
\1 The \emph{independence dimension} $I(\phi)$ is defined as:
\2 If $\phi$ does not have the independence property (\emph{is NIP}), then $I(\phi)$ is the greatest $n$ for which the above condition holds.
\2 If $\phi$ has the independence property \emph{(is not NIP)}, then $I(\phi)=\infty$.
\1 The \emph{dual formula} $\psi(y;x)$ is the same formula as $\phi(x;y)$, but with the roles of $x$ and $y$ reversed: $y$ serves as the object variable and $x$ as the parameter variable.
\end{outline}
\end{definition}
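To illustrate the definition, consider the formula $\phi(x;y)\equiv x<y$ in $(\R,<)$. For $n=2$ and any parameters $b_0,b_1$ with $b_0\leq b_1$, no $a\in\R$ satisfies $a<b_0$ but not $a<b_1$, so the subset $\{0\}\subseteq 2$ is never realized (when $b_1\leq b_0$, the subset $\{1\}$ fails instead); since the condition for $n=1$ is trivially met, $\phi$ is NIP with $I(\phi)=1$. By contrast, in $(\N,+,\cdot)$ the formula $\phi(x;y)$ expressing that $y$ divides $x$ has the independence property: for any $n$, take $b_i$ to be the $i$-th prime and, for each $k\subseteq n$, set $a_k=\prod_{i\in k}b_i$; then $\phi(a_k;b_i)$ holds if and only if $i\in k$.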
Our main theorem is Proposition 1.3 from \cite{Laskowski1992}, which establishes the equivalence between being NIP and having finite VC dimension.
From now on, fix a structure $\cM$ and a formula $\phi(x;y)$. Let $\C_\phi$ be the concept class associated with $\phi$ in $\cM$.
\begin{theorem}[Prop. 1.3, \cite{Laskowski1992}]
\label{thm:NIPfiniteVC}
If the VC dimension of $\C_\phi$ is $d$ and the independence dimension of $\phi$ is $n$, then $n\leq 2^d$ and $d \leq 2^n$, and the following are equivalent:
\begin{enumerate}
\item $\C_\phi$ is a VC class.
\item $\phi$ is NIP.
\end{enumerate}
\end{theorem}
This theorem will immediately follow from two lemmas below.
\begin{lemma}[Lemma 1.4, \cite{Laskowski1992}]
\label{lem:14}
Let $\psi(y;x)$ be the dual formula of $\phi(x;y)$. Then $\VCdim{\C_\phi}\leq d \iff I(\psi)\leq d$.
\end{lemma}
\begin{proof}
\begin{outline}
\0 We will show that $\VCdim{\C_\phi} > d \iff I(\psi) > d$, which is equivalent to the statement of the lemma.
\1 By definition, $\VCdim{\C_\phi} > d$ if and only if there exists a set $A = \{a_0, \ldots, a_d\} \subseteq M_x$ of size $d+1$ that is shattered by $\C_\phi$. This holds if and only if for every $S \subseteq d+1$, there exists $b_S \in M_y$ such that:
$$\cM\models\phi(a_i;b_S) \iff i\in S.$$
\1 By the definition of the dual formula $\psi$, this condition is equivalent to:
$$\cM \models \psi(b_S;a_i) \iff i\in S$$
\0 This last statement is precisely the definition of $I(\psi) > d$. Therefore, $\VCdim{\C_\phi} > d \iff I(\psi) > d$, which completes the proof.
\end{outline}
\end{proof}
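To see the duality concretely, recall the formula $\phi(x_1,x_2;y_1,y_2,y_3)$ from \cref{example:circle}. Its dual $\psi(y_1,y_2,y_3;x_1,x_2)$ is the same inequality $(x_1-y_1)^2+(x_2-y_2)^2<y_3$, now read with $(y_1,y_2,y_3)$ as the object variables: for a fixed point $(a_1,a_2)\in\R^2$, it defines the set of parameter triples whose circle interior contains $(a_1,a_2)$, namely the region of $\R^3$ lying strictly above the paraboloid $y_3=(a_1-y_1)^2+(a_2-y_2)^2$.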
\begin{lemma}[Lemma 1.5, \cite{Laskowski1992}]
\label{lem:15}
Let $\psi(y;x)$ be the dual formula of $\phi(x;y)$. If $I(\phi)\leq n$ then $I(\psi)\leq 2^n$.
\end{lemma}
The idea of the proof rests on the observation that a shattered set of size $n$ corresponds to $2^n$ parameters that shatter it. Each element in this set implicitly defines a subset of these parameters --- those corresponding to sets containing that element.
\begin{proof}