-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.html
More file actions
345 lines (320 loc) · 22.9 KB
/
index.html
File metadata and controls
345 lines (320 loc) · 22.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Prama Development's Deep Learning Test</title>
<style>
.header {
display: flex;
align-items: center;
}
.logo {
margin-right: 20px;
}
nav {
text-align: center;
margin-top: 20px;
}
nav a {
display: inline-block;
padding: 10px 20px;
text-decoration: none;
color: #0000FF;
font-weight: bold;
}
nav a:hover {
text-decoration: underline;
}
</style>
</head>
<body>
<div class="header">
<img src="logo.webp" alt="Logo" width="100" height="100" class="logo">
<h1>Prama Development's Deep Learning Test</h1>
</div>
<nav>
<a href="https://pramadevelopment.com/">Prama Home Page</a>
<a href="https://x.com/PramaResearch/">Prama Twitter</a>
<a href="https://github.com/PramaLLC/DeepLearningTest/">Repo</a>
</nav>
<style>
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
}
.question {
margin-bottom: 20px;
border: 1px solid #ddd;
padding: 15px;
border-radius: 5px;
}
.answer {
margin: 10px 0;
}
button {
margin-top: 20px;
padding: 10px 20px;
font-size: 16px;
}
#result {
margin-top: 20px;
font-weight: bold;
}
.explanation {
margin-top: 10px;
font-style: italic;
display: none;
}
</style>
<!-- <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
-->
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<!-- NOTE: stray duplicate </head> and <body> tags removed here; the head was already closed and the body opened earlier in the document -->
<div id="quiz"></div>
<button id="submit">Submit Answers</button>
<div id="result"></div>
<script>
// Quiz content: one section; each question object carries exactly one
// correct answer under the key "answerT" and distractors under "answerF*".
// An optional "equation" holds display-math TeX rendered as $$...$$.
//
// TeX escaping note: inside a double-quoted JS string a single backslash
// is consumed by the parser ("\(" parses to just "("), which silently
// destroyed the MathJax inline delimiters. All inline math is therefore
// written "\\( ... \\)" so the literal "\(" / "\)" survives to the DOM.
const quizData = [
    {
        "section": "Deep Learning",
        "questions": [
            {
                "question": "Which of the following typically has the fastest convergence and least noisiness of a neural network during training per iteration?",
                "answerF": "Stochastic Gradient Descent (SGD)",
                "answerT": "Gradient Descent",
                "answerF2": "Mini-batch Stochastic Gradient Descent",
                "answerF3": "Adam",
                "answerF4": "AdamW",
                "explanation": "Traditional gradient descent computes the gradient using all data points, leading to more accurate updates per iteration compared to SGD or mini-batch SGD. This results in faster convergence per iteration but at the cost of higher computational requirements and memory usage per iteration."
            },
            {
                "question": "Which form of gradient descent offers the best trade-off between computation and accuracy and is the industry standard for training deep neural networks?",
                "answerF": "Stochastic Gradient Descent (SGD)",
                "answerF2": "Gradient Descent",
                "answerT": "Mini-batch Stochastic Gradient Descent",
                "answerF3": "Adam",
                "answerF4": "AdamW",
                "explanation": "Mini-batch Stochastic Gradient Descent allows models to be trained on larger datasets compared to Gradient Descent due to memory constraints. It provides a balance between the computational efficiency of SGD and the stability of full Gradient Descent. SGD can be more noisy because each example is randomly selected, while full Gradient Descent can be computationally prohibitive for large datasets."
            },
            {
                "question": "The Kullback–Leibler (KL) divergence function can be described as?",
                "answerF": "The similarity between two neural networks",
                "answerF2": "The entropy of a single probability distribution",
                "answerF3": "The distance between two points in Euclidean space",
                "answerT": "How different two probability distributions are",
                "answerF4": "The mutual information between two random variables",
                "explanation": "The Kullback-Leibler (KL) divergence, also known as relative entropy, measures the difference between two probability distributions. It quantifies how much information is lost when one distribution is used to approximate another. While it's often referred to as a 'distance' between distributions, it's not a true metric as it's not symmetric and doesn't satisfy the triangle inequality. KL divergence is widely used in various fields including information theory, machine learning, and statistics."
            },
            {
                "question": "The function F represents which of the following, where \\( P_i \\) and \\( Q_i \\) are the true and predicted probability distributions, and F measures the relative entropy between \\( P_i \\) and \\( Q_i \\)?",
                "equation": "F(P,Q) = \\frac{1}{N} \\sum_{i=1}^N \\sum_{x \\in \\mathcal{X}} P_i(x) \\log \\frac{P_i(x)}{Q_i(x)}",
                "answerF": "The Jensen-Shannon divergence between P and Q",
                "answerF2": "The cross-entropy between P and Q",
                "answerT": "The Kullback-Leibler (KL) divergence between P and Q",
                "answerF3": "The mutual information between P and Q",
                "answerF4": "The Hellinger distance",
                "explanation": "This equation represents the Kullback-Leibler (KL) divergence between the true distributions \\( P_i \\) and the predicted distributions \\( Q_i \\). KL divergence measures the relative entropy, or how one probability distribution differs from another. It is asymmetric, meaning that \\( KL(P_i || Q_i) \\neq KL(Q_i || P_i) \\). The lower the KL divergence, the closer the predicted distribution \\( Q_i \\) is to the true distribution \\( P_i \\)."
            },
            {
                "question": "What is the reason for the widespread adoption of cross-entropy versus mean squared error in classification tasks?",
                "answerF": "Mean squared error is better for image classification because the log in cross-entropy diminishes the value",
                "answerF2": "Cross-entropy is computationally less expensive than mean squared error",
                "answerF3": "Cross-entropy produces smaller gradients, leading to more stable training",
                "answerF4": "Cross-entropy is immune to overfitting, unlike mean squared error",
                "answerT": "The logarithm in the cross-entropy leads to bigger steps for worse predictions",
                "explanation": "Cross-entropy is widely preferred for classification tasks due to its logarithmic nature. This results in larger gradients for probabilities far from the true label, allowing for faster learning when the model is incorrect. In contrast, mean squared error can lead to slower learning for incorrect predictions, especially when the predicted probability is close to 0 or 1. The steeper gradients of cross-entropy help combat the vanishing gradient problem, particularly in the early stages of training. While cross-entropy has other advantages, such as its natural extension to multi-class problems, its superior gradient behavior is the primary reason for its widespread adoption in classification tasks."
            },
            {
                "question": "The function F represents which of the following, where \\( P \\) is the true probability distribution and \\( Q \\) is the predicted probability distribution?",
                "equation": "F(P,Q) = -\\sum_{x \\in \\mathcal{X}} P(x) \\log Q(x)",
                "answerT": "Cross-Entropy Loss Function",
                "answerF": "Kullback-Leibler (KL) divergence",
                "answerF2": "Mean Squared Error",
                "answerF3": "Jensen-Shannon divergence",
                "answerF4": "Batch Cross-Entropy Loss Function",
                "explanation": "This formula represents the cross-entropy loss function between the true distribution \\( P \\) and the predicted distribution \\( Q \\)."
            },
            {
                "question": "The function F represents which of the following, where \\( P_i \\) and \\( Q_i \\) are the true and predicted probability distributions?",
                "equation": "F(P,Q) = -\\frac{1}{N} \\sum_{i=1}^N \\sum_{x \\in \\mathcal{X}} P_i(x) \\log Q_i(x)",
                "answerT": "Batch Cross-Entropy Loss Function",
                "answerF": "Single Sample Cross-Entropy Loss",
                "answerF2": "Kullback-Leibler (KL) divergence",
                "answerF3": "Mean Squared Error",
                "answerF4": "Jensen-Shannon divergence",
                "explanation": "This equation represents the Batch Cross-Entropy Loss Function. It calculates the average cross-entropy loss across a batch of \\( N \\) samples. For each sample \\( i \\), \\( P_i \\) is the true distribution and \\( Q_i \\) is the predicted distribution. The outer sum averages the loss over all samples in the batch, making it suitable for mini-batch training in neural networks."
            },
            {
                "question": "Let's say we have an RGB input image into a convolutional neural network that has the dimensionality of Batch, Channel (\\( C \\)), Height, Width, and we want an output channel of \\( 2C \\). Which of the following is true about the convolution layer?",
                "answerF": "The number of filters is equal to \\( C \\), and each filter will have \\( 2C \\) kernels applied to each channel of the input image.",
                "answerT": "The number of filters is equal to \\( 2C \\); each filter will have \\( C \\) kernels, and each kernel is applied to the corresponding channel in the input image.",
                "answerF2": "The number of filters is equal to \\( 2C \\); each filter will have \\( 2C \\) kernels, and all kernels are applied to all channels of the input image.",
                "answerF3": "The number of filters is equal to \\( C \\); each filter will have \\( C \\) kernels, and the output is concatenated to achieve \\( 2C \\) channels.",
                "answerF4": "The number of filters is equal to 3; each filter will have \\( 2C \\) kernels, and each kernel is applied to one of the RGB channels separately.",
                "explanation": "In a convolutional layer, to increase the number of output channels from \\( C \\) to \\( 2C \\), we need \\( 2C \\) filters. Each filter corresponds to one output channel. Since the input has \\( C \\) channels, each filter must have \\( C \\) kernels to process all input channels. Each kernel in a filter is applied to its corresponding input channel, and the results are summed to produce one value in the output feature map for that filter."
            },
            {
                "question": "Which of the following is the correct order of operations after training the last batch in an epoch (Pytorch)?",
                "answerF": "1. Zero grad optimizer 2. Compute loss 3. Call backward on the loss 4. Step the optimizer 5. Clip the gradient 6. Update learning rate",
                "answerF2": "1. Call backward on the loss 2. Compute loss 3. Zero grad optimizer 4. Step the optimizer 5. Clip the gradient 6. Update learning rate",
                "answerF3": "1. Compute loss 2. Call backward on the loss 3. Zero grad optimizer 4. Step the optimizer 5. Update learning rate 6. Clip the gradient",
                "answerF4": "1. Zero grad optimizer 2. Step the optimizer 3. Compute loss 4. Call backward on the loss 5. Clip the gradient 6. Update learning rate",
                "answerT": "1. Compute loss 2. Zero grad optimizer 3. Call backward on the loss 4. Clip the gradient 5. Step the optimizer 6. Update learning rate",
                "explanation": "The correct order of operations is crucial for proper training. First, we compute the loss to evaluate the model's performance. Then, we zero the gradients to clear any existing gradients. Next, we call backward on the loss to compute gradients. Gradient clipping is performed to prevent exploding gradients. After that, we step the optimizer to update the model's parameters. Finally, we update the learning rate, which is typically done at the end of an epoch. The incorrect answers mix up this order, which could lead to improper training or errors in the backpropagation process."
            },
            {
                "question": "The equation describes which of the following?",
                "equation": "\\begin{align*}\n v_t &= \\beta v_{t-1} + \\nabla J(\\theta_{t-1}) \\\\\n \\theta_t &= \\theta_{t-1} - \\alpha v_t\n\\end{align*}",
                "answerF3": "Stochastic Gradient Descent",
                "answerF": "Vanilla Gradient Descent",
                "answerF2": "Adagrad",
                "answerT": "Gradient Descent with Momentum",
                "answerF4": "RMSProp",
                "explanation": "This equation represents Gradient Descent with Momentum. The first equation updates the velocity (momentum) term based on the previous velocity and the gradient of the loss function. The second equation updates the parameters by subtracting a fraction of the velocity. Momentum helps accelerate convergence and reduces oscillations by accumulating past gradients."
            },
            {
                "question": "The function F represents which of the following, where \\( P_i \\) and \\( Q_i \\) are the true and predicted probability distributions?",
                "equation": "F(P,Q) = \\frac{1}{N} \\sum_{i=1}^N \\sum_{x \\in \\mathcal{X}} (P_i(x) - Q_i(x))^2",
                "answerF3": "Batch Cross-Entropy Loss Function",
                "answerF": "Single Sample Cross-Entropy Loss",
                "answerF2": "Kullback-Leibler (KL) divergence",
                "answerT": "Mean Squared Error",
                "answerF4": "Jensen-Shannon divergence",
                "explanation": "This equation represents the Mean Squared Error (MSE) over a batch of \\( N \\) samples. It calculates the average squared difference between the true distribution \\( P_i(x) \\) and the predicted distribution \\( Q_i(x) \\) for each sample \\( i \\)."
            },
            {
                "question": "Which is true about the Adam optimizer?",
                "answerT": "The Adam optimizer effectively combines momentum and adaptive learning rates.",
                "answerF3": "Adam optimizer is a simplified version of RMSProp that doesn't use momentum.",
                "answerF": "Adam optimizer was developed before RMSProp and served as its foundation.",
                "answerF2": "Adam optimizer only works well with convolutional neural networks.",
                "answerF4": "Adam optimizer always converges faster than all other optimization algorithms.",
                "explanation": "Adam (Adaptive Moment Estimation) combines ideas from both RMSProp and momentum. It wasn't developed before them, nor is it a simplified version. It's not limited to CNNs, and while often effective, it doesn't always outperform other optimizers in all scenarios."
            },
            {
                "question": "Which of the following describes RMSProp?",
                "answerT": "Root Mean Square Propagation, an algorithm that maintains a moving average of squared gradients and divides current gradients by the root of this average",
                "answerF3": "Root Mean Square Perception, an algorithm designed specifically for training perceptron networks by adjusting learning rates",
                "answerF": "Recursive Momentum Stabilization, a method to prevent exploding gradients in recurrent neural networks by recursively applying momentum",
                "answerF2": "Rectified Maximum Slope, a technique to adaptively adjust the slope of activation functions based on gradient information",
                "answerF4": "Regularized Minimum Search, an algorithm to efficiently find the global minimum of the loss function using regularization techniques",
                "explanation": "RMSProp stands for Root Mean Square Propagation. It's an optimization algorithm that adapts the learning rate for each parameter by maintaining a moving average of squared gradients. The other options are fictitious and do not describe RMSProp's actual function or meaning."
            },
            {
                "question": "Which of the following correctly describes transfer learning?",
                "answerT": "When a model is trained on a task and gains information so that it then can be trained on a new task.",
                "answerF": "A technique where a model learns to transfer data between different storage devices.",
                "answerF2": "The process of training multiple models simultaneously on the same dataset.",
                "answerF3": "A method to compress the size of a neural network without losing accuracy.",
                "answerF4": "The ability of a model to generate new, unseen data samples.",
                "explanation": "Transfer learning is a machine learning technique where a model developed for one task is reused as the starting point for a model on a second, related task. This approach leverages the knowledge gained from solving one problem and applies it to a different but related problem. It's particularly useful when you have limited data for the new task or want to speed up the training process. The correct answer describes this process, while the false answers relate to different concepts in machine learning and data processing that are not transfer learning."
            },
            {
                "question": "Early stopping in the context of training a deep neural network describes which of the following?",
                "answerF": "Halting training when the model reaches a predetermined accuracy threshold on the training set",
                "answerT": "A regularization technique that stops training before the model begins to overfit the training data",
                "answerF2": "Interrupting training periodically to perform batch normalization on the model's parameters",
                "answerF3": "Terminating the training process if the loss doesn't decrease after a fixed number of epochs",
                "answerF4": "Ending training early to save computational resources, regardless of model performance",
                "explanation": "Early stopping is a regularization technique used to prevent overfitting in iterative learning methods like gradient descent. It involves monitoring the model's performance on a validation set and stopping training when the performance begins to degrade, indicating the point where the model starts to overfit the training data. This helps find an optimal balance between fitting the training data and maintaining good generalization to unseen data."
            }
        ]
    }
];
// Fisher–Yates shuffle: permutes `array` in place, uniformly at random.
// Returns nothing; the caller's array is mutated.
function shuffleArray(array) {
    let i = array.length;
    while (i > 1) {
        i -= 1;
        // Pick a partner index in [0, i] and swap it into position i.
        const j = Math.floor(Math.random() * (i + 1));
        const tmp = array[i];
        array[i] = array[j];
        array[j] = tmp;
    }
}
// Builds the quiz UI inside #quiz: shuffles question order and the answers
// within each question, injects the markup, then asks MathJax to typeset
// the newly added TeX.
//
// Fix: the original registered a nested DOMContentLoaded listener here.
// renderQuiz is itself only invoked from the page's DOMContentLoaded
// handler, so that inner listener was registered after the event had
// already dispatched and never fired — meaning the injected equations
// were never typeset (and, had it fired, it would have re-rendered the
// quiz and attached a duplicate submit handler). The dead listener is
// removed and replaced with a direct typeset call.
function renderQuiz() {
    const quizContainer = document.getElementById('quiz');
    // Copy before shuffling so quizData itself is never reordered.
    let questions = [...quizData[0].questions];
    shuffleArray(questions);
    questions.forEach((q, index) => {
        const questionDiv = document.createElement('div');
        questionDiv.className = 'question';
        // Collect every answer entry (keys start with "answer"; the single
        // correct one is "answerT") and shuffle their display order.
        let answers = Object.entries(q).filter(([key, value]) => key.startsWith('answer'));
        shuffleArray(answers);
        questionDiv.innerHTML = `
            <h3>Question ${index + 1}:</h3>
            <p>${q.question}</p>
            ${q.equation ? `<p>Equation: $$${q.equation}$$</p>` : ''}
            <div class="answers">
                ${answers.map(([key, value]) => `
                    <div class="answer">
                        <input type="radio" name="q${index}" value="${key}" id="q${index}${key}">
                        <label for="q${index}${key}">${value}</label>
                    </div>
                `).join('')}
            </div>
            <div class="explanation">${q.explanation}</div>
        `;
        quizContainer.appendChild(questionDiv);
    });
    // Typeset the TeX we just injected. Guarded because the MathJax CDN
    // script loads async and may not be ready (or may have failed).
    if (window.MathJax && typeof MathJax.typesetPromise === 'function') {
        MathJax.typesetPromise().catch((err) => {
            console.error('MathJax typesetting failed:', err);
        });
    }
}
// Scores the quiz: colours each chosen answer (green/red), highlights the
// correct choice for misses, reveals every explanation, reports the total
// in #result, and locks all inputs plus the submit button.
function gradeQuiz() {
    console.log("Grading quiz..."); // Debug log
    const questionDivs = document.querySelectorAll('.question');
    let correctCount = 0;
    questionDivs.forEach((questionDiv, idx) => {
        const picked = questionDiv.querySelector('input:checked');
        if (!picked) {
            console.log(`Question ${idx + 1}: No answer selected`); // Debug log
        } else {
            console.log(`Question ${idx + 1}: Selected answer - ${picked.value}`); // Debug log
            const pickedLabel = picked.nextElementSibling;
            if (picked.value === 'answerT') {
                correctCount += 1;
                pickedLabel.style.color = 'green';
                console.log(`Question ${idx + 1}: Correct`); // Debug log
            } else {
                pickedLabel.style.color = 'red';
                // Show the answer they should have picked.
                const rightInput = questionDiv.querySelector('input[value="answerT"]');
                if (rightInput) {
                    rightInput.nextElementSibling.style.color = 'green';
                }
                console.log(`Question ${idx + 1}: Incorrect`); // Debug log
            }
        }
        // Explanations become visible whether or not this one was answered.
        questionDiv.querySelector('.explanation').style.display = 'block';
    });
    document.getElementById('result').textContent = `Score: ${correctCount}/${questionDivs.length}`;
    console.log(`Final score: ${correctCount}/${questionDivs.length}`); // Debug log
    // Freeze the form: no further selections, no re-submission.
    document.querySelectorAll('input[type="radio"]').forEach((radio) => {
        radio.disabled = true;
    });
    document.getElementById('submit').disabled = true;
}
// Entry point: once the DOM is parsed, render the quiz and wire the
// submit button to the grader.
document.addEventListener('DOMContentLoaded', () => {
    console.log("DOM fully loaded"); // Debug log
    renderQuiz();
    document.getElementById('submit').addEventListener('click', () => {
        console.log("Submit button clicked"); // Debug log
        gradeQuiz();
    });
});
</script>
</body>
</html>