forked from AnonymousResults123abc/ICCV2984
-
Notifications
You must be signed in to change notification settings - Fork 0
/
KD_loss.py
42 lines (32 loc) · 1.79 KB
/
KD_loss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Code is modified from MEAL (https://arxiv.org/abs/1812.02425) and Label Refinery (https://arxiv.org/abs/1805.02641).
import torch
from torch.nn import functional as F
from torch.nn.modules import loss
class DistributionLoss(loss._Loss):
    """Soft-label cross-entropy between a student's and a teacher's outputs.

    Both inputs are NxC tensors of *pre-softmax* outputs (logits): the
    teacher output is converted to probabilities with ``softmax`` and the
    student output to log-probabilities with ``log_softmax`` inside
    ``forward``. The per-sample loss is

        -sum_c teacher_prob[c] * student_log_prob[c]

    which equals the KL divergence from the teacher distribution to the
    student distribution plus the teacher's entropy. The entropy term does
    not depend on the student, so the gradients w.r.t. the student match
    those of the KL divergence.

    Per-sample losses are combined according to ``self.reduction``
    (``"mean"``, ``"sum"`` or ``"none"``); ``"mean"`` is used when the
    attribute is absent.
    """

    def forward(self, model_output, real_output):
        """Compute the distillation loss for one batch.

        Args:
            model_output: NxC student logits.
            real_output: NxC teacher logits; must not require gradients.

        Returns:
            A scalar tensor for ``"mean"``/``"sum"`` reduction, or a tensor
            of N per-sample losses for ``"none"``.

        Raises:
            ValueError: if ``real_output`` requires gradients, or if
                ``self.reduction`` is not a supported mode.
        """
        if real_output.requires_grad:
            raise ValueError("real network output should not require gradients.")

        student_log_prob = F.log_softmax(model_output, dim=1)
        teacher_prob = F.softmax(real_output, dim=1)

        # Batched dot product: (N, 1, C) @ (N, C, 1) -> (N, 1, 1), i.e. one
        # value per sample. bmm also rejects mismatched batch/class sizes
        # instead of silently broadcasting.
        per_sample = -torch.bmm(
            teacher_prob.unsqueeze(1), student_log_prob.unsqueeze(2)
        ).reshape(-1)

        # Honor the reduction selected at construction time rather than
        # forcing a mean on every call; fall back to "mean" when the base
        # class did not define the attribute.
        reduction = getattr(self, "reduction", "mean")
        if reduction == "mean":
            return per_sample.mean()
        if reduction == "sum":
            return per_sample.sum()
        if reduction == "none":
            return per_sample
        raise ValueError("unsupported reduction: {!r}".format(reduction))