10 changes: 10 additions & 0 deletions DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/clip_sdpa.py
@@ -61,6 +61,16 @@ def forward(self, x: torch.Tensor):


def get_abs_pos(abs_pos, tgt_size):
"""

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation

Resize absolute positional embeddings to target size if necessary.

Args:
abs_pos (torch.Tensor): [L, C] absolute positional embeddings.
tgt_size (int): M, the target number of positions (assumed to form a square grid).

Returns:
torch.Tensor: resized absolute positional embeddings of shape [M, C].
"""
# abs_pos: L, C
# tgt_size: M
# return: M, C
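For reference, a minimal sketch of the resizing the new docstring describes, written against the [L, C] in / [M, C] out shapes from the inline comments above; the function name and the bicubic interpolation mode are assumptions for illustration, not taken verbatim from the patched file:

import math

import torch
import torch.nn.functional as F

def get_abs_pos_sketch(abs_pos: torch.Tensor, tgt_size: int) -> torch.Tensor:
    # abs_pos: [L, C] embeddings for an S x S grid (L = S * S).
    # tgt_size: M, the target token count for a T x T grid (M = T * T).
    src_size = int(math.sqrt(abs_pos.size(0)))
    dst_size = int(math.sqrt(tgt_size))
    if src_size == dst_size:
        return abs_pos
    # [L, C] -> [1, C, S, S], interpolate to [1, C, T, T], flatten to [M, C].
    grid = abs_pos.reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2)
    grid = F.interpolate(grid.float(), size=(dst_size, dst_size),
                         mode="bicubic", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(-1, abs_pos.size(1)).to(abs_pos.dtype)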
18 changes: 16 additions & 2 deletions DeepSeek-OCR-master/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py
@@ -41,6 +41,20 @@ def get_abs_pos(abs_pos, tgt_size):


class MLPBlock(nn.Module):
"""
Simple MLP block with two linear layers and activation.

Args:
embedding_dim (int): Input and output feature dimension.
mlp_dim (int): Hidden layer dimension.
act (nn.Module): Activation function class (default nn.GELU).

Forward Args:
x (torch.Tensor): Input tensor of shape [B, N, embedding_dim].

Forward Returns:
torch.Tensor: Output tensor of same shape [B, N, embedding_dim].
"""
def __init__(
self,
embedding_dim: int,
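Before the next hunk, a self-contained sketch of the interface the MLPBlock docstring above documents (two linear layers with an activation in between); the class name is illustrative, and details of the patched class may differ:

from typing import Type

import torch
from torch import nn

class MLPBlockSketch(nn.Module):
    def __init__(self, embedding_dim: int, mlp_dim: int,
                 act: Type[nn.Module] = nn.GELU) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)   # expand
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)   # project back
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # [B, N, embedding_dim] -> [B, N, mlp_dim] -> [B, N, embedding_dim]
        return self.lin2(self.act(self.lin1(x)))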
@@ -359,7 +373,7 @@ def window_unpartition(
hw (Tuple): original height and width (H, W) before padding.

Returns:
-    x: unpartitioned sequences with [B, H, W, C].
+    torch.Tensor: Reconstructed tensor of shape [B, H, W, C].
"""
Hp, Wp = pad_hw
H, W = hw
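The Returns fix above is easier to check against a sketch of the whole operation: window sequences are folded back into the padded grid, then the padding is cropped away. Shapes follow the docstring; the body is an illustrative reconstruction, not the patched code:

from typing import Tuple

import torch

def window_unpartition_sketch(windows: torch.Tensor, window_size: int,
                              pad_hw: Tuple[int, int],
                              hw: Tuple[int, int]) -> torch.Tensor:
    Hp, Wp = pad_hw  # padded size, each a multiple of window_size
    H, W = hw        # original size before padding
    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
    # [B * num_windows, window_size, window_size, C] -> [B, Hp, Wp, C]
    x = windows.view(B, Hp // window_size, Wp // window_size,
                     window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
    return x[:, :H, :W, :]  # crop padding -> [B, H, W, C]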
@@ -525,4 +539,4 @@ def _build_sam(
# tob
image_encoder.load_state_dict({k[30:]: v for k, v in state_dict.items() if 'vision_tower_high' in k}, strict=True)
print(checkpoint)
-return image_encoder
\ No newline at end of file
+return image_encoder
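Finally, the load_state_dict call above selects checkpoint keys containing 'vision_tower_high' and slices a fixed 30-character prefix off each key. A hedged sketch of the same idea with an explicit prefix; the exact prefix string in the checkpoint is an assumption here, hence the placeholder rather than a hard-coded slice:

def strip_prefix(state_dict: dict, prefix: str) -> dict:
    # Keep only the keys under `prefix` and drop it, so the sub-module
    # can load its weights with strict=True.
    return {k[len(prefix):]: v for k, v in state_dict.items()
            if k.startswith(prefix)}

# Hypothetical usage; the patched code slices k[30:] instead of using a
# named prefix, and the real prefix may differ:
# image_encoder.load_state_dict(
#     strip_prefix(state_dict, "<vision_tower_high prefix>"), strict=True)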