Spaces:

BorisEm
/

HATSAT

Running on CPU Upgrade

App Files Files Community

BorisEm committed on Sep 24

Commit

0def483

1 Parent(s): 6cc2b3b

Broke down code base into smaller files for readability

Browse files

Files changed (12) hide show

.gitignore +159 -0
app.py +13 -896
config.py +28 -0
interface/__init__.py +8 -0
interface/css_styles.py +65 -0
interface/gradio_app.py +116 -0
model/__init__.py +16 -0
model/components.py +451 -0
model/hat_model.py +221 -0
utils/__init__.py +11 -0
utils/image_utils.py +64 -0
utils/model_utils.py +31 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,159 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be added to the global gitignore or merged into this project gitignore.  For a PyCharm
+#  project, it is possible to include .idea/directory entries, you may need to remove them.
+.idea/

app.py CHANGED Viewed

@@ -1,904 +1,21 @@
-import gradio as gr
-import torch
-import torch.nn as nn
-import numpy as np
-from PIL import Image
-import math
-from einops import rearrange
-import os
-import glob
-import base64
-from io import BytesIO
-# Constants
-MODEL_CHECKPOINT = 'net_g_150000.pth'
-REQUIRED_IMAGE_SIZE = (130, 130)
-WINDOW_SIZE = 16
-UPSCALE_FACTOR = 4
-def to_2tuple(x):
-    """Convert input to tuple of length 2."""
-    if isinstance(x, (tuple, list)):
-        return tuple(x)
-    return (x, x)
-def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
-    """Truncated normal initialization."""
-    def norm_cdf(x):
-        return (1. + math.erf(x / math.sqrt(2.))) / 2.
-    with torch.no_grad():
-        l = norm_cdf((a - mean) / std)
-        u = norm_cdf((b - mean) / std)
-        tensor.uniform_(2 * l - 1, 2 * u - 1)
-        tensor.erfinv_()
-        tensor.mul_(std * math.sqrt(2.))
-        tensor.add_(mean)
-        tensor.clamp_(min=a, max=b)
-        return tensor
-def drop_path(x, drop_prob: float = 0., training: bool = False):
-    if drop_prob == 0. or not training:
-        return x
-    keep_prob = 1 - drop_prob
-    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
-    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
-    random_tensor.floor_()
-    output = x.div(keep_prob) * random_tensor
-    return output
-class DropPath(nn.Module):
-    def __init__(self, drop_prob=None):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-    def forward(self, x):
-        return drop_path(x, self.drop_prob, self.training)
-class ChannelAttention(nn.Module):
-    def __init__(self, num_feat, squeeze_factor=16):
-        super(ChannelAttention, self).__init__()
-        self.attention = nn.Sequential(
-            nn.AdaptiveAvgPool2d(1),
-            nn.Conv2d(num_feat, num_feat // squeeze_factor, 1, padding=0),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(num_feat // squeeze_factor, num_feat, 1, padding=0),
-            nn.Sigmoid())
-    def forward(self, x):
-        y = self.attention(x)
-        return x * y
-class CAB(nn.Module):
-    def __init__(self, num_feat, compress_ratio=3, squeeze_factor=30):
-        super(CAB, self).__init__()
-        self.cab = nn.Sequential(
-            nn.Conv2d(num_feat, num_feat // compress_ratio, 3, 1, 1),
-            nn.GELU(),
-            nn.Conv2d(num_feat // compress_ratio, num_feat, 3, 1, 1),
-            ChannelAttention(num_feat, squeeze_factor)
-        )
-    def forward(self, x):
-        return self.cab(x)
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-def window_partition(x, window_size):
-    b, h, w, c = x.shape
-    x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, c)
-    return windows
-def window_reverse(windows, window_size, h, w):
-    b = int(windows.shape[0] / (h * w / window_size / window_size))
-    x = windows.view(b, h // window_size, w // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
-    return x
-class WindowAttention(nn.Module):
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
-        super().__init__()
-        self.dim = dim
-        self.window_size = window_size
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim**-0.5
-        self.relative_position_bias_table = nn.Parameter(
-            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-        trunc_normal_(self.relative_position_bias_table, std=.02)
-        self.softmax = nn.Softmax(dim=-1)
-    def forward(self, x, rpi, mask=None):
-        b_, n, c = x.shape
-        qkv = self.qkv(x).reshape(b_, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-        relative_position_bias = self.relative_position_bias_table[rpi.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
-        attn = attn + relative_position_bias.unsqueeze(0)
-        if mask is not None:
-            nw = mask.shape[0]
-            attn = attn.view(b_ // nw, nw, self.num_heads, n, n) + mask.unsqueeze(1).unsqueeze(0)
-            attn = attn.view(-1, self.num_heads, n, n)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-        attn = self.attn_drop(attn)
-        x = (attn @ v).transpose(1, 2).reshape(b_, n, c)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-class HAB(nn.Module):
-    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
-                 compress_ratio=3, squeeze_factor=30, conv_scale=0.01, mlp_ratio=4.,
-                 qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
-                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-        if min(self.input_resolution) <= self.window_size:
-            self.shift_size = 0
-            self.window_size = min(self.input_resolution)
-        assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size'
-        self.norm1 = norm_layer(dim)
-        self.attn = WindowAttention(
-            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
-        self.conv_scale = conv_scale
-        self.conv_block = CAB(num_feat=dim, compress_ratio=compress_ratio, squeeze_factor=squeeze_factor)
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-    def forward(self, x, x_size, rpi_sa, attn_mask):
-        h, w = x_size
-        b, _, c = x.shape
-        shortcut = x
-        x = self.norm1(x)
-        x = x.view(b, h, w, c)
-        # Conv_X
-        conv_x = self.conv_block(x.permute(0, 3, 1, 2))
-        conv_x = conv_x.permute(0, 2, 3, 1).contiguous().view(b, h * w, c)
-        # cyclic shift
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-            attn_mask = attn_mask
-        else:
-            shifted_x = x
-            attn_mask = None
-        # partition windows
-        x_windows = window_partition(shifted_x, self.window_size)
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, c)
-        # W-MSA/SW-MSA
-        attn_windows = self.attn(x_windows, rpi=rpi_sa, mask=attn_mask)
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, c)
-        shifted_x = window_reverse(attn_windows, self.window_size, h, w)
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            attn_x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            attn_x = shifted_x
-        attn_x = attn_x.view(b, h * w, c)
-        # FFN
-        x = shortcut + self.drop_path(attn_x) + conv_x * self.conv_scale
-        x = x + self.drop_path(self.mlp(self.norm2(x)))
-        return x
-class OCAB(nn.Module):
-    def __init__(self, dim, input_resolution, window_size, overlap_ratio, num_heads,
-                 qkv_bias=True, qk_scale=None, mlp_ratio=2, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.window_size = window_size
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim**-0.5
-        self.overlap_win_size = int(window_size * overlap_ratio) + window_size
-        self.norm1 = norm_layer(dim)
-        self.qkv = nn.Linear(dim, dim * 3,  bias=qkv_bias)
-        self.unfold = nn.Unfold(kernel_size=(self.overlap_win_size, self.overlap_win_size),
-                               stride=window_size, padding=(self.overlap_win_size-window_size)//2)
-        self.relative_position_bias_table = nn.Parameter(
-            torch.zeros((window_size + self.overlap_win_size - 1) * (window_size + self.overlap_win_size - 1), num_heads))
-        trunc_normal_(self.relative_position_bias_table, std=.02)
-        self.softmax = nn.Softmax(dim=-1)
-        self.proj = nn.Linear(dim,dim)
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=nn.GELU)
-    def forward(self, x, x_size, rpi):
-        h, w = x_size
-        b, _, c = x.shape
-        shortcut = x
-        x = self.norm1(x)
-        x = x.view(b, h, w, c)
-        qkv = self.qkv(x).reshape(b, h, w, 3, c).permute(3, 0, 4, 1, 2)
-        q = qkv[0].permute(0, 2, 3, 1)
-        kv = torch.cat((qkv[1], qkv[2]), dim=1)
-        # partition windows
-        q_windows = window_partition(q, self.window_size)
-        q_windows = q_windows.view(-1, self.window_size * self.window_size, c)
-        kv_windows = self.unfold(kv)
-        kv_windows = rearrange(kv_windows, 'b (nc ch owh oww) nw -> nc (b nw) (owh oww) ch',
-                              nc=2, ch=c, owh=self.overlap_win_size, oww=self.overlap_win_size).contiguous()
-        k_windows, v_windows = kv_windows[0], kv_windows[1]
-        b_, nq, _ = q_windows.shape
-        _, n, _ = k_windows.shape
-        d = self.dim // self.num_heads
-        q = q_windows.reshape(b_, nq, self.num_heads, d).permute(0, 2, 1, 3)
-        k = k_windows.reshape(b_, n, self.num_heads, d).permute(0, 2, 1, 3)
-        v = v_windows.reshape(b_, n, self.num_heads, d).permute(0, 2, 1, 3)
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-        relative_position_bias = self.relative_position_bias_table[rpi.view(-1)].view(
-            self.window_size * self.window_size, self.overlap_win_size * self.overlap_win_size, -1)
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
-        attn = attn + relative_position_bias.unsqueeze(0)
-        attn = self.softmax(attn)
-        attn_windows = (attn @ v).transpose(1, 2).reshape(b_, nq, self.dim)
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, self.dim)
-        x = window_reverse(attn_windows, self.window_size, h, w)
-        x = x.view(b, h * w, self.dim)
-        x = self.proj(x) + shortcut
-        x = x + self.mlp(self.norm2(x))
-        return x
-class AttenBlocks(nn.Module):
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size, compress_ratio,
-                 squeeze_factor, conv_scale, overlap_ratio, mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None,
-                 use_checkpoint=False):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.depth = depth
-        self.use_checkpoint = use_checkpoint
-        # build blocks
-        self.blocks = nn.ModuleList([
-            HAB(dim=dim, input_resolution=input_resolution, num_heads=num_heads, window_size=window_size,
-                shift_size=0 if (i % 2 == 0) else window_size // 2, compress_ratio=compress_ratio,
-                squeeze_factor=squeeze_factor, conv_scale=conv_scale, mlp_ratio=mlp_ratio,
-                qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop,
-                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                norm_layer=norm_layer) for i in range(depth)
-        ])
-        # OCAB
-        self.overlap_attn = OCAB(dim=dim, input_resolution=input_resolution, window_size=window_size,
-                                overlap_ratio=overlap_ratio, num_heads=num_heads, qkv_bias=qkv_bias,
-                                qk_scale=qk_scale, mlp_ratio=mlp_ratio, norm_layer=norm_layer)
-        # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
-        else:
-            self.downsample = None
-    def forward(self, x, x_size, params):
-        for blk in self.blocks:
-            x = blk(x, x_size, params['rpi_sa'], params['attn_mask'])
-        x = self.overlap_attn(x, x_size, params['rpi_oca'])
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
-class RHAG(nn.Module):
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size, compress_ratio,
-                 squeeze_factor, conv_scale, overlap_ratio, mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None,
-                 use_checkpoint=False, img_size=224, patch_size=4, resi_connection='1conv'):
-        super(RHAG, self).__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.residual_group = AttenBlocks(
-            dim=dim, input_resolution=input_resolution, depth=depth, num_heads=num_heads,
-            window_size=window_size, compress_ratio=compress_ratio, squeeze_factor=squeeze_factor,
-            conv_scale=conv_scale, overlap_ratio=overlap_ratio, mlp_ratio=mlp_ratio,
-            qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop,
-            drop_path=drop_path, norm_layer=norm_layer, downsample=downsample,
-            use_checkpoint=use_checkpoint)
-        if resi_connection == '1conv':
-            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
-        elif resi_connection == 'identity':
-            self.conv = nn.Identity()
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
-    def forward(self, x, x_size, params):
-        return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size, params), x_size))) + x
-class PatchEmbed(nn.Module):
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim)
-        else:
-            self.norm = None
-    def forward(self, x):
-        x = x.flatten(2).transpose(1, 2)
-        if self.norm is not None:
-            x = self.norm(x)
-        return x
-class PatchUnEmbed(nn.Module):
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-    def forward(self, x, x_size):
-        x = x.transpose(1, 2).contiguous().view(x.shape[0], self.embed_dim, x_size[0], x_size[1])
-        return x
-class Upsample(nn.Sequential):
-    def __init__(self, scale, num_feat):
-        m = []
-        if (scale & (scale - 1)) == 0:
-            for _ in range(int(math.log(scale, 2))):
-                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
-                m.append(nn.PixelShuffle(2))
-        elif scale == 3:
-            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
-            m.append(nn.PixelShuffle(3))
-        else:
-            raise ValueError(f'scale {scale} is not supported. Supported scales: 2^n and 3.')
-        super(Upsample, self).__init__(*m)
-class HAT(nn.Module):
-    def __init__(self, img_size=64, patch_size=1, in_chans=3, embed_dim=96, depths=(6, 6, 6, 6),
-                 num_heads=(6, 6, 6, 6), window_size=7, compress_ratio=3, squeeze_factor=30,
-                 conv_scale=0.01, overlap_ratio=0.5, mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, norm_layer=nn.LayerNorm,
-                 ape=False, patch_norm=True, use_checkpoint=False, upscale=2, img_range=1.,
-                 upsampler='', resi_connection='1conv', **kwargs):
-        super(HAT, self).__init__()
-        self.window_size = window_size
-        self.shift_size = window_size // 2
-        self.overlap_ratio = overlap_ratio
-        num_in_ch = in_chans
-        num_out_ch = in_chans
-        num_feat = 64
-        self.img_range = img_range
-        if in_chans == 3:
-            rgb_mean = (0.4488, 0.4371, 0.4040)
-            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
-        else:
-            self.mean = torch.zeros(1, 1, 1, 1)
-        self.upscale = upscale
-        self.upsampler = upsampler
-        # relative position index
-        relative_position_index_SA = self.calculate_rpi_sa()
-        relative_position_index_OCA = self.calculate_rpi_oca()
-        self.register_buffer('relative_position_index_SA', relative_position_index_SA)
-        self.register_buffer('relative_position_index_OCA', relative_position_index_OCA)
-        # shallow feature extraction
-        self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
-        # deep feature extraction
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.ape = ape
-        self.patch_norm = patch_norm
-        self.num_features = embed_dim
-        self.mlp_ratio = mlp_ratio
-        # split image into non-overlapping patches
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        num_patches = self.patch_embed.num_patches
-        patches_resolution = self.patch_embed.patches_resolution
-        self.patches_resolution = patches_resolution
-        # merge non-overlapping patches into image
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        # absolute position embedding
-        if self.ape:
-            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
-            trunc_normal_(self.absolute_pos_embed, std=.02)
-        self.pos_drop = nn.Dropout(p=drop_rate)
-        # stochastic depth
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
-        # build Residual Hybrid Attention Groups (RHAG)
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = RHAG(
-                dim=embed_dim,
-                input_resolution=(patches_resolution[0], patches_resolution[1]),
-                depth=depths[i_layer],
-                num_heads=num_heads[i_layer],
-                window_size=window_size,
-                compress_ratio=compress_ratio,
-                squeeze_factor=squeeze_factor,
-                conv_scale=conv_scale,
-                overlap_ratio=overlap_ratio,
-                mlp_ratio=self.mlp_ratio,
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                drop=drop_rate,
-                attn_drop=attn_drop_rate,
-                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
-                norm_layer=norm_layer,
-                downsample=None,
-                use_checkpoint=use_checkpoint,
-                img_size=img_size,
-                patch_size=patch_size,
-                resi_connection=resi_connection)
-            self.layers.append(layer)
-        self.norm = norm_layer(self.num_features)
-        # build the last conv layer in deep feature extraction
-        if resi_connection == '1conv':
-            self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
-        elif resi_connection == 'identity':
-            self.conv_after_body = nn.Identity()
-        # high quality image reconstruction
-        if self.upsampler == 'pixelshuffle':
-            self.conv_before_upsample = nn.Sequential(
-                nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-        self.apply(self._init_weights)
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-    def calculate_rpi_sa(self):
-        coords_h = torch.arange(self.window_size)
-        coords_w = torch.arange(self.window_size)
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
-        coords_flatten = torch.flatten(coords, 1)
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
-        relative_coords[:, :, 0] += self.window_size - 1
-        relative_coords[:, :, 1] += self.window_size - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size - 1
-        relative_position_index = relative_coords.sum(-1)
-        return relative_position_index
-    def calculate_rpi_oca(self):
-        window_size_ori = self.window_size
-        window_size_ext = self.window_size + int(self.overlap_ratio * self.window_size)
-        coords_h = torch.arange(window_size_ori)
-        coords_w = torch.arange(window_size_ori)
-        coords_ori = torch.stack(torch.meshgrid([coords_h, coords_w]))
-        coords_ori_flatten = torch.flatten(coords_ori, 1)
-        coords_h = torch.arange(window_size_ext)
-        coords_w = torch.arange(window_size_ext)
-        coords_ext = torch.stack(torch.meshgrid([coords_h, coords_w]))
-        coords_ext_flatten = torch.flatten(coords_ext, 1)
-        relative_coords = coords_ext_flatten[:, None, :] - coords_ori_flatten[:, :, None]
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
-        relative_coords[:, :, 0] += window_size_ori - window_size_ext + 1
-        relative_coords[:, :, 1] += window_size_ori - window_size_ext + 1
-        relative_coords[:, :, 0] *= window_size_ori + window_size_ext - 1
-        relative_position_index = relative_coords.sum(-1)
-        return relative_position_index
-    def calculate_mask(self, x_size):
-        h, w = x_size
-        img_mask = torch.zeros((1, h, w, 1))
-        h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-        mask_windows = window_partition(img_mask, self.window_size)
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-        return attn_mask
-    @torch.jit.ignore
-    def no_weight_decay(self):
-        return {'absolute_pos_embed'}
-    @torch.jit.ignore
-    def no_weight_decay_keywords(self):
-        return {'relative_position_bias_table'}
-    def forward_features(self, x):
-        x_size = (x.shape[2], x.shape[3])
-        attn_mask = self.calculate_mask(x_size).to(x.device)
-        params = {'attn_mask': attn_mask, 'rpi_sa': self.relative_position_index_SA, 'rpi_oca': self.relative_position_index_OCA}
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-        for layer in self.layers:
-            x = layer(x, x_size, params)
-        x = self.norm(x)
-        x = self.patch_unembed(x, x_size)
-        return x
-    def forward(self, x):
-        self.mean = self.mean.type_as(x)
-        x = (x - self.mean) * self.img_range
-        if self.upsampler == 'pixelshuffle':
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.conv_last(self.upsample(x))
-        x = x / self.img_range + self.mean
-        return x
-# Load the model
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model = HAT(
-    upscale=4,
-    in_chans=3,
-    img_size=128,
-    window_size=16,
-    compress_ratio=3,
-    squeeze_factor=30,
-    conv_scale=0.01,
-    overlap_ratio=0.5,
-    img_range=1.,
-    depths=[6, 6, 6, 6, 6, 6],
-    embed_dim=180,
-    num_heads=[6, 6, 6, 6, 6, 6],
-    mlp_ratio=2,
-    upsampler='pixelshuffle',
-    resi_connection='1conv'
-)
-# Load the fine-tuned weights
-checkpoint = torch.load(MODEL_CHECKPOINT, map_location=device)
-# Try different checkpoint formats
-state_dict = checkpoint.get('params_ema') or checkpoint.get('params') or checkpoint
-model.load_state_dict(state_dict)
-model.to(device)
-model.eval()
-def upscale_image(image):
-    # Convert PIL image to tensor
-    img_np = np.array(image).astype(np.float32) / 255.0
-    img_tensor = torch.from_numpy(img_np).permute(2, 0, 1).unsqueeze(0).to(device)
-    # Ensure the image dimensions are multiples of window_size
-    h, w = img_tensor.shape[2], img_tensor.shape[3]
-    # Pad if necessary
-    pad_h = (WINDOW_SIZE - h % WINDOW_SIZE) % WINDOW_SIZE
-    pad_w = (WINDOW_SIZE - w % WINDOW_SIZE) % WINDOW_SIZE
-    if pad_h > 0 or pad_w > 0:
-        img_tensor = torch.nn.functional.pad(img_tensor, (0, pad_w, 0, pad_h), mode='reflect')
-    with torch.no_grad():
-        output = model(img_tensor)
-    # Remove padding if it was added
-    if pad_h > 0 or pad_w > 0:
-        output = output[:, :, :h*UPSCALE_FACTOR, :w*UPSCALE_FACTOR]
-    # Convert back to PIL image
-    output_np = output.squeeze(0).permute(1, 2, 0).cpu().numpy()
-    output_np = np.clip(output_np * 255.0, 0, 255).astype(np.uint8)
-    return Image.fromarray(output_np)
-# Get sample images
-def get_sample_images():
-    sample_dir = "sample_images"
-    if os.path.exists(sample_dir):
-        image_files = glob.glob(os.path.join(sample_dir, "*.png")) + glob.glob(os.path.join(sample_dir, "*.jpg"))
-        return sorted(image_files)
-    return []
-# Gradio interface using Blocks for better layout control
-def validate_image_size(image):
-    """Validate that the image is exactly the required size"""
-    if image is None:
-        return False, "No image provided"
-    width, height = image.size
-    req_width, req_height = REQUIRED_IMAGE_SIZE
-    if width != req_width or height != req_height:
-        return False, f"Image must be exactly {req_width}x{req_height} pixels. Your image is {width}x{height} pixels."
-    return True, "Valid image size"
-def upscale_and_display(image):
-    if image is None:
-        return None, "Please upload an image or select a sample image."
-    # Validate image size
-    is_valid, message = validate_image_size(image)
-    if not is_valid:
-        return None, f"❌ Error: {message}"
-    try:
-        # Get the super-resolution output
-        upscaled = upscale_image(image)
-        return upscaled, "✅ Image successfully enhanced!"
-    except Exception as e:
-        return None, f"❌ Error processing image: {str(e)}"
-def select_sample_image(image_path):
-    if image_path:
-        return Image.open(image_path)
-    return None
-def image_to_base64(image_path):
-    """Convert image to base64 data URL for CSS background"""
-    img = Image.open(image_path)
-    img.thumbnail((120, 120), Image.Resampling.LANCZOS)
-    buffer = BytesIO()
-    img.save(buffer, format='PNG')
-    img_str = base64.b64encode(buffer.getvalue()).decode()
-    return f"data:image/png;base64,{img_str}"
-# Generate CSS with base64 images
-def generate_css():
-    base_css = """
-/* Target only the image display area, not the whole component */
-.image-container [data-testid="image"] {
-    height: 500px !important;
-    min-height: 500px !important;
-}
-/* Make images fill their containers */
-.image-container img {
-    width: 500px !important;
-    height: 500px !important;
-    object-fit: contain !important;
-    object-position: center !important;
-}
-/* Sample image buttons with background images */
-.sample-image-btn {
-    height: 120px !important;
-    width: 120px !important;
-    background-size: cover !important;
-    background-position: center !important;
-    border: 2px solid #ddd !important;
-    border-radius: 8px !important;
-    cursor: pointer !important;
-    transition: border-color 0.2s !important;
-    margin: 5px !important;
-}
-.sample-image-btn:hover {
-    border-color: #007acc !important;
-}
 """
-    # Add background images for each sample (only if samples exist)
-    sample_images = get_sample_images()
-    if sample_images:
-        for i, img_path in enumerate(sample_images):
-            try:
-                base64_img = image_to_base64(img_path)
-                base_css += f"#sample_btn_{i} {{ background-image: url('{base64_img}'); }}\n"
-            except Exception:
-                # Skip invalid images
-                continue
-    return base_css
-css = generate_css()
-with gr.Blocks(css=css, title="HATSAT - Super-Resolution for Satellite Images") as iface:
-    gr.Markdown("# HATSAT - Super-Resolution for Satellite Images")
-    gr.Markdown(f"Upload a satellite image or select a sample to enhance its resolution by {UPSCALE_FACTOR}x.")
-    gr.Markdown(f"⚠️ **Important**: Images must be exactly **{REQUIRED_IMAGE_SIZE[0]}x{REQUIRED_IMAGE_SIZE[1]} pixels** for the model to work properly.")
-    # Acknowledgments section
-    with gr.Accordion("Acknowledgments", open=False):
-        gr.Markdown("""
-        ### Base Model: HAT (Hybrid Attention Transformer)
-        This model is a fine tuned version of **HAT**:
-        - **GitHub Repository**: [https://github.com/XPixelGroup/HAT](https://github.com/XPixelGroup/HAT)
-        - **Paper**: [Activating More Pixels in Image Super-Resolution Transformer](https://arxiv.org/abs/2205.04437)
-        - **Authors**: Xiangyu Chen, Xintao Wang, Jiantao Zhou, Yu Qiao, Chao Dong
-        ### Training Dataset: SEN2NAIPv2
-        The model was fine-tuned using the **SEN2NAIPv2** dataset:
-        - **HuggingFace Dataset**: [https://huggingface.co/datasets/tacofoundation/SEN2NAIPv2](https://huggingface.co/datasets/tacofoundation/SEN2NAIPv2)
-        - **Description**: High-resolution satellite imagery dataset for super-resolution tasks
-        """)
-    # Sample images
-    sample_images = get_sample_images()
-    sample_buttons = []
-    if sample_images:
-        gr.Markdown("**Sample Images (click to select):**")
-        with gr.Row():
-            for i, img_path in enumerate(sample_images):
-                btn = gr.Button(
-                    "",
-                    elem_id=f"sample_btn_{i}",
-                    elem_classes="sample-image-btn"
-                )
-                sample_buttons.append((btn, img_path))
-    with gr.Row():
-        input_image = gr.Image(
-            type="pil",
-            label=f"Input Image (must be {REQUIRED_IMAGE_SIZE[0]}x{REQUIRED_IMAGE_SIZE[1]} pixels)",
-            elem_classes="image-container",
-            sources=["upload"],
-            height=500,
-            width=500
-        )
-        output_image = gr.Image(
-            type="pil",
-            label=f"Enhanced Output ({UPSCALE_FACTOR}x)",
-            elem_classes="image-container",
-            interactive=False,
-            height=500,
-            width=500,
-            show_download_button=True
-        )
-    submit_btn = gr.Button("Enhance Image", variant="primary")
-    # Status message
-    status_message = gr.Textbox(
-        label="Status",
-        interactive=False,
-        show_label=True
-    )
-    # Event handlers
-    if sample_images:
-        for btn, img_path in sample_buttons:
-            btn.click(fn=lambda path=img_path: select_sample_image(path), outputs=input_image)
-    submit_btn.click(fn=upscale_and_display, inputs=input_image, outputs=[output_image, status_message])
 if __name__ == "__main__":
-    iface.launch()

+"""
+HATSAT - Super-Resolution for Satellite Images
+Main application entry point.
 """
+from utils.model_utils import load_model
+from interface.gradio_app import create_interface
+def main():
+    """Initialize and launch the HATSAT application."""
+    # Load model and get device
+    model, device = load_model()
+    # Create and launch Gradio interface
+    iface = create_interface(model, device)
+    iface.launch()
 if __name__ == "__main__":
+    main()

config.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""
+Configuration constants for HATSAT application.
+"""
+# Model configuration
+MODEL_CHECKPOINT = 'net_g_150000.pth'
+REQUIRED_IMAGE_SIZE = (130, 130)
+WINDOW_SIZE = 16
+UPSCALE_FACTOR = 4
+# Model architecture parameters
+MODEL_CONFIG = {
+    'upscale': 4,
+    'in_chans': 3,
+    'img_size': 128,
+    'window_size': 16,
+    'compress_ratio': 3,
+    'squeeze_factor': 30,
+    'conv_scale': 0.01,
+    'overlap_ratio': 0.5,
+    'img_range': 1.,
+    'depths': [6, 6, 6, 6, 6, 6],
+    'embed_dim': 180,
+    'num_heads': [6, 6, 6, 6, 6, 6],
+    'mlp_ratio': 2,
+    'upsampler': 'pixelshuffle',
+    'resi_connection': '1conv'
+}

interface/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""
+Gradio interface components for HATSAT application.
+"""
+from .gradio_app import create_interface
+from .css_styles import generate_css, get_sample_images
+__all__ = ['create_interface', 'generate_css', 'get_sample_images']

interface/css_styles.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""
+CSS styling and sample image utilities for Gradio interface.
+"""
+import os
+import glob
+from utils.image_utils import image_to_base64
+def get_sample_images():
+    """Get list of sample images."""
+    sample_dir = "sample_images"
+    if os.path.exists(sample_dir):
+        image_files = glob.glob(os.path.join(sample_dir, "*.png")) + glob.glob(os.path.join(sample_dir, "*.jpg"))
+        return sorted(image_files)
+    return []
+def generate_css():
+    """Generate CSS with base64 images for sample buttons."""
+    base_css = """
+/* Target only the image display area, not the whole component */
+.image-container [data-testid="image"] {
+    height: 500px !important;
+    min-height: 500px !important;
+}
+/* Make images fill their containers */
+.image-container img {
+    width: 500px !important;
+    height: 500px !important;
+    object-fit: contain !important;
+    object-position: center !important;
+}
+/* Sample image buttons with background images */
+.sample-image-btn {
+    height: 120px !important;
+    width: 120px !important;
+    background-size: cover !important;
+    background-position: center !important;
+    border: 2px solid #ddd !important;
+    border-radius: 8px !important;
+    cursor: pointer !important;
+    transition: border-color 0.2s !important;
+    margin: 5px !important;
+}
+.sample-image-btn:hover {
+    border-color: #007acc !important;
+}
+"""
+    # Add background images for each sample (only if samples exist)
+    sample_images = get_sample_images()
+    if sample_images:
+        for i, img_path in enumerate(sample_images):
+            try:
+                base64_img = image_to_base64(img_path)
+                base_css += f"#sample_btn_{i} {{ background-image: url('{base64_img}'); }}\n"
+            except Exception:
+                # Skip invalid images
+                continue
+    return base_css

interface/gradio_app.py ADDED Viewed

	@@ -0,0 +1,116 @@

+"""
+Gradio interface for HATSAT application.
+"""
+import gradio as gr
+from PIL import Image
+from config import REQUIRED_IMAGE_SIZE, UPSCALE_FACTOR
+from utils.image_utils import validate_image_size, upscale_image
+from interface.css_styles import generate_css, get_sample_images
+def upscale_and_display(image, model, device):
+    """Process image upload and return upscaled result."""
+    if image is None:
+        return None, "Please upload an image or select a sample image."
+    # Validate image size
+    is_valid, message = validate_image_size(image)
+    if not is_valid:
+        return None, f"❌ Error: {message}"
+    try:
+        # Get the super-resolution output
+        upscaled = upscale_image(image, model, device)
+        return upscaled, "✅ Image successfully enhanced!"
+    except Exception as e:
+        return None, f"❌ Error processing image: {str(e)}"
+def select_sample_image(image_path):
+    """Load and return a sample image."""
+    if image_path:
+        return Image.open(image_path)
+    return None
+def create_interface(model, device):
+    """Build the Gradio Blocks UI for HATSAT and return it without launching.
+    Args:
+        model: Passed unchanged to ``upscale_and_display`` on every submit.
+        device: Passed unchanged to ``upscale_and_display`` on every submit.
+    Returns:
+        The configured ``gr.Blocks`` instance; the caller is expected to launch it.
+    """
+    # The stylesheet also carries the per-sample button backgrounds (#sample_btn_<i>).
+    css = generate_css()
+    with gr.Blocks(css=css, title="HATSAT - Super-Resolution for Satellite Images") as iface:
+        gr.Markdown("# HATSAT - Super-Resolution for Satellite Images")
+        gr.Markdown(f"Upload a satellite image or select a sample to enhance its resolution by {UPSCALE_FACTOR}x.")
+        gr.Markdown(f"⚠️ **Important**: Images must be exactly **{REQUIRED_IMAGE_SIZE[0]}x{REQUIRED_IMAGE_SIZE[1]} pixels** for the model to work properly.")
+        # Acknowledgments section (collapsed by default)
+        with gr.Accordion("Acknowledgments", open=False):
+            gr.Markdown("""
+            ### Base Model: HAT (Hybrid Attention Transformer)
+            This model is a fine tuned version of **HAT**:
+            - **GitHub Repository**: [https://github.com/XPixelGroup/HAT](https://github.com/XPixelGroup/HAT)
+            - **Paper**: [Activating More Pixels in Image Super-Resolution Transformer](https://arxiv.org/abs/2205.04437)
+            - **Authors**: Xiangyu Chen, Xintao Wang, Jiantao Zhou, Yu Qiao, Chao Dong
+            ### Training Dataset: SEN2NAIPv2
+            The model was fine-tuned using the **SEN2NAIPv2** dataset:
+            - **HuggingFace Dataset**: [https://huggingface.co/datasets/tacofoundation/SEN2NAIPv2](https://huggingface.co/datasets/tacofoundation/SEN2NAIPv2)
+            - **Description**: High-resolution satellite imagery dataset for super-resolution tasks
+            """)
+        # Sample images: one blank button per file; the thumbnail comes from the CSS rule
+        # whose selector matches the button's elem_id.
+        sample_images = get_sample_images()
+        sample_buttons = []
+        if sample_images:
+            gr.Markdown("**Sample Images (click to select):**")
+            with gr.Row():
+                for i, img_path in enumerate(sample_images):
+                    btn = gr.Button(
+                        "",
+                        elem_id=f"sample_btn_{i}",
+                        elem_classes="sample-image-btn"
+                    )
+                    # Remember which file each button stands for; handlers are wired below.
+                    sample_buttons.append((btn, img_path))
+        # Input and output panes side by side.
+        with gr.Row():
+            input_image = gr.Image(
+                type="pil",
+                label=f"Input Image (must be {REQUIRED_IMAGE_SIZE[0]}x{REQUIRED_IMAGE_SIZE[1]} pixels)",
+                elem_classes="image-container",
+                sources=["upload"],
+                height=500,
+                width=500
+            )
+            output_image = gr.Image(
+                type="pil",
+                label=f"Enhanced Output ({UPSCALE_FACTOR}x)",
+                elem_classes="image-container",
+                interactive=False,
+                height=500,
+                width=500,
+                show_download_button=True
+            )
+        submit_btn = gr.Button("Enhance Image", variant="primary")
+        # Status message (second output of upscale_and_display)
+        status_message = gr.Textbox(
+            label="Status",
+            interactive=False,
+            show_label=True
+        )
+        # Event handlers
+        if sample_images:
+            for btn, img_path in sample_buttons:
+                # path=img_path binds the current path as a default argument; a plain
+                # closure would see only the last value of the loop variable.
+                btn.click(fn=lambda path=img_path: select_sample_image(path), outputs=input_image)
+        # The lambda closes over model/device so the click only has to supply the image.
+        submit_btn.click(
+            fn=lambda img: upscale_and_display(img, model, device),
+            inputs=input_image,
+            outputs=[output_image, status_message]
+        )
+    return iface

model/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""
+HAT model architecture components.
+"""
+from .hat_model import HAT
+from .components import (
+    DropPath, ChannelAttention, CAB, Mlp,
+    WindowAttention, HAB, OCAB, AttenBlocks,
+    RHAG, PatchEmbed, PatchUnEmbed, Upsample
+)
+__all__ = [
+    'HAT', 'DropPath', 'ChannelAttention', 'CAB', 'Mlp',
+    'WindowAttention', 'HAB', 'OCAB', 'AttenBlocks',
+    'RHAG', 'PatchEmbed', 'PatchUnEmbed', 'Upsample'
+]

model/components.py ADDED Viewed

	@@ -0,0 +1,451 @@

+"""
+HAT model components and building blocks.
+"""
+import torch
+import torch.nn as nn
+import math
+from einops import rearrange
+def to_2tuple(x):
+    """Convert input to tuple of length 2."""
+    if isinstance(x, (tuple, list)):
+        return tuple(x)
+    return (x, x)
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    """Truncated normal initialization."""
+    def norm_cdf(x):
+        return (1. + math.erf(x / math.sqrt(2.))) / 2.
+    with torch.no_grad():
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+        tensor.erfinv_()
+        tensor.mul_(std * math.sqrt(2.))
+        tensor.add_(mean)
+        tensor.clamp_(min=a, max=b)
+        return tensor
+def drop_path(x, drop_prob: float = 0., training: bool = False):
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+    random_tensor.floor_()
+    output = x.div(keep_prob) * random_tensor
+    return output
+class DropPath(nn.Module):
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+class ChannelAttention(nn.Module):
+    def __init__(self, num_feat, squeeze_factor=16):
+        super(ChannelAttention, self).__init__()
+        self.attention = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(num_feat, num_feat // squeeze_factor, 1, padding=0),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(num_feat // squeeze_factor, num_feat, 1, padding=0),
+            nn.Sigmoid())
+    def forward(self, x):
+        y = self.attention(x)
+        return x * y
+class CAB(nn.Module):
+    def __init__(self, num_feat, compress_ratio=3, squeeze_factor=30):
+        super(CAB, self).__init__()
+        self.cab = nn.Sequential(
+            nn.Conv2d(num_feat, num_feat // compress_ratio, 3, 1, 1),
+            nn.GELU(),
+            nn.Conv2d(num_feat // compress_ratio, num_feat, 3, 1, 1),
+            ChannelAttention(num_feat, squeeze_factor)
+        )
+    def forward(self, x):
+        return self.cab(x)
+class Mlp(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+def window_partition(x, window_size):
+    b, h, w, c = x.shape
+    x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, c)
+    return windows
+def window_reverse(windows, window_size, h, w):
+    b = int(windows.shape[0] / (h * w / window_size / window_size))
+    x = windows.view(b, h // window_size, w // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
+    return x
+class WindowAttention(nn.Module):
+    """Multi-head self-attention over the tokens of one window, with a learned relative position bias.
+    Args:
+        dim: Channel count of the input tokens.
+        window_size: Pair (Wh, Ww); sizes the bias table and the bias reshape in ``forward``.
+        num_heads: Number of attention heads; each head works on ``dim // num_heads`` channels.
+        qkv_bias: Whether the fused q/k/v projection has a bias term.
+        qk_scale: Replaces the default ``head_dim ** -0.5`` query scaling when truthy.
+        attn_drop: Dropout probability applied to the attention weights.
+        proj_drop: Dropout probability applied after the output projection.
+    """
+    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        # One learnable bias per head for each of the (2*Wh-1)*(2*Ww-1) table rows.
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))
+        # Single projection producing q, k and v at once.
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+    def forward(self, x, rpi, mask=None):
+        """Attend within each window.
+        Args:
+            x: Tokens of shape (b_, n, c), as unpacked on the first line.
+            rpi: Index tensor into the bias table; it must hold (Wh*Ww)**2 entries for the view below.
+            mask: Optional additive mask whose leading dimension is the window count ``nw``
+                (presumably shaped (nw, n, n) -- TODO confirm against the caller).
+        Returns:
+            Tensor of shape (b_, n, c).
+        """
+        b_, n, c = x.shape
+        # (b_, n, 3*c) -> (3, b_, num_heads, n, c // num_heads)
+        qkv = self.qkv(x).reshape(b_, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+        # Gather one bias per (query, key) pair, then move the head axis to the front.
+        relative_position_bias = self.relative_position_bias_table[rpi.view(-1)].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+        attn = attn + relative_position_bias.unsqueeze(0)
+        if mask is not None:
+            nw = mask.shape[0]
+            # Split the leading axis into (batch, window) so the per-window mask broadcasts over batch and heads.
+            attn = attn.view(b_ // nw, nw, self.num_heads, n, n) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, n, n)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+        attn = self.attn_drop(attn)
+        # Merge the heads back into the channel axis.
+        x = (attn @ v).transpose(1, 2).reshape(b_, n, c)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class HAB(nn.Module):
+    """Hybrid attention block: window attention plus a scaled conv branch (CAB), followed by an MLP.
+    Both the attention/conv stage and the MLP stage are added onto their input (residual).
+    Args:
+        dim: Channel count of the tokens.
+        input_resolution: Pair of ints; only used here to cap ``window_size`` and disable shifting.
+        num_heads: Number of attention heads.
+        window_size: Side length of the attention window.
+        shift_size: Cyclic shift applied before windowing; 0 disables shifting.
+        compress_ratio, squeeze_factor: Forwarded to ``CAB``.
+        conv_scale: Weight of the conv branch in the residual sum.
+        mlp_ratio: MLP hidden width is ``int(dim * mlp_ratio)``.
+        qkv_bias, qk_scale, attn_drop: Forwarded to ``WindowAttention``.
+        drop: Dropout probability for the attention projection and the MLP.
+        drop_path: ``DropPath`` probability; values <= 0 install ``nn.Identity`` instead.
+        act_layer: Activation class for the MLP.
+        norm_layer: Normalisation class, instantiated with ``dim``.
+    """
+    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
+                 compress_ratio=3, squeeze_factor=30, conv_scale=0.01, mlp_ratio=4.,
+                 qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
+                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        # If the smaller input side does not exceed the window, use one unshifted window of that size.
+        if min(self.input_resolution) <= self.window_size:
+            self.shift_size = 0
+            self.window_size = min(self.input_resolution)
+        assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size'
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
+            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+        self.conv_scale = conv_scale
+        self.conv_block = CAB(num_feat=dim, compress_ratio=compress_ratio, squeeze_factor=squeeze_factor)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+    def forward(self, x, x_size, rpi_sa, attn_mask):
+        """Run the block on a token sequence.
+        Args:
+            x: Tokens of shape (b, h*w, c); the view below requires the length to equal h*w.
+            x_size: Pair (h, w) giving the spatial layout of the tokens.
+            rpi_sa: Relative position index passed to ``WindowAttention``.
+            attn_mask: Mask passed to the attention only when ``shift_size > 0``; ignored otherwise.
+        Returns:
+            Tensor of shape (b, h*w, c).
+        """
+        h, w = x_size
+        b, _, c = x.shape
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(b, h, w, c)
+        # Conv_X: the conv branch works channels-first, then is flattened back to tokens.
+        conv_x = self.conv_block(x.permute(0, 3, 1, 2))
+        conv_x = conv_x.permute(0, 2, 3, 1).contiguous().view(b, h * w, c)
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            # No-op assignment: the caller-supplied mask is used as given.
+            attn_mask = attn_mask
+        else:
+            shifted_x = x
+            attn_mask = None
+        # partition windows
+        x_windows = window_partition(shifted_x, self.window_size)
+        x_windows = x_windows.view(-1, self.window_size * self.window_size, c)
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, rpi=rpi_sa, mask=attn_mask)
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, c)
+        shifted_x = window_reverse(attn_windows, self.window_size, h, w)
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            attn_x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            attn_x = shifted_x
+        attn_x = attn_x.view(b, h * w, c)
+        # FFN: residual sum of input, attention output and scaled conv branch, then the MLP residual.
+        x = shortcut + self.drop_path(attn_x) + conv_x * self.conv_scale
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+class OCAB(nn.Module):
+    """Overlapping cross-attention block.
+    Queries come from non-overlapping ``window_size`` windows; keys and values come from
+    larger windows of side ``overlap_win_size`` extracted with ``nn.Unfold``. An MLP with
+    a residual connection follows the attention.
+    Args:
+        dim: Channel count of the tokens.
+        input_resolution: Stored on the instance; not otherwise used in this class.
+        window_size: Side length of the query windows (also the unfold stride).
+        overlap_ratio: Key/value windows have side ``int(window_size * overlap_ratio) + window_size``.
+        num_heads: Number of attention heads.
+        qkv_bias: Whether the fused q/k/v projection has a bias term.
+        qk_scale: Replaces the default ``head_dim ** -0.5`` query scaling when truthy.
+        mlp_ratio: MLP hidden width is ``int(dim * mlp_ratio)``.
+        norm_layer: Normalisation class, instantiated with ``dim``.
+    """
+    def __init__(self, dim, input_resolution, window_size, overlap_ratio, num_heads,
+                 qkv_bias=True, qk_scale=None, mlp_ratio=2, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.window_size = window_size
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.overlap_win_size = int(window_size * overlap_ratio) + window_size
+        self.norm1 = norm_layer(dim)
+        self.qkv = nn.Linear(dim, dim * 3,  bias=qkv_bias)
+        # Stride window_size yields one patch per query window; the padding centres the larger patch on it.
+        self.unfold = nn.Unfold(kernel_size=(self.overlap_win_size, self.overlap_win_size),
+                               stride=window_size, padding=(self.overlap_win_size-window_size)//2)
+        # One bias per head for every relative offset between a query and a key position.
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((window_size + self.overlap_win_size - 1) * (window_size + self.overlap_win_size - 1), num_heads))
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+        self.proj = nn.Linear(dim,dim)
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=nn.GELU)
+    def forward(self, x, x_size, rpi):
+        """Run overlapping cross-attention on a token sequence.
+        Args:
+            x: Tokens of shape (b, h*w, c); the view below requires the length to equal h*w.
+            x_size: Pair (h, w) giving the spatial layout of the tokens.
+            rpi: Index tensor into the bias table; it must hold
+                window_size**2 * overlap_win_size**2 entries for the view below.
+        Returns:
+            Tensor of shape (b, h*w, dim).
+        """
+        h, w = x_size
+        b, _, c = x.shape
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(b, h, w, c)
+        # (b, h, w, 3*c) -> (3, b, c, h, w)
+        qkv = self.qkv(x).reshape(b, h, w, 3, c).permute(3, 0, 4, 1, 2)
+        # q goes back to channels-last for window_partition; k and v are stacked on the channel axis for unfold.
+        q = qkv[0].permute(0, 2, 3, 1)
+        kv = torch.cat((qkv[1], qkv[2]), dim=1)
+        # partition windows
+        q_windows = window_partition(q, self.window_size)
+        q_windows = q_windows.view(-1, self.window_size * self.window_size, c)
+        kv_windows = self.unfold(kv)
+        # Split the unfolded columns into k/v (nc=2) and give each window its own row of tokens.
+        kv_windows = rearrange(kv_windows, 'b (nc ch owh oww) nw -> nc (b nw) (owh oww) ch',
+                              nc=2, ch=c, owh=self.overlap_win_size, oww=self.overlap_win_size).contiguous()
+        k_windows, v_windows = kv_windows[0], kv_windows[1]
+        b_, nq, _ = q_windows.shape
+        _, n, _ = k_windows.shape
+        d = self.dim // self.num_heads
+        # Split channels into heads: (b_, tokens, dim) -> (b_, num_heads, tokens, d)
+        q = q_windows.reshape(b_, nq, self.num_heads, d).permute(0, 2, 1, 3)
+        k = k_windows.reshape(b_, n, self.num_heads, d).permute(0, 2, 1, 3)
+        v = v_windows.reshape(b_, n, self.num_heads, d).permute(0, 2, 1, 3)
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+        relative_position_bias = self.relative_position_bias_table[rpi.view(-1)].view(
+            self.window_size * self.window_size, self.overlap_win_size * self.overlap_win_size, -1)
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+        attn = attn + relative_position_bias.unsqueeze(0)
+        attn = self.softmax(attn)
+        attn_windows = (attn @ v).transpose(1, 2).reshape(b_, nq, self.dim)
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, self.dim)
+        x = window_reverse(attn_windows, self.window_size, h, w)
+        x = x.view(b, h * w, self.dim)
+        # Residual around the attention, then residual around the MLP.
+        x = self.proj(x) + shortcut
+        x = x + self.mlp(self.norm2(x))
+        return x
+class AttenBlocks(nn.Module):
+    """A stack of ``depth`` HAB blocks followed by one OCAB and an optional downsample layer.
+    Even-indexed HABs use no shift; odd-indexed ones shift by ``window_size // 2``.
+    Args:
+        dim: Channel count of the tokens.
+        input_resolution: Forwarded to every HAB, the OCAB and the downsample layer.
+        depth: Number of HAB blocks.
+        num_heads, window_size: Forwarded to the HABs and the OCAB.
+        compress_ratio, squeeze_factor, conv_scale: Forwarded to the HABs.
+        overlap_ratio: Forwarded to the OCAB.
+        mlp_ratio, qkv_bias, qk_scale, norm_layer: Forwarded to the HABs and the OCAB.
+        drop, attn_drop: Forwarded to the HABs.
+        drop_path: Either one value shared by all HABs or a list indexed per block.
+        downsample: Optional class called as ``downsample(input_resolution, dim=..., norm_layer=...)``.
+        use_checkpoint: Stored on the instance; not otherwise used in this class.
+    """
+    def __init__(self, dim, input_resolution, depth, num_heads, window_size, compress_ratio,
+                 squeeze_factor, conv_scale, overlap_ratio, mlp_ratio=4., qkv_bias=True, qk_scale=None,
+                 drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None,
+                 use_checkpoint=False):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        # build blocks
+        self.blocks = nn.ModuleList([
+            HAB(dim=dim, input_resolution=input_resolution, num_heads=num_heads, window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2, compress_ratio=compress_ratio,
+                squeeze_factor=squeeze_factor, conv_scale=conv_scale, mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop,
+                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                norm_layer=norm_layer) for i in range(depth)
+        ])
+        # OCAB
+        self.overlap_attn = OCAB(dim=dim, input_resolution=input_resolution, window_size=window_size,
+                                overlap_ratio=overlap_ratio, num_heads=num_heads, qkv_bias=qkv_bias,
+                                qk_scale=qk_scale, mlp_ratio=mlp_ratio, norm_layer=norm_layer)
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+    def forward(self, x, x_size, params):
+        """Apply all HABs, then the OCAB, then the optional downsample.
+        Args:
+            x: Token tensor handed to each block in turn.
+            x_size: Pair (h, w) forwarded to every block.
+            params: Mapping that must provide 'rpi_sa', 'attn_mask' and 'rpi_oca'.
+        """
+        for blk in self.blocks:
+            x = blk(x, x_size, params['rpi_sa'], params['attn_mask'])
+        x = self.overlap_attn(x, x_size, params['rpi_oca'])
+        if self.downsample is not None:
+            x = self.downsample(x)
+        return x
+class RHAG(nn.Module):
+    def __init__(self, dim, input_resolution, depth, num_heads, window_size, compress_ratio,
+                 squeeze_factor, conv_scale, overlap_ratio, mlp_ratio=4., qkv_bias=True, qk_scale=None,
+                 drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None,
+                 use_checkpoint=False, img_size=224, patch_size=4, resi_connection='1conv'):
+        super(RHAG, self).__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.residual_group = AttenBlocks(
+            dim=dim, input_resolution=input_resolution, depth=depth, num_heads=num_heads,
+            window_size=window_size, compress_ratio=compress_ratio, squeeze_factor=squeeze_factor,
+            conv_scale=conv_scale, overlap_ratio=overlap_ratio, mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop,
+            drop_path=drop_path, norm_layer=norm_layer, downsample=downsample,
+            use_checkpoint=use_checkpoint)
+        if resi_connection == '1conv':
+            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
+        elif resi_connection == 'identity':
+            self.conv = nn.Identity()
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
+        self.patch_unembed = PatchUnEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
+    def forward(self, x, x_size, params):
+        return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size, params), x_size))) + x
+class PatchEmbed(nn.Module):
+    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.patches_resolution = patches_resolution
+        self.num_patches = patches_resolution[0] * patches_resolution[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+    def forward(self, x):
+        x = x.flatten(2).transpose(1, 2)
+        if self.norm is not None:
+            x = self.norm(x)
+        return x
+class PatchUnEmbed(nn.Module):
+    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.patches_resolution = patches_resolution
+        self.num_patches = patches_resolution[0] * patches_resolution[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+    def forward(self, x, x_size):
+        x = x.transpose(1, 2).contiguous().view(x.shape[0], self.embed_dim, x_size[0], x_size[1])
+        return x
+class Upsample(nn.Sequential):
+    def __init__(self, scale, num_feat):
+        m = []
+        if (scale & (scale - 1)) == 0:
+            for _ in range(int(math.log(scale, 2))):
+                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
+                m.append(nn.PixelShuffle(2))
+        elif scale == 3:
+            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
+            m.append(nn.PixelShuffle(3))
+        else:
+            raise ValueError(f'scale {scale} is not supported. Supported scales: 2^n and 3.')
+        super(Upsample, self).__init__(*m)

model/hat_model.py ADDED Viewed

	@@ -0,0 +1,221 @@

+"""
+HAT (Hybrid Attention Transformer) main model implementation.
+"""
+import torch
+import torch.nn as nn
+import math
+from .components import (
+    RHAG, PatchEmbed, PatchUnEmbed, Upsample,
+    trunc_normal_, window_partition, to_2tuple
+)
+class HAT(nn.Module):
+    def __init__(self, img_size=64, patch_size=1, in_chans=3, embed_dim=96, depths=(6, 6, 6, 6),
+                 num_heads=(6, 6, 6, 6), window_size=7, compress_ratio=3, squeeze_factor=30,
+                 conv_scale=0.01, overlap_ratio=0.5, mlp_ratio=4., qkv_bias=True, qk_scale=None,
+                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, norm_layer=nn.LayerNorm,
+                 ape=False, patch_norm=True, use_checkpoint=False, upscale=2, img_range=1.,
+                 upsampler='', resi_connection='1conv', **kwargs):
+        super(HAT, self).__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.overlap_ratio = overlap_ratio
+        num_in_ch = in_chans
+        num_out_ch = in_chans
+        num_feat = 64
+        self.img_range = img_range
+        if in_chans == 3:
+            rgb_mean = (0.4488, 0.4371, 0.4040)
+            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
+        else:
+            self.mean = torch.zeros(1, 1, 1, 1)
+        self.upscale = upscale
+        self.upsampler = upsampler
+        # relative position index
+        relative_position_index_SA = self.calculate_rpi_sa()
+        relative_position_index_OCA = self.calculate_rpi_oca()
+        self.register_buffer('relative_position_index_SA', relative_position_index_SA)
+        self.register_buffer('relative_position_index_OCA', relative_position_index_OCA)
+        # shallow feature extraction
+        self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
+        # deep feature extraction
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.num_features = embed_dim
+        self.mlp_ratio = mlp_ratio
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+        num_patches = self.patch_embed.num_patches
+        patches_resolution = self.patch_embed.patches_resolution
+        self.patches_resolution = patches_resolution
+        # merge non-overlapping patches into image
+        self.patch_unembed = PatchUnEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+        # absolute position embedding
+        if self.ape:
+            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        # stochastic depth
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+        # build Residual Hybrid Attention Groups (RHAG)
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = RHAG(
+                dim=embed_dim,
+                input_resolution=(patches_resolution[0], patches_resolution[1]),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                compress_ratio=compress_ratio,
+                squeeze_factor=squeeze_factor,
+                conv_scale=conv_scale,
+                overlap_ratio=overlap_ratio,
+                mlp_ratio=self.mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=None,
+                use_checkpoint=use_checkpoint,
+                img_size=img_size,
+                patch_size=patch_size,
+                resi_connection=resi_connection)
+            self.layers.append(layer)
+        self.norm = norm_layer(self.num_features)
+        # build the last conv layer in deep feature extraction
+        if resi_connection == '1conv':
+            self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
+        elif resi_connection == 'identity':
+            self.conv_after_body = nn.Identity()
+        # high quality image reconstruction
+        if self.upsampler == 'pixelshuffle':
+            self.conv_before_upsample = nn.Sequential(
+                nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
+            self.upsample = Upsample(upscale, num_feat)
+            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    def calculate_rpi_sa(self):
+        coords_h = torch.arange(self.window_size)
+        coords_w = torch.arange(self.window_size)
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
+        coords_flatten = torch.flatten(coords, 1)
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        relative_coords[:, :, 0] += self.window_size - 1
+        relative_coords[:, :, 1] += self.window_size - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size - 1
+        relative_position_index = relative_coords.sum(-1)
+        return relative_position_index
+    def calculate_rpi_oca(self):
+        window_size_ori = self.window_size
+        window_size_ext = self.window_size + int(self.overlap_ratio * self.window_size)
+        coords_h = torch.arange(window_size_ori)
+        coords_w = torch.arange(window_size_ori)
+        coords_ori = torch.stack(torch.meshgrid([coords_h, coords_w]))
+        coords_ori_flatten = torch.flatten(coords_ori, 1)
+        coords_h = torch.arange(window_size_ext)
+        coords_w = torch.arange(window_size_ext)
+        coords_ext = torch.stack(torch.meshgrid([coords_h, coords_w]))
+        coords_ext_flatten = torch.flatten(coords_ext, 1)
+        relative_coords = coords_ext_flatten[:, None, :] - coords_ori_flatten[:, :, None]
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        relative_coords[:, :, 0] += window_size_ori - window_size_ext + 1
+        relative_coords[:, :, 1] += window_size_ori - window_size_ext + 1
+        relative_coords[:, :, 0] *= window_size_ori + window_size_ext - 1
+        relative_position_index = relative_coords.sum(-1)
+        return relative_position_index
+    def calculate_mask(self, x_size):
+        h, w = x_size
+        img_mask = torch.zeros((1, h, w, 1))
+        h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None))
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+        mask_windows = window_partition(img_mask, self.window_size)
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+        return attn_mask
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'absolute_pos_embed'}
+    @torch.jit.ignore
+    def no_weight_decay_keywords(self):
+        return {'relative_position_bias_table'}
+    def forward_features(self, x):
+        x_size = (x.shape[2], x.shape[3])
+        attn_mask = self.calculate_mask(x_size).to(x.device)
+        params = {'attn_mask': attn_mask, 'rpi_sa': self.relative_position_index_SA, 'rpi_oca': self.relative_position_index_OCA}
+        x = self.patch_embed(x)
+        if self.ape:
+            x = x + self.absolute_pos_embed
+        x = self.pos_drop(x)
+        for layer in self.layers:
+            x = layer(x, x_size, params)
+        x = self.norm(x)
+        x = self.patch_unembed(x, x_size)
+        return x
+    def forward(self, x):
+        self.mean = self.mean.type_as(x)
+        x = (x - self.mean) * self.img_range
+        if self.upsampler == 'pixelshuffle':
+            x = self.conv_first(x)
+            x = self.conv_after_body(self.forward_features(x)) + x
+            x = self.conv_before_upsample(x)
+            x = self.conv_last(self.upsample(x))
+        x = x / self.img_range + self.mean
+        return x

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+"""
+Utility functions for HATSAT application.
+"""
+from .image_utils import upscale_image, validate_image_size, image_to_base64
+from .model_utils import load_model, get_device
+__all__ = [
+    'upscale_image', 'validate_image_size', 'image_to_base64',
+    'load_model', 'get_device'
+]

utils/image_utils.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""
+Image processing utilities.
+"""
+import torch
+import numpy as np
+from PIL import Image
+import base64
+from io import BytesIO
+from config import REQUIRED_IMAGE_SIZE, WINDOW_SIZE, UPSCALE_FACTOR
+def validate_image_size(image):
+    """Validate that the image is exactly the required size."""
+    if image is None:
+        return False, "No image provided"
+    width, height = image.size
+    req_width, req_height = REQUIRED_IMAGE_SIZE
+    if width != req_width or height != req_height:
+        return False, f"Image must be exactly {req_width}x{req_height} pixels. Your image is {width}x{height} pixels."
+    return True, "Valid image size"
+def upscale_image(image, model, device):
+    """Upscale an image using the HAT model."""
+    # Convert PIL image to tensor
+    img_np = np.array(image).astype(np.float32) / 255.0
+    img_tensor = torch.from_numpy(img_np).permute(2, 0, 1).unsqueeze(0).to(device)
+    # Ensure the image dimensions are multiples of window_size
+    h, w = img_tensor.shape[2], img_tensor.shape[3]
+    # Pad if necessary
+    pad_h = (WINDOW_SIZE - h % WINDOW_SIZE) % WINDOW_SIZE
+    pad_w = (WINDOW_SIZE - w % WINDOW_SIZE) % WINDOW_SIZE
+    if pad_h > 0 or pad_w > 0:
+        img_tensor = torch.nn.functional.pad(img_tensor, (0, pad_w, 0, pad_h), mode='reflect')
+    with torch.no_grad():
+        output = model(img_tensor)
+    # Remove padding if it was added
+    if pad_h > 0 or pad_w > 0:
+        output = output[:, :, :h*UPSCALE_FACTOR, :w*UPSCALE_FACTOR]
+    # Convert back to PIL image
+    output_np = output.squeeze(0).permute(1, 2, 0).cpu().numpy()
+    output_np = np.clip(output_np * 255.0, 0, 255).astype(np.uint8)
+    return Image.fromarray(output_np)
+def image_to_base64(image_path):
+    """Convert image to base64 data URL for CSS background."""
+    img = Image.open(image_path)
+    img.thumbnail((120, 120), Image.Resampling.LANCZOS)
+    buffer = BytesIO()
+    img.save(buffer, format='PNG')
+    img_str = base64.b64encode(buffer.getvalue()).decode()
+    return f"data:image/png;base64,{img_str}"

utils/model_utils.py ADDED Viewed

	@@ -0,0 +1,31 @@

+"""
+Model loading and device utilities.
+"""
+import torch
+from model import HAT
+from config import MODEL_CHECKPOINT, MODEL_CONFIG
+def get_device():
+    """Get the appropriate device for model inference."""
+    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+def load_model():
+    """Load and initialize the HAT model with pre-trained weights."""
+    device = get_device()
+    # Initialize model
+    model = HAT(**MODEL_CONFIG)
+    # Load the fine-tuned weights
+    checkpoint = torch.load(MODEL_CHECKPOINT, map_location=device)
+    # Try different checkpoint formats
+    state_dict = checkpoint.get('params_ema') or checkpoint.get('params') or checkpoint
+    model.load_state_dict(state_dict)
+    model.to(device)
+    model.eval()
+    return model, device