File size: 5,900 Bytes
c69c4af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""Contains functions used for loading and logging models """

import sys
import os
from transformers import AutoModel, AutoProcessor 

import os
import core.vision_encoder.pe as pe
import core.vision_encoder.transforms as transforms_pe
from core.vision_encoder.config import PE_VISION_CONFIG
import torchvision.transforms as transforms
from PIL import Image
import requests


def print_trainable_params(model):
    """Print a short summary of trainable vs. total parameter counts for *model*."""
    total = 0
    trainable = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
    share = (trainable / total * 100) if total > 0 else 0
    print("\n--- Summary ---")
    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters:     {total:,}")
    print(f"Percentage:           {share:.2f}%")


def get_backbone_pe(version, print_info=False, apply_migration_flag=False, pretrained=True):
    """
    Load a PE ViT backbone.

    Args:
        version: PE model version key, looked up in PE_VISION_CONFIG.
        print_info: if True, print attention-pool and output-dim details.
        apply_migration_flag: if True, migrate SelfAttention weights for PEFT
            compatibility. Must stay False when testing/resuming from a
            checkpoint that was already migrated.
        pretrained: if True, load pretrained weights.

    Returns:
        (backbone, transform, output_dim): the model, its image transform and
        the dimension of the last-token embedding.
    """
    print(f'Loading {version}...')
    backbone = pe.VisionTransformer.from_config(version, pretrained=pretrained)
    backbone_config = PE_VISION_CONFIG[version]
    transform = transforms_pe.get_image_transform_fix(image_size=backbone_config.image_size)

    print("\nYou can ignore the Missing keys list above.")
    print(f"Applying migration = {apply_migration_flag}")

    if print_info:
        attnpool = backbone.attn_pool
        print(f'embed_dim={attnpool.embed_dim}\nnum_heads={attnpool.num_heads}')
        print(f'OUTPUT DIM = {backbone_config.output_dim}')

    def apply_migration(m):
        # Only SelfAttention modules carry weights that need migrating.
        if isinstance(m, pe.SelfAttention):
            m.migrate_weights()

    if apply_migration_flag:  # when testing/resuming no migration should be used
        print('[MIGRATION] Migrating weights for PEFT compatibility')
        backbone.apply(apply_migration)

    return backbone, transform, backbone_config.output_dim


def get_backbone_dinov3(model_name: str="facebook/dinov3-vitb16-pretrain-lvd1689m", print_info=False):
    """
    Load a DINOv3 ViT backbone from Hugging Face.

    Returns (vision_model, transform, hidden_size): the model, its image
    transform, and the embedding dimension reported by the model config.
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)

    # NOTE(review): the processor object is read directly here, while the
    # siglip2 loader goes through `.image_processor` — presumably the DINOv3
    # AutoProcessor exposes size/mean/std at the top level; confirm against
    # the checkpoint's processor class.
    proc_cfg = processor
    side = proc_cfg.size['height']
    mean = proc_cfg.image_mean
    std = proc_cfg.image_std

    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((side, side), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])

    # Load the model; the whole AutoModel is the vision backbone for DINOv3.
    vision_model = AutoModel.from_pretrained(model_name)

    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\n\n\n{vision_model}')

    return vision_model, transform, vision_model.config.hidden_size


def get_backbone_siglip2(model_name: str='google/siglip2-base-patch16-224', print_info=False):
    """
    Load siglip2 ViT model, return model, transforms and size of output (dimension of embedding of last token)
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)

    # Pull resize/normalization settings out of the checkpoint's image processor.
    img_proc = processor.image_processor
    side = img_proc.size['height']
    mean = img_proc.image_mean
    std = img_proc.image_std

    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((side, side), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])

    # Load the full model but keep only the vision tower.
    full_model = AutoModel.from_pretrained(model_name)
    vision_model = full_model.vision_model

    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\n\n***************MHAP\n{vision_model.head}')

    return vision_model, transform, vision_model.config.hidden_size

def _convert_to_rgb(image: Image.Image) -> Image.Image:
    """Return *image* converted to the RGB color mode."""
    rgb_image = image.convert("RGB")
    return rgb_image


def get_backbone(version: str, apply_migration : bool = False):
    """
    Return a vision transformer backbone selected by name.

    Args:
        version: Name of the backbone to use — a 'PE-Core-*' version, a
            'siglip2' checkpoint name, or a 'dinov3' checkpoint name.
        apply_migration: Forwarded to the PE loader only; migrates
            SelfAttention weights for PEFT compatibility.

    Returns:
        (backbone, transform, output_dim) tuple from the matching loader.

    Raises:
        ValueError: if *version* matches no known backbone family.
    """
    if 'PE-Core-' in version:
        return get_backbone_pe(version, False, apply_migration)
    elif 'siglip2' in version:
        print('[LOADING SIGLIP2]')
        return get_backbone_siglip2(version)
    elif 'dinov3' in version:
        return get_backbone_dinov3(version)
    # Previously an unrecognized version silently fell through and returned
    # None, which made callers crash later on tuple unpacking. Fail loudly here.
    raise ValueError(f"Unknown backbone version: {version!r}")
    


def send_telegram_message(message: str):
    """Best-effort send of *message* to a Telegram chat.

    Reads the bot token from the BOT_TOKEN environment variable and the chat
    id from CHAT_ID (falling back to the previous hard-coded default, so
    existing deployments behave identically). Returns silently when
    credentials are missing and never raises on network failure, so it is
    safe to call from a training loop.
    """
    token = os.getenv("BOT_TOKEN")
    # Chat id is now configurable via the environment; the default preserves
    # the old hard-coded behavior.
    chat_id = os.getenv("CHAT_ID", "1220514183")

    if not token or not chat_id:
        # Silently fail if credentials are not set
        return

    api_url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {
        'chat_id': chat_id,
        'text': message,
        'parse_mode': 'Markdown'  # For nice formatting (bold, italics, etc.)
    }

    try:
        response = requests.post(api_url, data=payload, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # Don't crash the training loop if Telegram is down
        print(f"\nWarning: Could not send Telegram message. Error: {e}")