dimemex / src /utils.py
julianzrmrz's picture
Deploy: Versi贸n inicial con modelos v4
4e07023
raw
history blame
1.75 kB
import re
import cv2
import numpy as np
import pandas as pd
def clean_text(text):
"""Limpieza est谩ndar para BERT."""
if not text or pd.isna(text): return "sin texto"
text = str(text).lower()
# Eliminar URLs y usuarios
text = re.sub(r'http\S+|www\.\S+', '', text)
text = re.sub(r'@\w+', '', text)
# Normalizar risas
text = re.sub(r'(ja|je|ha|he|lo){2,}', 'jaja', text)
# Eliminar basura de OCR
text = re.sub(r'[|_~*^>\[\]]', ' ', text)
# Espacios y saltos
text = text.replace('\n', ' ').replace('\r', ' ')
text = re.sub(r'\s+', ' ', text).strip()
return text if text else "sin texto"
def preprocess_image_for_ocr(file_bytes):
"""
Recibe bytes (desde Streamlit) y aplica filtros de OpenCV.
Retorna: (imagen_binarizada, imagen_original_cv2)
"""
# Convertir bytes a array numpy para OpenCV
file_bytes = np.asarray(bytearray(file_bytes.read()), dtype=np.uint8)
img = cv2.imdecode(file_bytes, 1) # 1 = Color BGR
if img is None: return None, None
# Pipeline de Mejora (Igual al benchmark)
try:
# 1. Upscaling (2x)
img_resized = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
# 2. Escala de Grises
gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)
# 3. Denoising
denoised = cv2.fastNlMeansDenoising(gray, None, h=10, templateWindowSize=7, searchWindowSize=21)
# 4. Binarizaci贸n Adaptativa
binary = cv2.adaptiveThreshold(
denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)
return binary, img
except Exception as e:
print(f"Error en pre-procesamiento: {e}")
return None, img