Spaces:

julianzrmrz
/

dimemex

Sleeping

dimemex / src /utils.py

Deploy: Versión inicial con modelos v4

4e07023 10 days ago

1.75 kB

	import re
	import cv2
	import numpy as np
	import pandas as pd

	def clean_text(text):
	    """Normalize a raw text value before feeding it to BERT.

	    The text is lowercased, URLs and @mentions are removed, repeated
	    laughter syllables are collapsed to "jaja", stray symbols (treated as
	    OCR garbage) become spaces, and whitespace runs are squeezed.

	    Args:
	        text: Value to clean. Falsy values and pandas-NA scalars count as
	            missing; anything else is converted with ``str()``.

	    Returns:
	        The cleaned string, or the placeholder "sin texto" when the input
	        is missing or nothing is left after cleaning.
	    """
	    if not text or pd.isna(text):
	        return "sin texto"

	    cleaned = str(text).lower()

	    # Strip URLs and @user mentions. The "|" must stay unescaped: an escaped
	    # pipe matches a literal "|" character, so no URL would ever be removed.
	    cleaned = re.sub(r'http\S+|www\.\S+', '', cleaned)
	    cleaned = re.sub(r'@\w+', '', cleaned)

	    # Collapse laughter of any length ("jajajaja", "hehehe", ...) into "jaja".
	    cleaned = re.sub(r'(?:ja|je|ha|he|lo){2,}', 'jaja', cleaned)

	    # Replace symbols that typically show up as OCR noise with a space.
	    cleaned = re.sub(r'[|_~*^>\[\]]', ' ', cleaned)

	    # \s+ also covers "\n" and "\r", so one pass flattens line breaks too.
	    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

	    return cleaned or "sin texto"

	def preprocess_image_for_ocr(file_bytes):
	    """Decode an uploaded image and run the OCR enhancement filters on it.

	    Args:
	        file_bytes: Either the raw encoded image (bytes, bytearray or
	            memoryview) or a file-like object exposing ``read()``, such as
	            an upload coming from Streamlit.

	    Returns:
	        A tuple ``(binary, original)``:

	        * ``(None, None)`` when the input is empty or cannot be decoded.
	        * ``(None, original)`` when decoding worked but a filter failed.
	        * ``(binary, original)`` on success, where ``binary`` is the
	          upscaled, denoised, adaptively thresholded image and
	          ``original`` is the decoded colour (BGR) image.
	    """
	    # Accept both raw buffers and file-like objects.
	    if isinstance(file_bytes, (bytes, bytearray, memoryview)):
	        raw = file_bytes
	    else:
	        raw = file_bytes.read()

	    # An empty buffer cannot be decoded (OpenCV rejects it with an error),
	    # so report it the same way as any other undecodable input.
	    if not raw:
	        return None, None

	    buffer = np.frombuffer(raw, dtype=np.uint8)
	    img = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
	    if img is None:
	        return None, None

	    # Enhancement pipeline (kept identical to the benchmark settings).
	    try:
	        # 1. Upscale 2x.
	        img_resized = cv2.resize(
	            img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC
	        )
	        # 2. Convert to grayscale.
	        gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)
	        # 3. Denoise.
	        denoised = cv2.fastNlMeansDenoising(
	            gray, None, h=10, templateWindowSize=7, searchWindowSize=21
	        )
	        # 4. Adaptive binarization (block size 11, constant 2).
	        binary = cv2.adaptiveThreshold(
	            denoised,
	            255,
	            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	            cv2.THRESH_BINARY,
	            11,
	            2,
	        )
	    except Exception as e:
	        # Best effort: the caller still gets the decoded original back.
	        print(f"Error en pre-procesamiento: {e}")
	        return None, img
	    else:
	        return binary, img