# dimemex/src/split-data.py
import pandas as pd
from sklearn.model_selection import train_test_split
import os
# Dynamic path configuration
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "../"))
# Input and output directories
DATA_PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed","nlp")
TEXT_FILE_PATH = os.path.join(DATA_PROCESSED_DIR, "cleaned-ocr.csv")
SIMPLE_FILE_PATH = os.path.join(DATA_PROCESSED_DIR, "datasets", "dataset-simple.csv")
COMPLEX_FILE_PATH = os.path.join(DATA_PROCESSED_DIR, "datasets", "dataset-complex.csv")
OUTPUT_DIR = os.path.join(DATA_PROCESSED_DIR, "splits")
def create_master_splits():
    # Validate that the input files exist
    if not all(os.path.exists(p) for p in [TEXT_FILE_PATH, SIMPLE_FILE_PATH, COMPLEX_FILE_PATH]):
        print("Error: missing input files in data/processed")
        return
    # Load the CSV files
    df_text = pd.read_csv(TEXT_FILE_PATH)
    df_simple = pd.read_csv(SIMPLE_FILE_PATH)
    df_complex = pd.read_csv(COMPLEX_FILE_PATH)
    # Derive an id column from the file name if one is not already present
    if 'id' not in df_simple.columns:
        df_simple['id'] = df_simple['path'].apply(os.path.basename)
    if 'id' not in df_complex.columns:
        df_complex['id'] = df_complex['path'].apply(os.path.basename)
    # Rename the label columns to avoid conflicts
    df_simple = df_simple.rename(columns={'label': 'label-simple'})
    df_complex = df_complex.rename(columns={'label': 'label-complex'})
    # Join the simple and complex labels on the id
    df_labels_merged = pd.merge(df_simple, df_complex[['id', 'label-complex']], on='id', how='inner')
    # Join with the cleaned text to build the master dataset
    df_master = pd.merge(df_labels_merged, df_text[['id', 'text_clean']], on='id', how='inner')
    # Create the output directory if it does not exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Save the full dataset for final training
    complete_data_path = os.path.join(OUTPUT_DIR, "complete-data.csv")
    df_master.to_csv(complete_data_path, index=False)
    print(f"Full dataset saved to: {complete_data_path}")
    # Hold out the test set, preserving the complex-label class balance
    df_temp, df_test = train_test_split(
        df_master,
        test_size=0.15,
        random_state=42,
        stratify=df_master['label-complex']
    )
    # Split the remainder into training and validation
    # (0.176 of the remaining 85% is roughly 15% of the full dataset)
    df_train, df_val = train_test_split(
        df_temp,
        test_size=0.176,
        random_state=42,
        stratify=df_temp['label-complex']
    )
    # Save the partition files
    df_train.to_csv(os.path.join(OUTPUT_DIR, "train.csv"), index=False)
    df_val.to_csv(os.path.join(OUTPUT_DIR, "val.csv"), index=False)
    df_test.to_csv(os.path.join(OUTPUT_DIR, "test.csv"), index=False)
    print(f"Splits saved to: {OUTPUT_DIR}")
if __name__ == "__main__":
    create_master_splits()
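
# Usage note (a minimal sketch, not part of the original script): after running
#     python src/split-data.py
# from the project root, the resulting splits can be loaded to check that
# stratification kept the complex-label distribution comparable across partitions:
#
#     import pandas as pd
#     for name in ("train", "val", "test"):
#         df = pd.read_csv(f"data/processed/nlp/splits/{name}.csv")
#         print(name, df['label-complex'].value_counts(normalize=True).round(3).to_dict())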