Spaces:
Sleeping
Sleeping
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Dynamic path configuration: resolve everything relative to this script
# so the project can be run from any working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "../"))

# Input and output directories/files under data/processed/nlp.
DATA_PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed", "nlp")
TEXT_FILE_PATH = os.path.join(DATA_PROCESSED_DIR, "cleaned-ocr.csv")
SIMPLE_FILE_PATH = os.path.join(DATA_PROCESSED_DIR, "datasets", "dataset-simple.csv")
COMPLEX_FILE_PATH = os.path.join(DATA_PROCESSED_DIR, "datasets", "dataset-complex.csv")
OUTPUT_DIR = os.path.join(DATA_PROCESSED_DIR, "splits")
def create_master_splits():
    """Build the master dataset and write stratified train/val/test splits.

    Merges the cleaned OCR text with the simple and complex label CSVs on
    an ``id`` column (derived from the file name in ``path`` when absent),
    then writes ``complete-data.csv`` plus ``train.csv`` / ``val.csv`` /
    ``test.csv`` under ``OUTPUT_DIR``.

    Split sizes: 15% test, then 17.6% of the remaining 85% as validation
    (~15% of the total), leaving ~70% for training. Both splits are
    stratified on the complex label to preserve class balance, with a
    fixed random_state for reproducibility.

    Returns:
        None. Prints an error and returns early if any input is missing.
    """
    # Validate input files, reporting exactly which paths are missing
    # (the old message pointed at the wrong directory and named no file).
    input_paths = [TEXT_FILE_PATH, SIMPLE_FILE_PATH, COMPLEX_FILE_PATH]
    missing = [p for p in input_paths if not os.path.exists(p)]
    if missing:
        print("Error: Faltan archivos de entrada: " + ", ".join(missing))
        return

    # Load the three CSV inputs.
    df_text = pd.read_csv(TEXT_FILE_PATH)
    df_simple = pd.read_csv(SIMPLE_FILE_PATH)
    df_complex = pd.read_csv(COMPLEX_FILE_PATH)

    # Derive an 'id' column from the basename of 'path' when absent, so
    # the label frames can be joined with the text frame.
    if 'id' not in df_simple.columns:
        df_simple['id'] = df_simple['path'].apply(os.path.basename)
    if 'id' not in df_complex.columns:
        df_complex['id'] = df_complex['path'].apply(os.path.basename)

    # Rename the label columns so the two label sets do not collide on merge.
    df_simple = df_simple.rename(columns={'label': 'label-simple'})
    df_complex = df_complex.rename(columns={'label': 'label-complex'})

    # Inner-join simple and complex labels on id (keeps rows present in both),
    # then attach the cleaned text to build the master dataset.
    df_labels_merged = pd.merge(df_simple, df_complex[['id', 'label-complex']], on='id', how='inner')
    df_master = pd.merge(df_labels_merged, df_text[['id', 'text_clean']], on='id', how='inner')

    # Ensure the output directory exists.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Persist the full merged dataset for final training runs.
    complete_data_path = os.path.join(OUTPUT_DIR, "complete-data.csv")
    df_master.to_csv(complete_data_path, index=False)
    print(f"Dataset completo guardado en: {complete_data_path}")

    # Hold out 15% as the test set, stratified on the complex label.
    df_temp, df_test = train_test_split(
        df_master,
        test_size=0.15,
        random_state=42,
        stratify=df_master['label-complex']
    )
    # 0.176 of the remaining 85% ≈ 15% of the total,
    # yielding an overall ~70/15/15 train/val/test ratio.
    df_train, df_val = train_test_split(
        df_temp,
        test_size=0.176,
        random_state=42,
        stratify=df_temp['label-complex']
    )

    # Write the three partition files.
    df_train.to_csv(os.path.join(OUTPUT_DIR, "train.csv"), index=False)
    df_val.to_csv(os.path.join(OUTPUT_DIR, "val.csv"), index=False)
    df_test.to_csv(os.path.join(OUTPUT_DIR, "test.csv"), index=False)
    print(f"Splits guardados en: {OUTPUT_DIR}")
# Script entry point: build the master dataset and write the splits.
if __name__ == "__main__":
    create_master_splits()