Spaces:
Running
Running
| import pandas as pd | |
| import json | |
| import os | |
# Dynamic path configuration: every location is derived from where this
# script lives, so the script does not depend on the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# The project root is the parent of the directory containing this script.
PROJECT_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir))

# Input (raw training data) and output (generated dataset CSVs) directories.
_DATA_DIR = os.path.join(PROJECT_ROOT, "data")
INPUT_DIR = os.path.join(_DATA_DIR, "train")
OUTPUT_DIR = os.path.join(_DATA_DIR, "processed", "datasets")
# Per-task configuration, keyed by task name:
#   "file" - name of the label CSV for that task.
#   "cols" - names for that task's label columns (presumably listed in the
#            same order as the columns of the label file - confirm).
# NOTE(review): "classicism" looks like a typo for "classism"; it is runtime
# data, so confirm with any consumer of this mapping before renaming it.
TASKS = {
    "simple": {
        "file": "label_simple.csv",
        "cols": [
            "none",
            "inappropriate",
            "hate_speech",
        ],
    },
    "complex": {
        "file": "label_complex.csv",
        "cols": [
            "none",
            "inappropriate",
            "sexism",
            "racism",
            "classicism",
            "other",
        ],
    },
}
def create_dataset_csv(task_name):
    """Build ``dataset-<task_name>.csv`` from the training metadata and labels.

    Loads ``train_data.json`` and the label CSV configured for ``task_name``
    in ``TASKS`` (both read from ``INPUT_DIR``), adds a numeric ``label``
    column and an image ``path`` column, and writes the columns
    ``path``, ``text``, ``description`` and ``label`` to ``OUTPUT_DIR``.

    Args:
        task_name: Key into ``TASKS`` selecting which label file to use; it
            is also embedded in the output file name.

    Returns:
        None. When ``INPUT_DIR`` does not exist an error message is printed
        and the function returns without writing anything.

    Raises:
        KeyError: If ``task_name`` is not a key of ``TASKS``.
    """
    # Guard clause: without the input directory there is nothing to process.
    if not os.path.exists(INPUT_DIR):
        print(f"Error: No se encuentra el directorio {INPUT_DIR}")
        return

    # Metadata records from the JSON file become the base DataFrame.
    metadata_file = os.path.join(INPUT_DIR, "train_data.json")
    with open(metadata_file, 'r', encoding='utf-8') as handle:
        records = json.load(handle)
    dataset = pd.DataFrame(records)

    # Label matrix for the requested task; the CSV is read without a header
    # row, so every line of the file is treated as data.
    task_conf = TASKS[task_name]
    label_file = os.path.join(INPUT_DIR, task_conf["file"])
    label_matrix = pd.read_csv(label_file, header=None)

    # The position of the largest value in each row becomes the numeric label
    # (presumably the rows are one-hot encoded - confirm against the data).
    dataset['label'] = label_matrix.values.argmax(axis=1)

    # Relative image path of the form data/train/images/<MEME-ID>.
    images_dir = os.path.join("data", "train", "images")

    def _image_path(meme_id):
        return os.path.join(images_dir, meme_id)

    dataset['path'] = dataset['MEME-ID'].apply(_image_path)

    # Keep only the columns that belong in the exported dataset.
    export_columns = ['path', 'text', 'description', 'label']
    export_frame = dataset[export_columns]

    # Ensure the destination exists, then write the CSV without the index.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_file = os.path.join(OUTPUT_DIR, f"dataset-{task_name}.csv")
    export_frame.to_csv(output_file, index=False)
    print(f"Archivo generado: {output_file}")
if __name__ == "__main__":
    # Script entry point: generate the "simple" dataset first, then "complex".
    for task in ("simple", "complex"):
        create_dataset_csv(task)