import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
# Inputs X, y, X_test and ids_test are expected to already exist (produced by the preprocessing step)
# Hold out 20% of the labelled rows as a validation set. Passing y as
# `stratify` asks scikit-learn to keep the class proportions the same in
# both partitions; the fixed seed makes the shuffle repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
# Build a 200-tree random forest and fit it on the training partition.
# class_weight='balanced' reweights classes inversely to their frequency,
# and the seed keeps the forest reproducible. `fit` returns the estimator
# itself, so construction and training are chained into one expression.
model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=200,
).fit(X_train, y_train)
# Score the held-out partition.
# Column 1 of predict_proba is the second entry of model.classes_
# (presumably the positive/fraud class -- confirm against the label encoding).
y_val_proba = model.predict_proba(X_val)[:, 1]
y_val_pred = model.predict(X_val)

# Report metrics (see Evaluation page for detailed results).
# Hard labels feed the confusion matrix and the per-class report, while the
# probability scores feed AUC-ROC. sep="\n" puts each header on its own line
# directly above its metric.
print("CONFUSION MATRIX", confusion_matrix(y_val, y_val_pred), sep="\n")
print(
    "\nCLASSIFICATION REPORT",
    classification_report(y_val, y_val_pred),
    sep="\n",
)
print("\nAUC-ROC:", roc_auc_score(y_val, y_val_proba))
# Decision cut-off applied to the column-1 probability for the final labels
threshold = 0.5

# Refit the same estimator on every labelled row (train + validation)
# before scoring the unlabelled test set.
model.fit(X, y)

# A test row is labelled 1 when its column-1 probability reaches the
# cut-off, otherwise 0.
y_pred_test = (model.predict_proba(X_test)[:, 1] >= threshold).astype(int)

# Pair each test id with its predicted label and write the result as CSV,
# leaving out the DataFrame index column.
submission = pd.DataFrame({"id": ids_test, "FRAUDE": y_pred_test})
submission.to_csv("/content/test_evaluado.csv", index=False)