128 lines
4.3 KiB
Python
128 lines
4.3 KiB
Python
|
|
import os
|
|||
|
|
import pandas as pd
|
|||
|
|
import numpy as np
|
|||
|
|
from catboost import CatBoostClassifier, Pool
|
|||
|
|
from sklearn.model_selection import train_test_split
|
|||
|
|
from sklearn.metrics import roc_auc_score
|
|||
|
|
|
|||
|
|
# Load the prepared dataset and report its size and class balance.
print("Загрузка датасета...")

df = pd.read_parquet("data/dataset_from_db.parquet")

# The target column "y" is summed to count Radiant wins, so its mean is the
# Radiant win share.
total_rows = len(df)
radiant_wins = df["y"].sum()
radiant_share = df["y"].mean()

print(f"Всего записей: {total_rows}")
print(f"Radiant wins: {radiant_wins} ({radiant_share * 100:.1f}%)")
print(f"Dire wins: {total_rows - radiant_wins} ({(1 - radiant_share) * 100:.1f}%)")

# --- Bag-of-Heroes approach ---
# Each hero becomes a binary feature for each of the two teams.

# Pick-slot columns of the Radiant and Dire teams.
hero_cols_r = [f"r_h{slot}" for slot in range(1, 6)]
hero_cols_d = [f"d_h{slot}" for slot in range(1, 6)]

# Gather every distinct non-null value found in any pick column.
all_hero_ids = set().union(
    *(df[col].dropna().unique() for col in hero_cols_r + hero_cols_d)
)

# Keep non-negative IDs only, cast them to int and sort.
# NOTE(review): negative values are presumably placeholders for missing
# picks -- confirm against the dataset export.
all_hero_ids = sorted(int(hero) for hero in all_hero_ids if hero >= 0)
print(f"\nВсего уникальных героев: {len(all_hero_ids)}")

# Build the bag-of-heroes feature matrix: the first-pick flag plus, for every
# hero ID, one 0/1 column per team (radiant_hero_{id} and dire_hero_{id}).
radiant_picks = df[hero_cols_r]
dire_picks = df[hero_cols_d]

# Collect all columns first and create the frame in a single step. Inserting
# two columns per hero into an existing DataFrame fragments it (pandas emits
# a PerformanceWarning) and needs one masked write per pick slot per hero.
feature_columns = {
    "is_first_pick_radiant": df["is_first_pick_radiant"].astype(int),
}
for hero_id in all_hero_ids:
    # 1 when the hero occupies any of the team's five pick slots, else 0.
    feature_columns[f"radiant_hero_{hero_id}"] = (
        radiant_picks.eq(hero_id).any(axis=1).astype("int64")
    )
    feature_columns[f"dire_hero_{hero_id}"] = (
        dire_picks.eq(hero_id).any(axis=1).astype("int64")
    )

# Dict insertion order defines the column order: the flag first, then the
# radiant/dire pair of each hero in ascending hero-ID order.
X = pd.DataFrame(feature_columns)

print(f"Количество признаков: {len(X.columns)}")
print(" - is_first_pick_radiant: 1")
print(f" - radiant_hero_*: {len(all_hero_ids)}")
print(f" - dire_hero_*: {len(all_hero_ids)}")

# Target variable (the column summed as "Radiant wins" in the dataset stats).
y = df["y"].astype(int)

# Hold out 20% of the rows as the final test set, stratified by the target.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Carve a validation split out of the remaining training rows. Early stopping
# and best-iteration selection must not see the test set; otherwise the
# reported test AUC is optimistically biased.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.125,  # 0.125 * 0.8 = 10% of all rows
    random_state=42,
    stratify=y_train_full,
)

print(f"\nTrain: {len(X_train)} записей")
print(f"Valid: {len(X_valid)} записей")
print(f"Test: {len(X_test)} записей")

# All bag-of-heroes features are numeric (0 or 1), so no categorical features
# are declared for the pools.
train_pool = Pool(X_train, y_train)
valid_pool = Pool(X_valid, y_valid)
test_pool = Pool(X_test, y_test)

# Model
model = CatBoostClassifier(
    iterations=2500,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=2,
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=100,
    od_type="Iter",
    od_wait=200,
)

print("\nНачало обучения...")
# The validation pool drives the overfitting detector and use_best_model;
# test_pool is kept for final scoring only.
model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

# --- Quality evaluation ---
# Best scores as reported by CatBoost itself; NaN when a key is missing.
best_scores = model.get_best_score()
learn_scores = best_scores.get("learn", {})
validation_scores = best_scores.get("validation", {})
train_auc_cb = learn_scores.get("AUC", np.nan)
test_auc_cb = validation_scores.get("AUC", np.nan)

# Recompute AUC from the predicted probability of the positive class
# (column 1 of predict_proba).
y_train_proba, y_test_proba = (
    model.predict_proba(pool)[:, 1] for pool in (train_pool, test_pool)
)
train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# --- Saving ---
# Make sure the output directory exists, then store the trained model.
model_path = "artifacts/model_bag_of_heroes.cbm"
os.makedirs("artifacts", exist_ok=True)
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")

# Persist the feature order (the column order of X) as a one-column CSV.
feature_cols = X.columns.tolist()
feature_order_df = pd.DataFrame({"feature": feature_cols})
feature_order_df.to_csv("artifacts/feature_order_bag_of_heroes.csv", index=False)
print("Порядок фичей сохранен в artifacts/feature_order_bag_of_heroes.csv")

# Feature importance, computed on the training pool; the top 30 rows are
# printed and the full table is written to CSV.
importance = model.get_feature_importance(train_pool)

importance_table = pd.DataFrame(
    {"feature": X_train.columns, "importance": importance}
)
# Most important features first, with a fresh 0..n-1 index.
importance_ranked = importance_table.sort_values("importance", ascending=False)
importance_df = importance_ranked.reset_index(drop=True)

print("\nВажность признаков (top 30):")
top_features = importance_df.head(30)
print(top_features.to_string(index=False))

importance_df.to_csv("artifacts/feature_importance_bag_of_heroes.csv", index=False)