Files
predictV1/educationML/train_model_with_players.py

157 lines
5.4 KiB
Python
Raw Normal View History

import os
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Load the prepared match-level dataset and report the class balance.
print("Загрузка датасета...")
df = pd.read_parquet("data/dataset_with_players.parquet")

# The 'y' column is reported below as "Radiant wins", i.e. y == 1 is
# counted as a Radiant victory and the remainder as Dire victories.
total_matches = len(df)
radiant_wins = df['y'].sum()
radiant_share = df['y'].mean()

print(f"Всего записей (матчей): {total_matches}")
print(f"Radiant wins: {radiant_wins} ({radiant_share*100:.1f}%)")
print(f"Dire wins: {total_matches - radiant_wins} ({(1-radiant_share)*100:.1f}%)")
# --- Build match-level features ---
print("\nСоздание признаков...")
hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]
player_cols_r = [f"r_p{i}" for i in range(1, 6)]
player_cols_d = [f"d_p{i}" for i in range(1, 6)]
pos_cols_r = [f"rp_h{i}" for i in range(1, 6)]
pos_cols_d = [f"dp_h{i}" for i in range(1, 6)]


def _team_features(row, side, hero_cols, player_cols, pos_cols):
    """Return the indicator features of one team for a single match.

    For each of the team's slots up to three keys (all with value 1) are
    produced, in this order:

    * ``{side}_p{player_id}_h{hero_id}_pos{position}`` - player + hero + position
    * ``{side}_p{player_id}_h{hero_id}``               - player + hero
    * ``{side}_p{player_id}_pos{position}``            - player + position

    Slots with ``player_id <= 0`` produce nothing; a negative ``hero_id`` or
    ``position`` suppresses only the keys that would contain it (presumably
    these are "unknown" sentinels - confirm against the dataset builder).

    Args:
        row: mapping-like match record (e.g. a row yielded by ``iterrows``).
        side: key prefix, ``"radiant"`` or ``"dire"``.
        hero_cols: column names holding the hero id of each slot.
        player_cols: column names holding the player id of each slot.
        pos_cols: column names holding the position of each slot.

    Returns:
        dict mapping feature name to 1, in insertion order.

    Raises:
        ValueError: if any of the read values cannot be converted by ``int``
            (e.g. NaN).
    """
    features = {}
    for hero_col, player_col, pos_col in zip(hero_cols, player_cols, pos_cols):
        # All three values are converted up front so that malformed data
        # fails loudly regardless of which features end up being emitted.
        hero_id = int(row[hero_col])
        player_id = int(row[player_col])
        position = int(row[pos_col])
        if player_id <= 0:
            continue
        has_hero = hero_id >= 0
        has_position = position >= 0
        if has_hero and has_position:
            features[f"{side}_p{player_id}_h{hero_id}_pos{position}"] = 1
        if has_hero:
            features[f"{side}_p{player_id}_h{hero_id}"] = 1
        if has_position:
            features[f"{side}_p{player_id}_pos{position}"] = 1
    return features


# One sparse dict per match; keys absent from a match become 0 after fillna.
rows = []
total_matches = len(df)
# Progress is counted with enumerate rather than derived from the index label
# yielded by iterrows(): the label is not guaranteed to be 0..n-1 (or even
# numeric), which made the progress output wrong or raised a TypeError.
for processed, (_, row) in enumerate(df.iterrows(), start=1):
    features = _team_features(row, "radiant", hero_cols_r, player_cols_r, pos_cols_r)
    features.update(_team_features(row, "dire", hero_cols_d, player_cols_d, pos_cols_d))
    features['y'] = int(row['y'])
    rows.append(features)
    if processed % 100 == 0:
        print(f"Обработано {processed}/{total_matches} матчей...")
df_features = pd.DataFrame(rows).fillna(0)
# The target column 'y' is excluded from the reported feature count.
print(f"\nСоздано признаков: {len(df_features.columns) - 1}")
# Separate the feature matrix from the integer target column.
X = df_features.drop(columns='y')
y = df_features['y'].astype(int)

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTrain: {len(X_train)} матчей")
print(f"Test: {len(X_test)} матчей")
# Training
# Early stopping (od_type/od_wait) and use_best_model=True pick the number of
# trees from the eval set. Using the test set for that would tune the model on
# the very data the final test AUC is reported on, so a separate validation
# split is carved out of the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)
fit_pool = Pool(X_fit, y_fit)
val_pool = Pool(X_val, y_val)
# Pools over the full train / test splits are still built here because the
# evaluation and feature-importance steps later in the script use them.
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=5,
    l2_leaf_reg=3,
    min_data_in_leaf=5,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=50,
    od_type="Iter",
    od_wait=100,
    use_best_model=True,
)
print("\nНачало обучения...")
# The test pool is deliberately NOT passed as eval_set (see the note above).
model.fit(fit_pool, eval_set=val_pool)
# Evaluation
# Best metric values as tracked by CatBoost during fitting; NaN is used when
# a dataset or the AUC entry is missing from the returned mapping.
best_scores = model.get_best_score()
learn_scores = best_scores.get("learn", {})
valid_scores = best_scores.get("validation", {})
train_auc_cb = learn_scores.get("AUC", np.nan)
test_auc_cb = valid_scores.get("AUC", np.nan)

# Recompute AUC from the model's predictions; column 1 of predict_proba is
# taken as the score (positive-class probability per the CatBoost API).
train_proba_matrix = model.predict_proba(train_pool)
test_proba_matrix = model.predict_proba(test_pool)
y_train_proba = train_proba_matrix[:, 1]
y_test_proba = test_proba_matrix[:, 1]
train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")
# Persist the trained model, creating the output directory if needed.
model_path = "artifacts/model_with_players.cbm"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")
# Feature importance: print the 30 highest-ranked features and write the
# full ranking to CSV.
importance = model.get_feature_importance(train_pool)
importance_df = pd.DataFrame({"feature": X_train.columns, "importance": importance})
importance_df = importance_df.sort_values("importance", ascending=False)
importance_df = importance_df.reset_index(drop=True)

top_features = importance_df.head(30)
print("\nВажность признаков (top 30):")
print(top_features.to_string(index=False))

importance_df.to_csv("artifacts/feature_importance_with_players.csv", index=False)
# Save the full feature list for inference.
# The list is written in the exact column order of X, i.e. the order the model
# was trained on. It must not be sorted: an inference matrix built from a
# re-ordered list would not line up with the model's feature positions.
all_features = X.columns.tolist()
pd.DataFrame(all_features, columns=["feature"]).to_csv(
    "artifacts/feature_order_with_players.csv", index=False
)
print(f"\nПорядок фичей сохранен в artifacts/feature_order_with_players.csv ({len(all_features)} признаков)")