Files
predictV1/educationML/train_model_with_players.py
mamonov.ep 8a134239d7 Initial commit: добавление проекта predictV1
Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-21 17:22:58 +03:00

157 lines
5.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Load the per-match dataset and print a quick class-balance summary.
# The 'y' column is the target; the labels below treat y == 1 as a Radiant win.
print("Загрузка датасета...")
df = pd.read_parquet("data/dataset_with_players.parquet")

total_matches = len(df)
radiant_wins = df["y"].sum()
radiant_share = df["y"].mean()

print(f"Всего записей (матчей): {total_matches}")
print(f"Radiant wins: {radiant_wins} ({radiant_share*100:.1f}%)")
print(f"Dire wins: {total_matches - radiant_wins} ({(1-radiant_share)*100:.1f}%)")
# --- Build match-level features ---
print("\nСоздание признаков...")

# Column names for the five slots of each team: hero ids, player ids, positions.
hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]
player_cols_r = [f"r_p{i}" for i in range(1, 6)]
player_cols_d = [f"d_p{i}" for i in range(1, 6)]
pos_cols_r = [f"rp_h{i}" for i in range(1, 6)]
pos_cols_d = [f"dp_h{i}" for i in range(1, 6)]


def _team_features(row, side, hero_cols, player_cols, pos_cols):
    """Return the indicator features of one team for a single match row.

    For every slot with a known player (``player_id > 0``) up to three
    keys are emitted, each with value 1:

    * ``{side}_p{player}_h{hero}_pos{position}`` - hero and position known
    * ``{side}_p{player}_h{hero}``               - hero known
    * ``{side}_p{player}_pos{position}``         - position known

    A hero or position counts as known when its id is ``>= 0``.

    Args:
        row: Mapping-like match record (a row yielded by ``df.iterrows()``).
        side: Key prefix, ``"radiant"`` or ``"dire"``.
        hero_cols: Names of the team's hero-id columns.
        player_cols: Names of the team's player-id columns.
        pos_cols: Names of the team's position columns.

    Returns:
        dict mapping feature name to 1, in slot order.

    Raises:
        ValueError: If any id cell cannot be converted with ``int()``
            (for example NaN).
    """
    features = {}
    for hero_col, player_col, pos_col in zip(hero_cols, player_cols, pos_cols):
        hero_id = int(row[hero_col])
        player_id = int(row[player_col])
        position = int(row[pos_col])

        # Every feature is keyed by the player; skip slots without one.
        if player_id <= 0:
            continue

        hero_known = hero_id >= 0
        position_known = position >= 0

        # Insertion order matters: it determines the column order of the
        # feature matrix the model is trained on.
        if hero_known and position_known:
            features[f"{side}_p{player_id}_h{hero_id}_pos{position}"] = 1
        if hero_known:
            features[f"{side}_p{player_id}_h{hero_id}"] = 1
        if position_known:
            features[f"{side}_p{player_id}_pos{position}"] = 1
    return features


# One sparse dict per match: Radiant features, then Dire features, then target.
rows = []
n_matches = len(df)
# Count rows positionally: the index label from iterrows() is not guaranteed
# to be 0..n-1 (or even numeric), so it must not be used for progress output.
for n_done, (_, row) in enumerate(df.iterrows(), start=1):
    features = _team_features(row, "radiant", hero_cols_r, player_cols_r, pos_cols_r)
    features.update(_team_features(row, "dire", hero_cols_d, player_cols_d, pos_cols_d))
    features['y'] = int(row['y'])
    rows.append(features)

    if n_done % 100 == 0:
        print(f"Обработано {n_done}/{n_matches} матчей...")

# Keys missing from a match become NaN in the frame; fill them with 0 ("absent").
df_features = pd.DataFrame(rows).fillna(0)
print(f"\nСоздано признаков: {len(df_features.columns) - 1}")
# Separate the target column from the feature matrix.
y = df_features["y"].astype(int)
X = df_features.drop(columns="y")

# Stratified 80/20 split with a fixed seed so the partition is reproducible.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTrain: {len(X_train)} матчей")
print(f"Test: {len(X_test)} матчей")
# Training: wrap both splits in CatBoost pools and fit the classifier.
train_pool = Pool(data=X_train, label=y_train)
test_pool = Pool(data=X_test, label=y_test)

catboost_params = {
    # Boosting schedule and tree shape.
    "iterations": 1000,
    "learning_rate": 0.05,
    "depth": 5,
    # Regularisation.
    "l2_leaf_reg": 3,
    "min_data_in_leaf": 5,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.5,
    # Objective and the metric tracked on the eval set.
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    # Reproducibility and logging cadence.
    "random_seed": 42,
    "verbose": 50,
    # Overfitting detector: stop after 100 iterations without improvement
    # and keep the best iteration.
    "od_type": "Iter",
    "od_wait": 100,
    "use_best_model": True,
}
model = CatBoostClassifier(**catboost_params)

print("\nНачало обучения...")
# NOTE(review): the held-out test pool is also the eval set that drives early
# stopping and best-iteration selection, so the test AUC reported afterwards
# is optimistically biased. Consider carving a separate validation split.
model.fit(train_pool, eval_set=test_pool)
# Evaluation: AUC as tracked by CatBoost during fitting, plus an independent
# recomputation from predicted probabilities.
best_scores = model.get_best_score()
learn_scores = best_scores.get("learn", {})
valid_scores = best_scores.get("validation", {})
# Fall back to NaN when CatBoost did not report AUC for a dataset.
train_auc_cb = learn_scores.get("AUC", np.nan)
test_auc_cb = valid_scores.get("AUC", np.nan)

# Column 1 of predict_proba is the probability of the positive class (y == 1).
y_train_proba = model.predict_proba(train_pool)[:, 1]
train_auc = roc_auc_score(y_train, y_train_proba)

y_test_proba = model.predict_proba(test_pool)[:, 1]
test_auc = roc_auc_score(y_test, y_test_proba)

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")
# Persist the trained model, creating the artifacts directory if needed.
artifacts_dir = "artifacts"
os.makedirs(artifacts_dir, exist_ok=True)

model_path = f"{artifacts_dir}/model_with_players.cbm"
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")
# Feature importance computed on the training pool: print the top 30 and
# save the full ranking to CSV.
importance = model.get_feature_importance(train_pool)

importance_df = pd.DataFrame({"feature": X_train.columns, "importance": importance})
importance_df = importance_df.sort_values("importance", ascending=False)
importance_df = importance_df.reset_index(drop=True)

print("\nВажность признаков (top 30):")
top_30 = importance_df.head(30)
print(top_30.to_string(index=False))

importance_df.to_csv("artifacts/feature_importance_with_players.csv", index=False)
# Save the feature list for inference.
# The order written here must be the exact column order the model was trained
# on (X_train is a row subset of X, so X.columns is that order). Sorting the
# names, as was done previously, produced a file that disagreed with the
# model's feature layout, so vectors built from it would be misaligned.
all_features = X.columns.tolist()
pd.DataFrame(all_features, columns=["feature"]).to_csv(
    "artifacts/feature_order_with_players.csv", index=False
)
print(f"\nПорядок фичей сохранен в artifacts/feature_order_with_players.csv ({len(all_features)} признаков)")