Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
177 lines
6.3 KiB
Python
177 lines
6.3 KiB
Python
import os
|
||
import sys
|
||
import pandas as pd
|
||
import numpy as np
|
||
from catboost import CatBoostClassifier, Pool
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.metrics import roc_auc_score
|
||
from sklearn.linear_model import LogisticRegression
|
||
import pickle
|
||
|
||
# Put the project root (the parent of this script's directory) on the import path.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

# Load the dataset and report how the target column "y" is balanced.
print("Загрузка датасета...")
df = pd.read_parquet("data/dataset_from_db.parquet")

total_records = len(df)
radiant_wins = df['y'].sum()
radiant_share = df['y'].mean()

print(f"Всего записей: {total_records}")
print(f"Radiant wins: {radiant_wins} ({radiant_share*100:.1f}%)")
print(f"Dire wins: {total_records - radiant_wins} ({(1-radiant_share)*100:.1f}%)")
# Target variable: the "y" column cast to int, taken as a separate copy.
y = df["y"].astype(int).copy()

# Stratified 80/20 train/test split over the row labels of df. Only the
# test-side outputs are bound to names; the train-side outputs are discarded.
split_parts = train_test_split(
    df.index,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
_, X_test_indices, _, y_test = split_parts
print("\n" + "=" * 60)
print("Загрузка базовых моделей...")
print("=" * 60)

# The base models and their feature builders come from the project's `routes`
# package. NOTE(review): these imports sit below the sys.path adjustment made
# earlier in the script, presumably so that `routes` can be resolved -- confirm
# before moving them to the top of the file.

# Model 1: heroes + positions
from routes.predict import (
    build_long_format_input,
    modelPro,
)

# Model 2: bag of heroes
from routes.predict_bag_of_heroes import (
    build_bag_of_heroes_features,
    modelBagOfHeroes,
)

# Model 3: with players
from routes.predict_with_players import (
    build_player_features,
    modelWithPlayers,
)
print("\n" + "=" * 60)
print("Генерация предсказаний базовых моделей...")
print("=" * 60)

# Column names fed to every base model: five numbered slots per side for
# heroes, players and hero positions.
# NOTE(review): the r_/d_ prefixes presumably mean Radiant/Dire -- confirm.
hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]
player_cols_r = [f"r_p{i}" for i in range(1, 6)]
player_cols_d = [f"d_p{i}" for i in range(1, 6)]
pos_cols_r = [f"rp_h{i}" for i in range(1, 6)]
pos_cols_d = [f"dp_h{i}" for i in range(1, 6)]


def _int_or_default(value, default):
    """Return ``int(value)``, or ``default`` when the value is missing (None/NaN).

    ``int(float("nan"))`` raises ValueError, so an empty cell would otherwise
    abort the whole run; the same fallback is used as for an absent column.
    """
    if pd.isna(value):
        return default
    return int(value)


# Payload keys in insertion order, each paired with the fallback used when the
# column is absent from the row or the cell is empty.
payload_defaults = [("is_first_pick_radiant", 0)]
payload_defaults += [
    (col, -1)
    for col in (
        hero_cols_r + hero_cols_d          # heroes
        + player_cols_r + player_cols_d    # players
        + pos_cols_r + pos_cols_d          # positions
    )
]

total_rows = len(df)
predictions_list = []

# `processed` counts rows positionally, so the progress output is correct for
# any index (not only a default 0..n-1 one); iterrows() also avoids a label
# lookup per row.
for processed, (idx, row_data) in enumerate(df.iterrows(), start=1):
    # Build the payload for the current record.
    payload = {
        key: _int_or_default(row_data.get(key, default), default)
        for key, default in payload_defaults
    }

    # Model 1: heroes + positions -- probability of class 1.
    X_with_pos = build_long_format_input(payload)
    pred1 = float(modelPro.predict_proba(X_with_pos)[0, 1])

    # Model 2: bag of heroes -- probability of class 1.
    X_bag = build_bag_of_heroes_features(payload)
    pred2 = float(modelBagOfHeroes.predict_proba(X_bag)[0, 1])

    # Model 3: with players -- probability of class 1.
    X_players = build_player_features(payload)
    pred3 = float(modelWithPlayers.predict_proba(X_players)[0, 1])

    predictions_list.append({
        "pred_with_positions": pred1,
        "pred_bag_of_heroes": pred2,
        "pred_with_players": pred3,
    })

    if processed % 100 == 0:
        print(f"Обработано {processed}/{total_rows} записей...")
# Collect the base-model predictions into the meta-feature table. The rows are
# labelled with df.index so they line up with `y` and with the labels stored in
# X_test_indices even when df does not carry a default 0..n-1 index. A length
# mismatch between predictions_list and df raises here instead of silently
# misaligning features and targets.
X_meta = pd.DataFrame(predictions_list, index=df.index)

print(f"\nСоздано {len(X_meta)} мета-признаков")
print(f"Колонки: {list(X_meta.columns)}")

# Train/test split by the same labels as the earlier hold-out split: rows whose
# label is in X_test_indices form the test part, all others the train part.
is_test_row = X_meta.index.isin(X_test_indices)
X_meta_train = X_meta.loc[~is_test_row]
X_meta_test = X_meta.loc[is_test_row]

is_test_target = y.index.isin(X_test_indices)
y_meta_train = y.loc[~is_test_target]
y_meta_test = y.loc[is_test_target]

print(f"\nMeta Train: {len(X_meta_train)} записей")
print(f"Meta Test: {len(X_meta_test)} записей")
# Train the meta-model on the base-model predictions.
print("\n" + "=" * 60)
print("Обучение мета-модели (Логистическая регрессия)...")
print("=" * 60)

# Logistic regression is used instead of CatBoost to avoid overfitting;
# C is the regularization setting.
meta_model = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
meta_model.fit(X_meta_train, y_meta_train)

# Quality check: ROC AUC of the class-1 probability on the train and test parts.
train_proba_matrix = meta_model.predict_proba(X_meta_train)
test_proba_matrix = meta_model.predict_proba(X_meta_test)
y_train_proba = train_proba_matrix[:, 1]
y_test_proba = test_proba_matrix[:, 1]

train_auc = roc_auc_score(y_meta_train, y_train_proba)
test_auc = roc_auc_score(y_meta_test, y_test_proba)

print(f"\nLogistic Regression AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")
# Persist the fitted meta-model as a pickle inside the artifacts directory,
# creating the directory first if it does not exist yet.
model_path = "artifacts/model_stacking.pkl"
os.makedirs("artifacts", exist_ok=True)

with open(model_path, "wb") as model_file:
    pickle.dump(meta_model, model_file)

print(f"\nМета-модель сохранена: {model_path}")
# Feature importance, read from the logistic-regression coefficients.
intercept = meta_model.intercept_[0]
coefficients = meta_model.coef_[0]

# One row per meta-feature, largest coefficient first.
coefficient_table = pd.DataFrame(
    {"feature": X_meta_train.columns, "coefficient": coefficients}
)
importance_df = coefficient_table.sort_values("coefficient", ascending=False)
importance_df = importance_df.reset_index(drop=True)

print("\nКоэффициенты логистической регрессии:")
print(f"Intercept: {intercept:.4f}")
print(importance_df.to_string(index=False))

# Saved in the old "importance" format for compatibility: the importance of a
# feature is the absolute value of its coefficient.
importance_df_compat = pd.DataFrame(
    {"feature": X_meta_train.columns, "importance": np.abs(coefficients)}
)
importance_df_compat.to_csv("artifacts/feature_importance_stacking.csv", index=False)
print("\n" + "=" * 60)
print("Сравнение моделей на тестовой выборке:")
print("=" * 60)

# Test-set ROC AUC of each base model, computed directly from its own
# prediction column in the meta-feature table.
auc1, auc2, auc3 = (
    roc_auc_score(y_meta_test, X_meta_test[column])
    for column in ("pred_with_positions", "pred_bag_of_heroes", "pred_with_players")
)

# Base models first, then the stacking meta-model for comparison.
for report_line in (
    f"Модель 1 (Heroes + Positions): AUC = {auc1:.4f}",
    f"Модель 2 (Bag of Heroes): AUC = {auc2:.4f}",
    f"Модель 3 (With Players): AUC = {auc3:.4f}",
    f"Мета-модель (Stacking): AUC = {test_auc:.4f}",
):
    print(report_line)