Initial commit: добавление проекта predictV1
Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
116
educationML/train_model_pro_old.py
Normal file
116
educationML/train_model_pro_old.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from catboost import CatBoostClassifier, Pool
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
print("Загрузка датасета...")

# Load the training table from Parquet. The path is relative to the current
# working directory.
df = pd.read_parquet("data/dataset_from_db.parquet")

# Class balance of the target column "y": its sum is reported as Radiant wins
# and the remainder as Dire wins. Computed once and reused in both lines.
total_rows = len(df)
radiant_wins = df["y"].sum()
radiant_share = df["y"].mean()

print(f"Всего записей: {total_rows}")
print(f"Radiant wins: {radiant_wins} ({radiant_share*100:.1f}%)")
print(f"Dire wins: {total_rows - radiant_wins} ({(1-radiant_share)*100:.1f}%)")

# --- Feature columns for the new dataset format ---
# Every per-team column family has one column per slot, numbered 1..5.
# Sharing a single range keeps the four families in sync.
SLOT_IDS = range(1, 6)

# Hero ID columns (the r_/d_ prefixes presumably stand for Radiant/Dire).
hero_cols_r = [f"r_h{i}" for i in SLOT_IDS]
hero_cols_d = [f"d_h{i}" for i in SLOT_IDS]

# NOTE: player columns (r_p1..r_p5, d_p1..d_p5) are deliberately left out of
# the feature set: there is too little data for them.

# Position columns, one per hero slot.
pos_cols_r = [f"rp_h{i}" for i in SLOT_IDS]
pos_cols_d = [f"dp_h{i}" for i in SLOT_IDS]

# The order of this list is the model's feature order; the same list is
# written to artifacts/feature_order_db.csv after training.
feature_cols = (
    ["is_first_pick_radiant"]
    + hero_cols_r + hero_cols_d
    + pos_cols_r + pos_cols_d
)

# Target column
target_col = "y"

# Separate the features from the target
X = df[feature_cols].copy()
y = df[target_col].astype(int).copy()

# Just in case, make sure the binary flag is stored as int
X["is_first_pick_radiant"] = X["is_first_pick_radiant"].astype(int)

# Split: 10% of the rows are held out, stratified on the target so both parts
# keep the same class ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

print(f"\nTrain: {len(X_train)} записей")
print(f"Test: {len(X_test)} записей")

# Categorical features: heroes and positions (their IDs are categories, not
# quantities). "is_first_pick_radiant" stays numeric.
cat_features = hero_cols_r + hero_cols_d + pos_cols_r + pos_cols_d

# CatBoost accepts cat_features either as column indices or as column names;
# names are passed here.
# NOTE(review): CatBoost expects categorical columns to hold integers or
# strings. Confirm the Parquet dtypes (float/NaN IDs would be rejected).
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# Model
model = CatBoostClassifier(
    iterations=2500,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=2,
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,  # used instead of subsample
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=100,  # log every 100 iterations
    # Overfitting detector: stop after 200 iterations without improvement
    # of the eval metric.
    od_type="Iter",
    od_wait=200,
)

print("\nНачало обучения...")
# NOTE(review): test_pool doubles as the early-stopping / best-model selection
# set (eval_set + use_best_model=True), so any metric later computed on it is
# optimistically biased. Consider carving a separate validation split out of
# the training data and keeping the test set untouched.
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# --- Quality evaluation ---
# Best metrics as reported by CatBoost. Missing entries fall back to NaN so
# the formatted print below never fails.
# NOTE(review): CatBoost may skip AUC on the learn set by default, in which
# case the "learn" value prints as nan. Confirm against the training log.
best_scores = model.get_best_score()
train_auc_cb = best_scores.get("learn", {}).get("AUC", np.nan)
test_auc_cb = best_scores.get("validation", {}).get("AUC", np.nan)

# Re-check AUC directly from predicted probabilities of the positive class.
# test_pool was also the eval_set during fit, so the test figure is not an
# unbiased hold-out estimate.
y_train_proba = model.predict_proba(train_pool)[:, 1]
y_test_proba = model.predict_proba(test_pool)[:, 1]
train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# --- Saving ---
# All outputs go to ./artifacts (relative to the current working directory);
# create it if it does not exist yet.
os.makedirs("artifacts", exist_ok=True)
model_path = "artifacts/model_from_db_pro_v3.cbm"
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")

# Feature order: persisted next to the model so the exact column order used
# for training can be recovered later.
pd.DataFrame(feature_cols, columns=["feature"]).to_csv(
    "artifacts/feature_order_db.csv", index=False
)
print("Порядок фичей сохранен в artifacts/feature_order_db.csv")

# Feature importance, computed on the training pool and sorted from most to
# least important.
importance = model.get_feature_importance(train_pool)
importance_df = (
    pd.DataFrame({"feature": X_train.columns, "importance": importance})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

print("\nВажность признаков (top 25):")
print(importance_df.head(25).to_string(index=False))

# Also persist the full importance table. This relies on the "artifacts"
# directory having been created earlier in the script.
importance_df.to_csv("artifacts/feature_importance_db.csv", index=False)
Reference in New Issue
Block a user