Initial commit: добавление проекта predictV1

Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-21 17:22:58 +03:00
commit 8a134239d7
42 changed files with 12831 additions and 0 deletions
--- a/educationML/train_model_bag_of_heroes.py
+++ b/educationML/train_model_bag_of_heroes.py
@@ -0,0 +1,127 @@
+import os
+import pandas as pd
+import numpy as np
+from catboost import CatBoostClassifier, Pool
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score
+
+# Load the prepared match dataset and report its size and class balance.
+print("Загрузка датасета...")
+
+df = pd.read_parquet("data/dataset_from_db.parquet")
+
+# `y` is reported as the Radiant-win indicator; the Dire count and share
+# are derived as its complement.
+total_matches = len(df)
+radiant_wins = df['y'].sum()
+radiant_share = df['y'].mean()
+
+print(f"Всего записей: {total_matches}")
+print(f"Radiant wins: {radiant_wins} ({radiant_share*100:.1f}%)")
+print(f"Dire wins: {total_matches - radiant_wins} ({(1-radiant_share)*100:.1f}%)")
+
+# --- Bag-of-Heroes approach ---
+# Every hero becomes a binary feature for each team.
+
+# Pick-slot columns: five per side.
+hero_cols_r = [f"r_h{slot}" for slot in range(1, 6)]
+hero_cols_d = [f"d_h{slot}" for slot in range(1, 6)]
+
+# Gather every distinct non-null hero ID seen in any pick column.
+all_hero_ids = {
+    hero
+    for pick_col in hero_cols_r + hero_cols_d
+    for hero in df[pick_col].dropna().unique()
+}
+
+# Keep non-negative IDs only, as plain ints in ascending order.
+all_hero_ids = sorted(int(hero) for hero in all_hero_ids if hero >= 0)
+print(f"\nВсего уникальных героев: {len(all_hero_ids)}")
+
+# Build the bag-of-heroes feature matrix.
+# Columns are collected in a dict and turned into a DataFrame in one step:
+# inserting hundreds of columns one at a time into an existing frame
+# fragments it and makes pandas emit a PerformanceWarning.
+radiant_picks = df[hero_cols_r]
+dire_picks = df[hero_cols_d]
+
+# The first-pick flag stays the first column so the column order is unchanged.
+feature_columns = {
+    "is_first_pick_radiant": df["is_first_pick_radiant"].astype(int),
+}
+
+# Two indicators per hero, in the order radiant_hero_{id}, dire_hero_{id}:
+# 1 when the hero appears in any of that side's pick columns, otherwise 0.
+# Missing picks never compare equal to a hero ID, so they contribute 0.
+for hero_id in all_hero_ids:
+    feature_columns[f"radiant_hero_{hero_id}"] = (
+        radiant_picks.eq(hero_id).any(axis=1).astype("int64")
+    )
+    feature_columns[f"dire_hero_{hero_id}"] = (
+        dire_picks.eq(hero_id).any(axis=1).astype("int64")
+    )
+
+# Dict insertion order defines the column order of the resulting frame.
+X = pd.DataFrame(feature_columns)
+
+print(f"Количество признаков: {len(X.columns)}")
+print(f"  - is_first_pick_radiant: 1")
+print(f"  - radiant_hero_*: {len(all_hero_ids)}")
+print(f"  - dire_hero_*: {len(all_hero_ids)}")
+
+# Target: `y` cast to int; copied so it does not share data with `df`.
+y = df["y"].astype(int).copy()
+
+# Hold out 20% of the rows, stratified on the target, with a fixed seed.
+split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
+X_train, X_test, y_train, y_test = split_parts
+
+train_rows = len(X_train)
+test_rows = len(X_test)
+print(f"\nTrain: {train_rows} записей")
+print(f"Test:  {test_rows} записей")
+
+# Every bag-of-heroes feature is a numeric 0/1 flag, so no categorical
+# features are declared on the pools.
+train_pool = Pool(data=X_train, label=y_train)
+test_pool = Pool(data=X_test, label=y_test)
+
+# Model hyperparameters, kept together so they are easy to scan and tweak.
+model_params = {
+    "iterations": 2500,
+    "learning_rate": 0.03,
+    "depth": 7,
+    "l2_leaf_reg": 2,
+    "bootstrap_type": "Bayesian",
+    "bagging_temperature": 1.0,
+    # Optimise log-loss, but track AUC as the evaluation metric.
+    "loss_function": "Logloss",
+    "eval_metric": "AUC",
+    "random_seed": 42,
+    "verbose": 100,
+    # Overfitting-detector settings.
+    "od_type": "Iter",
+    "od_wait": 200,
+}
+model = CatBoostClassifier(**model_params)
+
+print("\nНачало обучения...")
+# NOTE(review): test_pool is used here as eval_set with use_best_model=True
+# and is scored again below as the "test" set, so the reported test AUC is
+# measured on data that also selected the best iteration.
+model.fit(train_pool, eval_set=test_pool, use_best_model=True)
+
+# --- Quality evaluation ---
+# Best scores as reported by CatBoost; NaN when a key is absent.
+best_scores = model.get_best_score()
+learn_scores = best_scores.get("learn", {})
+valid_scores = best_scores.get("validation", {})
+train_auc_cb = learn_scores.get("AUC", np.nan)
+test_auc_cb = valid_scores.get("AUC", np.nan)
+
+# AUC recomputed with scikit-learn from column 1 of predict_proba.
+y_train_proba = model.predict_proba(train_pool)[:, 1]
+train_auc = roc_auc_score(y_train, y_train_proba)
+
+y_test_proba = model.predict_proba(test_pool)[:, 1]
+test_auc = roc_auc_score(y_test, y_test_proba)
+
+print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
+print(f"Recomputed AUC (train/test):      {train_auc:.4f} / {test_auc:.4f}")
+
+# --- Saving ---
+# Make sure the output directory exists, then write the trained model.
+os.makedirs("artifacts", exist_ok=True)
+model_path = "artifacts/model_bag_of_heroes.cbm"
+model.save_model(model_path)
+print(f"\nМодель сохранена: {model_path}")
+
+# Store the feature (column) order of X as a one-column CSV.
+feature_cols = X.columns.tolist()
+feature_order_path = "artifacts/feature_order_bag_of_heroes.csv"
+pd.DataFrame({"feature": feature_cols}).to_csv(feature_order_path, index=False)
+print("Порядок фичей сохранен в artifacts/feature_order_bag_of_heroes.csv")
+
+# Feature importance: the full ranking is saved, the top 30 are printed.
+importance = model.get_feature_importance(train_pool)
+importance_table = pd.DataFrame(
+    {"feature": X_train.columns, "importance": importance}
+)
+
+# Most important first, renumbered from zero.
+importance_df = importance_table.sort_values("importance", ascending=False)
+importance_df = importance_df.reset_index(drop=True)
+
+top_features = importance_df.head(30)
+print("\nВажность признаков (top 30):")
+print(top_features.to_string(index=False))
+
+importance_df.to_csv("artifacts/feature_importance_bag_of_heroes.csv", index=False)