Initial commit: добавление проекта predictV1
Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
94
educationML/build_dataset_pro.py
Normal file
94
educationML/build_dataset_pro.py
Normal file
@@ -0,0 +1,94 @@
|
||||
import pandas as pd
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
def get_db_connection():
    """Return a new psycopg2 connection to the korobka_db database.

    Connection parameters may be overridden via the DB_HOST, DB_PORT,
    DB_NAME, DB_USER and DB_PASSWORD environment variables; the previous
    hard-coded values are kept as defaults for backward compatibility.
    NOTE(review): credentials should not live in source code — move them
    fully into environment/secret storage.
    """
    import os  # local import keeps this edit self-contained

    return psycopg2.connect(
        host=os.environ.get("DB_HOST", "localhost"),
        port=int(os.environ.get("DB_PORT", "5432")),
        database=os.environ.get("DB_NAME", "korobka_db"),
        user=os.environ.get("DB_USER", "postgres"),
        password=os.environ.get("DB_PASSWORD", "postgres"),
    )
|
||||
|
||||
# Load the source tables from the database.
print("Загрузка данных из БД...")

conn = get_db_connection()

# try/finally guarantees the connection is closed even if a query fails
# (the original leaked the connection on any exception).
try:
    # Load matches (pro games only)
    matches_df = pd.read_sql_query("""
        SELECT id as match_id, radiant_team_id, dire_team_id, radiant_win
        FROM matches
        WHERE "source" = 'pro'
        ORDER BY id
    """, conn)

    print(f"Загружено {len(matches_df)} матчей")

    # Load per-hero pick details
    details_df = pd.read_sql_query("""
        SELECT match_id, hero_id, team, players_id, pos
        FROM details_match
        WHERE "source" = 'pro'
        ORDER BY match_id
    """, conn)

    print(f"Загружено {len(details_df)} записей героев")
finally:
    conn.close()
|
||||
|
||||
# Build the fixed hero/player/position slots for each match.
def slots_from_picks(group):
    """Collapse one match's pick rows into fixed per-side slots.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single match with columns ``team`` (0 = Radiant,
        1 = Dire), ``hero_id``, ``players_id`` and ``pos``.

    Returns
    -------
    pd.Series
        Slots r_h1..r_h5 / d_h1..d_h5 (heroes), rp_h* / dp_h*
        (positions), r_p* / d_p* (players); missing slots are filled
        with -1, plus the ``is_first_pick_radiant`` flag.
    """
    # Split each side once instead of re-filtering the group six times.
    radiant = group[group['team'] == 0]
    dire = group[group['team'] == 1]

    r_heroes = radiant['hero_id'].tolist()[:5]
    r_players = radiant['players_id'].tolist()[:5]
    r_pos = radiant['pos'].tolist()[:5]

    d_heroes = dire['hero_id'].tolist()[:5]
    d_players = dire['players_id'].tolist()[:5]
    d_pos = dire['pos'].tolist()[:5]

    row = {}
    # Five slots per side; -1 pads any side with fewer than five picks.
    for i in range(5):
        row[f"r_h{i+1}"] = r_heroes[i] if i < len(r_heroes) else -1
        row[f"d_h{i+1}"] = d_heroes[i] if i < len(d_heroes) else -1

        row[f"rp_h{i+1}"] = r_pos[i] if i < len(r_pos) else -1
        row[f"dp_h{i+1}"] = d_pos[i] if i < len(d_pos) else -1

        row[f"r_p{i+1}"] = r_players[i] if i < len(r_players) else -1
        row[f"d_p{i+1}"] = d_players[i] if i < len(d_players) else -1

    # First-pick flag: the first row in pick order tells which side picked first.
    fp_team = group.iloc[0]['team'] if len(group) > 0 else 0
    row["is_first_pick_radiant"] = 1 if fp_team == 0 else 0

    return pd.Series(row)
|
||||
|
||||
# One slot row per match.
slots_df = details_df.groupby("match_id").apply(slots_from_picks).reset_index()

# Join slots with the match-level info.
dataset = matches_df.merge(slots_df, on="match_id", how="inner")

# Target variable: 1 if Radiant won.
dataset['y'] = dataset['radiant_win'].astype(int)

# Select the output columns in a fixed order.
final_df = dataset[['match_id', 'is_first_pick_radiant',
                    'r_h1', 'r_h2', 'r_h3', 'r_h4', 'r_h5',
                    'd_h1', 'd_h2', 'd_h3', 'd_h4', 'd_h5',
                    'r_p1', 'r_p2', 'r_p3', 'r_p4', 'r_p5',
                    'd_p1', 'd_p2', 'd_p3', 'd_p4', 'd_p5',
                    'rp_h1', 'rp_h2', 'rp_h3', 'rp_h4', 'rp_h5',
                    'dp_h1', 'dp_h2', 'dp_h3', 'dp_h4', 'dp_h5',
                    'y']]

# Save — create the target directory first; the original raised
# FileNotFoundError when data/ did not exist yet.
import os
os.makedirs("data", exist_ok=True)

final_df.to_parquet("data/dataset_from_db.parquet", index=False)
print(f"Сохранено {len(final_df)} записей в data/dataset_from_db.parquet")
print(f"Radiant wins: {final_df['y'].sum()}, Dire wins: {len(final_df) - final_df['y'].sum()}")
|
||||
139
educationML/build_dataset_with_players.py
Normal file
139
educationML/build_dataset_with_players.py
Normal file
@@ -0,0 +1,139 @@
|
||||
import psycopg2
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
print("Подключение к базе данных...")
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="korobka_db",
    user="postgres",
    password="postgres"
)

print("Загрузка матчей с известными игроками...")

# All matches that have at least one known (non-zero) player id.
query = """
    SELECT
        m.id as match_id,
        m.radiant_win,
        m.leagueid
    FROM matches m
    WHERE EXISTS (
        SELECT 1
        FROM details_match dm
        WHERE dm.match_id = m.id
        AND dm.players_id IS NOT NULL
        AND dm.players_id != 0
    )
    ORDER BY m.id
"""

# Pick details for exactly that set of matches, in pick order.
query_details = """
    SELECT
        dm.match_id,
        dm.hero_id,
        dm.team,
        dm.players_id,
        dm.pos,
        dm."order"
    FROM details_match dm
    WHERE dm.match_id IN (
        SELECT DISTINCT m.id
        FROM matches m
        WHERE EXISTS (
            SELECT 1
            FROM details_match dm2
            WHERE dm2.match_id = m.id
            AND dm2.players_id IS NOT NULL
            AND dm2.players_id != 0
        )
    )
    ORDER BY dm.match_id, dm."order"
"""

# try/finally guarantees the connection is closed even when a query fails
# (previously it leaked on any exception).
try:
    matches_df = pd.read_sql(query, conn)
    print(f"Найдено матчей: {len(matches_df)}")

    details_df = pd.read_sql(query_details, conn)
finally:
    conn.close()

print(f"Загружено {len(details_df)} записей деталей")
|
||||
|
||||
# Convert pick details into one wide-format row per match.
print("\nПреобразование в wide-format...")

# Index match info once so the loop does an O(1) lookup instead of
# scanning matches_df on every iteration (the original filter made the
# whole loop O(n_matches**2)). Assumes match_id is unique in matches_df
# (it is SELECTed from the matches primary key).
matches_by_id = matches_df.set_index('match_id')


def _fill_side(row, picks, hero_key, player_key, pos_key):
    """Write up to 5 hero/player/position slots for one side; pad with -1."""
    for i, (_, pick) in enumerate(picks.iterrows(), 1):
        if i > 5:
            break
        row[f'{hero_key}{i}'] = int(pick['hero_id'])
        row[f'{player_key}{i}'] = int(pick['players_id']) if pd.notna(pick['players_id']) and pick['players_id'] != 0 else -1
        row[f'{pos_key}{i}'] = int(pick['pos']) if pd.notna(pick['pos']) else -1
    # Pad the remaining slots (range is empty when the side has 5+ picks).
    for i in range(len(picks) + 1, 6):
        row[f'{hero_key}{i}'] = -1
        row[f'{player_key}{i}'] = -1
        row[f'{pos_key}{i}'] = -1


rows = []

for match_id, group in details_df.groupby('match_id'):
    match_info = matches_by_id.loc[match_id]

    row = {
        'match_id': match_id,
        'y': int(match_info['radiant_win']),
        'leagueid': int(match_info['leagueid'])
    }

    # Radiant (team=0) and Dire (team=1), each in pick order.
    _fill_side(row, group[group['team'] == 0].sort_values('order'), 'r_h', 'r_p', 'rp_h')
    _fill_side(row, group[group['team'] == 1].sort_values('order'), 'd_h', 'd_p', 'dp_h')

    rows.append(row)

df = pd.DataFrame(rows)
|
||||
|
||||
print(f"Создано {len(df)} записей в wide-format")
print(f"Radiant wins: {df['y'].sum()} ({df['y'].mean()*100:.1f}%)")
print(f"Dire wins: {len(df) - df['y'].sum()} ({(1-df['y'].mean())*100:.1f}%)")

# Player statistics: collect every known player id (values <= 0 mean "unknown").
player_cols = [f'r_p{i}' for i in range(1, 6)] + [f'd_p{i}' for i in range(1, 6)]
all_players = []
for col in player_cols:
    all_players.extend(df[col][df[col] > 0].tolist())

unique_players = len(set(all_players))
print(f"\nУникальных игроков в датасете: {unique_players}")
print(f"Всего записей игроков (не -1): {len(all_players)}")

# Tournament statistics
print(f"\nУникальных турниров (leagueid): {df['leagueid'].nunique()}")

# Save — create the target directory first; the original raised
# FileNotFoundError when data/ did not exist yet.
import os
os.makedirs("data", exist_ok=True)

output_path = "data/dataset_with_players.parquet"
df.to_parquet(output_path, index=False)
print(f"\n✓ Датасет сохранён: {output_path}")

# Preview of the first rows
print("\nПример данных (первые 3 матча):")
print(df.head(3).to_string())
|
||||
127
educationML/train_model_bag_of_heroes.py
Normal file
127
educationML/train_model_bag_of_heroes.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from catboost import CatBoostClassifier, Pool
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
print("Загрузка датасета...")

df = pd.read_parquet("data/dataset_from_db.parquet")

print(f"Всего записей: {len(df)}")
print(f"Radiant wins: {df['y'].sum()} ({df['y'].mean()*100:.1f}%)")
print(f"Dire wins: {len(df) - df['y'].sum()} ({(1-df['y'].mean())*100:.1f}%)")

# --- Bag-of-Heroes approach ---
# One binary feature per (hero, side): 1 if that side picked the hero.

hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]

# Collect every hero id present in the data (-1 marks an empty slot).
all_hero_ids = set()
for col in hero_cols_r + hero_cols_d:
    all_hero_ids.update(df[col].dropna().unique())

all_hero_ids = sorted([int(h) for h in all_hero_ids if h >= 0])
print(f"\nВсего уникальных героев: {len(all_hero_ids)}")

# Build every feature column first, then create the DataFrame once.
# Inserting 2 * n_heroes columns one-by-one (X[...] = ...) fragments the
# frame and is effectively quadratic; a single construction yields the
# same values and column order but is far faster.
feature_columns = {"is_first_pick_radiant": df["is_first_pick_radiant"].astype(int)}
for hero_id in all_hero_ids:
    feature_columns[f"radiant_hero_{hero_id}"] = (df[hero_cols_r] == hero_id).any(axis=1).astype(int)
    feature_columns[f"dire_hero_{hero_id}"] = (df[hero_cols_d] == hero_id).any(axis=1).astype(int)

X = pd.DataFrame(feature_columns)

print(f"Количество признаков: {len(X.columns)}")
print(f" - is_first_pick_radiant: 1")
print(f" - radiant_hero_*: {len(all_hero_ids)}")
print(f" - dire_hero_*: {len(all_hero_ids)}")

# Target variable
y = df["y"].astype(int).copy()
|
||||
|
||||
# Train/test split (stratified so both parts keep the class balance).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain: {len(X_train)} записей")
print(f"Test: {len(X_test)} записей")

# Every bag-of-heroes feature is a 0/1 integer, so no categorical features.
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

# Hyperparameters gathered in one place for readability.
params = dict(
    iterations=2500,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=2,
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=100,
    od_type="Iter",
    od_wait=200,
)
model = CatBoostClassifier(**params)

print("\nНачало обучения...")
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# --- Quality evaluation: CatBoost's own best scores, then AUC recomputed
# directly with sklearn as a sanity check. ---
cb_best = model.get_best_score()
train_auc_cb = cb_best.get("learn", {}).get("AUC", np.nan)
test_auc_cb = cb_best.get("validation", {}).get("AUC", np.nan)

proba_train = model.predict_proba(train_pool)[:, 1]
proba_test = model.predict_proba(test_pool)[:, 1]
train_auc = roc_auc_score(y_train, proba_train)
test_auc = roc_auc_score(y_test, proba_test)

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# --- Persist artifacts ---
os.makedirs("artifacts", exist_ok=True)
model_path = "artifacts/model_bag_of_heroes.cbm"
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")

# Feature order (needed to rebuild the exact input layout at inference time).
feature_cols = list(X.columns)
pd.DataFrame(feature_cols, columns=["feature"]).to_csv(
    "artifacts/feature_order_bag_of_heroes.csv", index=False
)
print("Порядок фичей сохранен в artifacts/feature_order_bag_of_heroes.csv")

# Feature importance: top 30 to stdout, full table to CSV.
importance_df = (
    pd.DataFrame({"feature": X_train.columns,
                  "importance": model.get_feature_importance(train_pool)})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

print("\nВажность признаков (top 30):")
print(importance_df.head(30).to_string(index=False))

importance_df.to_csv("artifacts/feature_importance_bag_of_heroes.csv", index=False)
|
||||
131
educationML/train_model_pro.py
Normal file
131
educationML/train_model_pro.py
Normal file
@@ -0,0 +1,131 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from catboost import CatBoostClassifier, Pool
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
print("Загрузка датасета...")

df = pd.read_parquet("data/dataset_from_db.parquet")

print(f"Всего записей (матчей): {len(df)}")
print(f"Radiant wins: {df['y'].sum()} ({df['y'].mean()*100:.1f}%)")
print(f"Dire wins: {len(df) - df['y'].sum()} ({(1-df['y'].mean())*100:.1f}%)")

# --- Match-level features ---
print("\nСоздание признаков...")

# Slot columns per side: (hero columns, position columns).
slot_columns = {
    "radiant": ([f"r_h{i}" for i in range(1, 6)], [f"rp_h{i}" for i in range(1, 6)]),
    "dire": ([f"d_h{i}" for i in range(1, 6)], [f"dp_h{i}" for i in range(1, 6)]),
}

# One indicator per (side, hero, position) triple:
# {side}_h{hero_id}_p{position} = 1 when that hero played that position.
rows = []

for _, match_row in df.iterrows():
    features = {}

    for side, (hero_cols, pos_cols) in slot_columns.items():
        for hero_col, pos_col in zip(hero_cols, pos_cols):
            hero_id = int(match_row[hero_col])
            position = int(match_row[pos_col])

            # -1 marks an empty slot / unknown position; skip those.
            if hero_id >= 0 and position >= 0:
                features[f"{side}_h{hero_id}_p{position}"] = 1

    features['y'] = int(match_row['y'])
    rows.append(features)

# Absent indicators become 0 after the fillna.
df_features = pd.DataFrame(rows).fillna(0)

print(f"Создано признаков: {len(df_features.columns) - 1}")
|
||||
|
||||
# Target and design matrix.
y = df_features['y'].astype(int)
X = df_features.drop('y', axis=1)

# Stratified 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain: {len(X_train)} матчей")
print(f"Test: {len(X_test)} матчей")

# All features are 0/1 indicators, so plain pools without cat_features.
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=5,
    l2_leaf_reg=3,
    min_data_in_leaf=10,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=50,
    od_type="Iter",
    od_wait=100,
    use_best_model=True
)

print("\nНачало обучения...")
model.fit(train_pool, eval_set=test_pool)

# Evaluation: CatBoost's best scores plus AUC recomputed with sklearn.
cb_best = model.get_best_score()
train_auc_cb = cb_best.get("learn", {}).get("AUC", np.nan)
test_auc_cb = cb_best.get("validation", {}).get("AUC", np.nan)

train_auc = roc_auc_score(y_train, model.predict_proba(train_pool)[:, 1])
test_auc = roc_auc_score(y_test, model.predict_proba(test_pool)[:, 1])

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# Persist the trained model.
os.makedirs("artifacts", exist_ok=True)
model_path = "artifacts/model_from_db_pro_v3.cbm"
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")

# Feature importance: top 30 to stdout, full table to CSV.
importance_df = (
    pd.DataFrame({"feature": X_train.columns,
                  "importance": model.get_feature_importance(train_pool)})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

print("\nВажность признаков (top 30):")
print(importance_df.head(30).to_string(index=False))

importance_df.to_csv("artifacts/feature_importance_db.csv", index=False)

# Full sorted feature list so inference can rebuild the input layout.
all_features = sorted(X.columns.tolist())
pd.DataFrame(all_features, columns=["feature"]).to_csv(
    "artifacts/feature_order_db.csv", index=False
)
print(f"Порядок фичей сохранен в artifacts/feature_order_db.csv ({len(all_features)} признаков)")
|
||||
161
educationML/train_model_pro_long_old.py
Normal file
161
educationML/train_model_pro_long_old.py
Normal file
@@ -0,0 +1,161 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from catboost import CatBoostClassifier, Pool
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
print("Загрузка датасета...")

df = pd.read_parquet("data/dataset_from_db.parquet")

print(f"Всего записей (матчей): {len(df)}")
print(f"Radiant wins: {df['y'].sum()} ({df['y'].mean()*100:.1f}%)")
print(f"Dire wins: {len(df) - df['y'].sum()} ({(1-df['y'].mean())*100:.1f}%)")

# --- Conversion to long format: one row per pick ---
print("\nПреобразование в long-format...")

hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]
pos_cols_r = [f"rp_h{i}" for i in range(1, 6)]
pos_cols_d = [f"dp_h{i}" for i in range(1, 6)]


def _side_pick_rows(row, team, hero_cols, pos_cols, match_id,
                    is_first_pick_radiant, radiant_win):
    """Return one long-format dict per valid pick of one side (team 0/1).

    Deduplicates the previously copy-pasted Radiant/Dire loops; output
    order and content are unchanged.
    """
    picks = []
    for hero_col, pos_col in zip(hero_cols, pos_cols):
        hero_id = int(row[hero_col])
        position = int(row[pos_col])
        if hero_id >= 0:  # -1 marks an empty slot
            picks.append({
                "match_id": match_id,
                "is_first_pick_radiant": is_first_pick_radiant,
                "team": team,
                "hero_id": hero_id,
                "position": position,
                "radiant_win": radiant_win
            })
    return picks


rows = []

for idx, row in df.iterrows():
    # NOTE(review): the DataFrame index is used as the match identifier even
    # though the dataset has a real match_id column — verify this is intended
    # (it only needs to be unique per match for the grouped split below).
    match_id = idx
    is_first_pick_radiant = int(row.get("is_first_pick_radiant", 0))
    radiant_win = int(row["y"])

    # Radiant (team=0) first, then Dire (team=1) — same order as before.
    rows.extend(_side_pick_rows(row, 0, hero_cols_r, pos_cols_r,
                                match_id, is_first_pick_radiant, radiant_win))
    rows.extend(_side_pick_rows(row, 1, hero_cols_d, pos_cols_d,
                                match_id, is_first_pick_radiant, radiant_win))

df_long = pd.DataFrame(rows)

print(f"\nLong-format датасет создан:")
print(f"Всего записей (пиков): {len(df_long)}")
print(f"Уникальных матчей: {df_long['match_id'].nunique()}")
print(f"Средних пиков на матч: {len(df_long) / df_long['match_id'].nunique():.1f}")
|
||||
|
||||
# Target: did Radiant win the match this pick belongs to.
y = df_long["radiant_win"].astype(int)

# Per-pick features.
feature_cols = ["team", "hero_id", "position"]
X = df_long[feature_cols].copy()

# Normalise dtypes to int.
for col in feature_cols:
    X[col] = X[col].astype(int)

# Group-aware split: every pick of one match must land in the same fold,
# so we split the set of match ids rather than the pick rows.
unique_matches = df_long["match_id"].unique()
train_matches, test_matches = train_test_split(
    unique_matches, test_size=0.1, random_state=42
)

train_mask = df_long["match_id"].isin(train_matches)
test_mask = df_long["match_id"].isin(test_matches)

X_train = X[train_mask].reset_index(drop=True)
y_train = y[train_mask].reset_index(drop=True)
X_test = X[test_mask].reset_index(drop=True)
y_test = y[test_mask].reset_index(drop=True)

print(f"\nTrain: {len(X_train)} пиков ({len(train_matches)} матчей)")
print(f"Test: {len(X_test)} пиков ({len(test_matches)} матчей)")

# team / hero_id / position are IDs, i.e. categorical for CatBoost.
cat_features = ["team", "hero_id", "position"]
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# Aggressive regularisation: the dataset is small.
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=4,
    l2_leaf_reg=5,
    min_data_in_leaf=20,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=50,
    od_type="Iter",
    od_wait=50,
    use_best_model=True
)

print("\nНачало обучения...")
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# --- Evaluation: CatBoost's best scores plus AUC recomputed with sklearn ---
cb_best = model.get_best_score()
train_auc_cb = cb_best.get("learn", {}).get("AUC", np.nan)
test_auc_cb = cb_best.get("validation", {}).get("AUC", np.nan)

train_auc = roc_auc_score(y_train, model.predict_proba(train_pool)[:, 1])
test_auc = roc_auc_score(y_test, model.predict_proba(test_pool)[:, 1])

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# --- Persist artifacts ---
os.makedirs("artifacts", exist_ok=True)
model_path = "artifacts/model_from_db_pro_v3.cbm"
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")

# Feature order for inference.
pd.DataFrame(feature_cols, columns=["feature"]).to_csv(
    "artifacts/feature_order_db.csv", index=False
)
print("Порядок фичей сохранен в artifacts/feature_order_db.csv")

# Feature importances (full table).
importance_df = (
    pd.DataFrame({"feature": X_train.columns,
                  "importance": model.get_feature_importance(train_pool)})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

print("\nВажность признаков:")
print(importance_df.to_string(index=False))

importance_df.to_csv("artifacts/feature_importance_db.csv", index=False)
|
||||
116
educationML/train_model_pro_old.py
Normal file
116
educationML/train_model_pro_old.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from catboost import CatBoostClassifier, Pool
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
print("Загрузка датасета...")

df = pd.read_parquet("data/dataset_from_db.parquet")

print(f"Всего записей: {len(df)}")
print(f"Radiant wins: {df['y'].sum()} ({df['y'].mean()*100:.1f}%)")
print(f"Dire wins: {len(df) - df['y'].sum()} ({(1-df['y'].mean())*100:.1f}%)")

# --- Features for the new dataset layout ---
hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]  # was range(1, 5+1): same bounds, now consistent
# player_cols_r = [f"r_p{i}" for i in range(1, 6)]
# player_cols_d = [f"d_p{i}" for i in range(1, 6)]
pos_cols_r = [f"rp_h{i}" for i in range(1, 6)]
pos_cols_d = [f"dp_h{i}" for i in range(1, 6)]

feature_cols = (
    ["is_first_pick_radiant"]
    + hero_cols_r + hero_cols_d
    # + player_cols_r + player_cols_d  # players excluded: too little data
    + pos_cols_r + pos_cols_d
)

# Target column name.
target_col = "y"

# Split features / target.
X = df[feature_cols].copy()
y = df[target_col].astype(int).copy()

# Make sure the binary flag is an int.
X["is_first_pick_radiant"] = X["is_first_pick_radiant"].astype(int)

# Stratified 90/10 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    stratify=y
)
|
||||
|
||||
print(f"\nTrain: {len(X_train)} записей")
print(f"Test: {len(X_test)} записей")

# Hero and position IDs are categories; CatBoost accepts column names here.
cat_features = hero_cols_r + hero_cols_d + pos_cols_r + pos_cols_d
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# Hyperparameters gathered in one dict (bagging_temperature replaces
# subsample for the Bayesian bootstrap).
params = dict(
    iterations=2500,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=2,
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=100,
    od_type="Iter",
    od_wait=200,
)
model = CatBoostClassifier(**params)

print("\nНачало обучения...")
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# --- Evaluation: CatBoost's own best scores... ---
cb_best = model.get_best_score()
train_auc_cb = cb_best.get("learn", {}).get("AUC", np.nan)
test_auc_cb = cb_best.get("validation", {}).get("AUC", np.nan)

# ...and AUC recomputed directly with sklearn as a sanity check.
train_auc = roc_auc_score(y_train, model.predict_proba(train_pool)[:, 1])
test_auc = roc_auc_score(y_test, model.predict_proba(test_pool)[:, 1])

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# --- Persist artifacts ---
os.makedirs("artifacts", exist_ok=True)
model_path = "artifacts/model_from_db_pro_v3.cbm"
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")

# Feature order for inference.
pd.DataFrame(feature_cols, columns=["feature"]).to_csv(
    "artifacts/feature_order_db.csv", index=False
)
print("Порядок фичей сохранен в artifacts/feature_order_db.csv")

# Feature importances.
importance_df = (
    pd.DataFrame({"feature": X_train.columns,
                  "importance": model.get_feature_importance(train_pool)})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

print("\nВажность признаков (top 25):")
print(importance_df.head(25).to_string(index=False))

# Full importances table on disk.
importance_df.to_csv("artifacts/feature_importance_db.csv", index=False)
|
||||
176
educationML/train_model_stacking.py
Normal file
176
educationML/train_model_stacking.py
Normal file
@@ -0,0 +1,176 @@
|
||||
import os
|
||||
import sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from catboost import CatBoostClassifier, Pool
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import pickle
|
||||
|
||||
# Добавляем корневую директорию проекта в путь
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
|
||||
print("Загрузка датасета...")
df = pd.read_parquet("data/dataset_from_db.parquet")

print(f"Всего записей: {len(df)}")
print(f"Radiant wins: {df['y'].sum()} ({df['y'].mean()*100:.1f}%)")
print(f"Dire wins: {len(df) - df['y'].sum()} ({(1-df['y'].mean())*100:.1f}%)")

# Target variable
y = df["y"].astype(int).copy()

# Train/test split — only the test indices/labels are kept for the meta-model.
_, X_test_indices, _, y_test = train_test_split(
    df.index, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\n" + "="*60)
print("Загрузка базовых моделей...")
print("="*60)

# === Model 1: Heroes + Positions ===
from routes.predict import build_long_format_input, modelPro

# === Model 2: Bag of Heroes ===
from routes.predict_bag_of_heroes import build_bag_of_heroes_features, modelBagOfHeroes

# === Model 3: With Players ===
from routes.predict_with_players import build_player_features, modelWithPlayers

print("\n" + "="*60)
print("Генерация предсказаний базовых моделей...")
print("="*60)

# Column groups shared by all three payload layouts.
hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]
player_cols_r = [f"r_p{i}" for i in range(1, 6)]
player_cols_d = [f"d_p{i}" for i in range(1, 6)]
pos_cols_r = [f"rp_h{i}" for i in range(1, 6)]
pos_cols_d = [f"dp_h{i}" for i in range(1, 6)]

predictions_list = []

# NOTE(review): base-model predictions are generated over the FULL dataset,
# including rows those models were trained on, so the meta-model sees
# optimistic inputs on train rows — consider out-of-fold predictions.
for n_done, idx in enumerate(df.index, 1):
    row_data = df.loc[idx]

    # Build the payload for the current record.
    payload = {
        "is_first_pick_radiant": int(row_data.get("is_first_pick_radiant", 0)),
    }

    # Heroes
    for col in hero_cols_r + hero_cols_d:
        payload[col] = int(row_data.get(col, -1))

    # Players
    for col in player_cols_r + player_cols_d:
        payload[col] = int(row_data.get(col, -1))

    # Positions
    for col in pos_cols_r + pos_cols_d:
        payload[col] = int(row_data.get(col, -1))

    # === Prediction of model 1: Heroes + Positions ===
    X_with_pos = build_long_format_input(payload)
    pred1 = float(modelPro.predict_proba(X_with_pos)[0, 1])

    # === Prediction of model 2: Bag of Heroes ===
    X_bag = build_bag_of_heroes_features(payload)
    pred2 = float(modelBagOfHeroes.predict_proba(X_bag)[0, 1])

    # === Prediction of model 3: With Players ===
    X_players = build_player_features(payload)
    pred3 = float(modelWithPlayers.predict_proba(X_players)[0, 1])

    predictions_list.append({
        "pred_with_positions": pred1,
        "pred_bag_of_heroes": pred2,
        "pred_with_players": pred3
    })

    # Progress report: an explicit counter instead of `idx + 1`, which was
    # only correct while df had a default RangeIndex.
    if n_done % 100 == 0:
        print(f"Обработано {n_done}/{len(df)} записей...")
|
||||
|
||||
# Создаём DataFrame с предсказаниями
|
||||
X_meta = pd.DataFrame(predictions_list)
|
||||
|
||||
print(f"\nСоздано {len(X_meta)} мета-признаков")
|
||||
print(f"Колонки: {list(X_meta.columns)}")
|
# Train/test split of the meta-features using the same held-out indices.
#
# BUGFIX: X_meta was built row-by-row and therefore carries a fresh
# RangeIndex, while X_test_indices holds labels taken from df.index.
# The two only line up when df itself has a default RangeIndex; re-align
# explicitly so the masks are correct for any index (a no-op otherwise).
X_meta.index = df.index

test_mask = X_meta.index.isin(X_test_indices)
X_meta_train = X_meta.loc[~test_mask]
X_meta_test = X_meta.loc[test_mask]
# y shares df.index, so the same boolean mask applies directly.
y_meta_train = y.loc[~test_mask]
y_meta_test = y.loc[test_mask]

print(f"\nMeta Train: {len(X_meta_train)} записей")
print(f"Meta Test: {len(X_meta_test)} записей")
# --- Train the stacking meta-model ---
print("\n" + "="*60)
print("Обучение мета-модели (Логистическая регрессия)...")
print("="*60)

# A linear meta-learner is deliberately used instead of CatBoost: with
# only three input features it is far less prone to overfitting.
meta_model = LogisticRegression(
    C=1.0,  # inverse regularisation strength
    max_iter=1000,
    random_state=42,
)
meta_model.fit(X_meta_train, y_meta_train)

# Quality on both splits, measured by ROC AUC.
proba_train = meta_model.predict_proba(X_meta_train)[:, 1]
proba_test = meta_model.predict_proba(X_meta_test)[:, 1]
train_auc = roc_auc_score(y_meta_train, proba_train)
test_auc = roc_auc_score(y_meta_test, proba_test)

print(f"\nLogistic Regression AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# --- Persist the fitted meta-model for inference ---
os.makedirs("artifacts", exist_ok=True)
model_path = "artifacts/model_stacking.pkl"
with open(model_path, "wb") as fh:
    pickle.dump(meta_model, fh)
print(f"\nМета-модель сохранена: {model_path}")
# --- Interpret the meta-model via its logistic-regression coefficients ---
coefficients = meta_model.coef_[0]
intercept = meta_model.intercept_[0]

coef_table = pd.DataFrame(
    {"feature": X_meta_train.columns, "coefficient": coefficients}
)
coef_table = coef_table.sort_values("coefficient", ascending=False)
coef_table = coef_table.reset_index(drop=True)

print("\nКоэффициенты логистической регрессии:")
print(f"Intercept: {intercept:.4f}")
print(coef_table.to_string(index=False))

# Keep the legacy "importance" CSV format for compatibility:
# absolute coefficient magnitudes.
pd.DataFrame(
    {"feature": X_meta_train.columns, "importance": np.abs(coefficients)}
).to_csv("artifacts/feature_importance_stacking.csv", index=False)

# --- Compare every model on the held-out test split ---
print("\n" + "="*60)
print("Сравнение моделей на тестовой выборке:")
print("="*60)

model_scores = [
    ("Модель 1 (Heroes + Positions)",
     roc_auc_score(y_meta_test, X_meta_test["pred_with_positions"])),
    ("Модель 2 (Bag of Heroes)",
     roc_auc_score(y_meta_test, X_meta_test["pred_bag_of_heroes"])),
    ("Модель 3 (With Players)",
     roc_auc_score(y_meta_test, X_meta_test["pred_with_players"])),
    ("Мета-модель (Stacking)", test_auc),
]
for label, score in model_scores:
    print(f"{label}: AUC = {score:.4f}")
educationML/train_model_with_players.py — new file, 156 lines
@@ -0,0 +1,156 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from catboost import CatBoostClassifier, Pool
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import roc_auc_score
|
||||
print("Загрузка датасета...")

df = pd.read_parquet("data/dataset_with_players.parquet")

# Basic class-balance report.
n_matches = len(df)
radiant_wins = df["y"].sum()
radiant_rate = df["y"].mean()
print(f"Всего записей (матчей): {n_matches}")
print(f"Radiant wins: {radiant_wins} ({radiant_rate*100:.1f}%)")
print(f"Dire wins: {n_matches - radiant_wins} ({(1-radiant_rate)*100:.1f}%)")

# --- Match-level feature construction: column names for the five slots ---
print("\nСоздание признаков...")


def _slot_cols(prefix):
    """Column names for the five team slots with the given prefix."""
    return [f"{prefix}{i}" for i in range(1, 6)]


hero_cols_r = _slot_cols("r_h")
hero_cols_d = _slot_cols("d_h")
player_cols_r = _slot_cols("r_p")
player_cols_d = _slot_cols("d_p")
pos_cols_r = _slot_cols("rp_h")
pos_cols_d = _slot_cols("dp_h")
# Per-match indicator features combining player, hero and position.
# Feature-name format: {side}_p{player_id}_h{hero_id}_pos{position} plus
# the coarser {side}_p{player_id}_h{hero_id} and {side}_p{player_id}_pos{position}.


def _side_features(row, side, hero_cols, player_cols, pos_cols):
    """Return indicator features ({name: 1}) for one team's five slots.

    For every slot with a known player (id > 0) up to three indicators
    are emitted: player+hero+position, player+hero, and player+position.
    Unknown heroes/positions are encoded as negative values and skipped.
    `side` is the feature-name prefix, "radiant" or "dire".
    """
    feats = {}
    for hero_col, player_col, pos_col in zip(hero_cols, player_cols, pos_cols):
        hero_id = int(row[hero_col])
        player_id = int(row[player_col])
        position = int(row[pos_col])

        # player + hero + position
        if player_id > 0 and hero_id >= 0 and position >= 0:
            feats[f"{side}_p{player_id}_h{hero_id}_pos{position}"] = 1
        # player + hero (also emitted when the position IS known)
        if player_id > 0 and hero_id >= 0:
            feats[f"{side}_p{player_id}_h{hero_id}"] = 1
        # player + position
        if player_id > 0 and position >= 0:
            feats[f"{side}_p{player_id}_pos{position}"] = 1
    return feats


rows = []

# BUGFIX: progress was previously counted via the DataFrame index label
# (`idx + 1`), which is only a running count for a default RangeIndex;
# an explicit counter is correct for any index.  The duplicated
# radiant/dire loops are also folded into the single helper above.
for count, (idx, row) in enumerate(df.iterrows(), start=1):
    features = {}
    # Radiant first, then Dire — preserves the original column order.
    features.update(_side_features(row, "radiant", hero_cols_r, player_cols_r, pos_cols_r))
    features.update(_side_features(row, "dire", hero_cols_d, player_cols_d, pos_cols_d))
    features['y'] = int(row['y'])
    rows.append(features)

    if count % 100 == 0:
        print(f"Обработано {count}/{len(df)} матчей...")

# Missing indicators become 0 in the dense matrix.
df_features = pd.DataFrame(rows).fillna(0)
# 'y' is one of the columns, hence the -1 in the feature count.
print(f"\nСоздано признаков: {len(df_features.columns) - 1}")

# Target and design matrix.
y = df_features["y"].astype(int)
X = df_features.drop(columns="y")

# Stratified 80/20 split keeps the win balance in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTrain: {len(X_train)} матчей")
print(f"Test: {len(X_test)} матчей")
# --- Training ---
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

# Hyperparameters gathered in one place; values are unchanged.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=5,
    l2_leaf_reg=3,
    min_data_in_leaf=5,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=50,
    od_type="Iter",  # overfitting detector: stop after od_wait bad iterations
    od_wait=100,
    use_best_model=True,
)
model = CatBoostClassifier(**catboost_params)

print("\nНачало обучения...")
model.fit(train_pool, eval_set=test_pool)
# --- Evaluation ---
best_scores = model.get_best_score()
learn_auc = best_scores.get("learn", {}).get("AUC", np.nan)
valid_auc = best_scores.get("validation", {}).get("AUC", np.nan)

# Recompute AUC directly from predicted probabilities as a sanity check.
proba_train = model.predict_proba(train_pool)[:, 1]
proba_test = model.predict_proba(test_pool)[:, 1]
train_auc = roc_auc_score(y_train, proba_train)
test_auc = roc_auc_score(y_test, proba_test)

print(f"\nCatBoost best AUC (learn/valid): {learn_auc:.4f} / {valid_auc:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# --- Persist the model ---
os.makedirs("artifacts", exist_ok=True)
model_path = "artifacts/model_with_players.cbm"
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")

# --- Feature importance (top 30 shown, full table saved) ---
importance_df = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance": model.get_feature_importance(train_pool),
    }
)
importance_df = importance_df.sort_values("importance", ascending=False)
importance_df = importance_df.reset_index(drop=True)

print("\nВажность признаков (top 30):")
print(importance_df.head(30).to_string(index=False))

importance_df.to_csv("artifacts/feature_importance_with_players.csv", index=False)

# Persist the full sorted feature list so inference can rebuild the
# exact column order used at training time.
all_features = sorted(X.columns.tolist())
pd.DataFrame(all_features, columns=["feature"]).to_csv(
    "artifacts/feature_order_with_players.csv", index=False
)
print(f"\nПорядок фичей сохранен в artifacts/feature_order_with_players.csv ({len(all_features)} признаков)")
||||
Reference in New Issue
Block a user