Files
predictV1/educationML/build_dataset_pro.py
mamonov.ep 8a134239d7 Initial commit: добавление проекта predictV1
Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-21 17:22:58 +03:00

95 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import pandas as pd
import psycopg2
from psycopg2.extras import RealDictCursor
def get_db_connection():
return psycopg2.connect(
host="localhost",
port=5432,
database="korobka_db",
user="postgres",
password="postgres"
)
print("Загрузка данных из БД...")
conn = get_db_connection()
# Загружаем матчи
matches_df = pd.read_sql_query("""
SELECT id as match_id, radiant_team_id, dire_team_id, radiant_win
FROM matches
WHERE "source" = 'pro'
ORDER BY id
""", conn)
print(f"Загружено {len(matches_df)} матчей")
# Загружаем детали героев
details_df = pd.read_sql_query("""
SELECT match_id, hero_id, team, players_id, pos
FROM details_match
WHERE "source" = 'pro'
ORDER BY match_id
""", conn)
print(f"Загружено {len(details_df)} записей героев")
conn.close()
# Формируем слоты героев для каждого матча
def slots_from_picks(group):
# --- Radiant (team = 0) ---
r_heroes = group[group['team'] == 0]['hero_id'].tolist()[:5]
r_players = group[group['team'] == 0]['players_id'].tolist()[:5]
r_pos = group[group['team'] == 0]['pos'].tolist()[:5]
# --- Dire (team = 1) ---
d_heroes = group[group['team'] == 1]['hero_id'].tolist()[:5]
d_players = group[group['team'] == 1]['players_id'].tolist()[:5]
d_pos = group[group['team'] == 1]['pos'].tolist()[:5]
row = {}
# --- Добавляем 5 слотов для каждой стороны ---
for i in range(5):
# Герои Radiant / Dire
row[f"r_h{i+1}"] = r_heroes[i] if i < len(r_heroes) else -1
row[f"d_h{i+1}"] = d_heroes[i] if i < len(d_heroes) else -1
# Позиции героев
row[f"rp_h{i+1}"] = r_pos[i] if i < len(r_pos) else -1
row[f"dp_h{i+1}"] = d_pos[i] if i < len(d_pos) else -1
# Игроки Radiant / Dire
row[f"r_p{i+1}"] = r_players[i] if i < len(r_players) else -1
row[f"d_p{i+1}"] = d_players[i] if i < len(d_players) else -1
# Определяем, кто пикал первым (команда 0 = radiant)
fp_team = group.iloc[0]['team'] if len(group) > 0 else 0
row["is_first_pick_radiant"] = 1 if fp_team == 0 else 0
return pd.Series(row)
slots_df = details_df.groupby("match_id").apply(slots_from_picks).reset_index()
# Объединяем с информацией о матчах
dataset = matches_df.merge(slots_df, on="match_id", how="inner")
# Добавляем целевую переменную
dataset['y'] = dataset['radiant_win'].astype(int)
# Выбираем нужные колонки в правильном порядке
final_df = dataset[['match_id', 'is_first_pick_radiant',
'r_h1', 'r_h2', 'r_h3', 'r_h4', 'r_h5',
'd_h1', 'd_h2', 'd_h3', 'd_h4', 'd_h5',
'r_p1', 'r_p2', 'r_p3', 'r_p4', 'r_p5',
'd_p1', 'd_p2', 'd_p3', 'd_p4', 'd_p5',
'rp_h1', 'rp_h2', 'rp_h3', 'rp_h4', 'rp_h5',
'dp_h1', 'dp_h2', 'dp_h3', 'dp_h4', 'dp_h5',
'y']]
# Сохраняем
final_df.to_parquet("data/dataset_from_db.parquet", index=False)
print(f"Сохранено {len(final_df)} записей в data/dataset_from_db.parquet")
print(f"Radiant wins: {final_df['y'].sum()}, Dire wins: {len(final_df) - final_df['y'].sum()}")