Files
predictV1/educationML/build_dataset_with_players.py
mamonov.ep 8a134239d7 Initial commit: добавление проекта predictV1
Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-21 17:22:58 +03:00

140 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
# Open a connection to the local PostgreSQL instance that holds the match data.
print("Подключение к базе данных...")
DB_PARAMS = {
    "host": "localhost",
    "port": 5432,
    "database": "korobka_db",
    "user": "postgres",
    "password": "postgres",
}
conn = psycopg2.connect(**DB_PARAMS)
print("Загрузка матчей с известными игроками...")

# Matches that have at least one detail row with a known (non-NULL, non-zero)
# player id; radiant_win becomes the prediction target.
query = """
SELECT
m.id as match_id,
m.radiant_win,
m.leagueid
FROM matches m
WHERE EXISTS (
SELECT 1
FROM details_match dm
WHERE dm.match_id = m.id
AND dm.players_id IS NOT NULL
AND dm.players_id != 0
)
ORDER BY m.id
"""

# Per-pick details (hero, team side, player, lane position, pick order) for the
# same set of matches; "order" is double-quoted because it is a SQL reserved word.
query_details = """
SELECT
dm.match_id,
dm.hero_id,
dm.team,
dm.players_id,
dm.pos,
dm."order"
FROM details_match dm
WHERE dm.match_id IN (
SELECT DISTINCT m.id
FROM matches m
WHERE EXISTS (
SELECT 1
FROM details_match dm2
WHERE dm2.match_id = m.id
AND dm2.players_id IS NOT NULL
AND dm2.players_id != 0
)
)
ORDER BY dm.match_id, dm."order"
"""

# try/finally guarantees the connection is closed even when a query fails;
# the original leaked the connection on any read_sql error.
try:
    matches_df = pd.read_sql(query, conn)
    print(f"Найдено матчей: {len(matches_df)}")
    details_df = pd.read_sql(query_details, conn)
finally:
    conn.close()
print(f"Загружено {len(details_df)} записей деталей")
# --- Convert long-format pick rows into one wide row per match. ---
# Each row gets r_h1..r_h5 / r_p1..r_p5 / rp_h1..rp_h5 (Radiant hero, player,
# position) and the d_* / dp_* equivalents for Dire; -1 marks a missing value.
print("\nПреобразование в wide-format...")

def _fill_team_slots(row, picks, hero_key, player_key, pos_key):
    """Write up to 5 picks (by draft order) into row[f'{key}1'..f'{key}5'];
    pad slots beyond the available picks with -1."""
    top = picks.sort_values('order').head(5)
    for i, (_, pick) in enumerate(top.iterrows(), 1):
        row[f'{hero_key}{i}'] = int(pick['hero_id'])
        # Unknown players (NULL or 0 in the DB) are encoded as -1.
        row[f'{player_key}{i}'] = (
            int(pick['players_id'])
            if pd.notna(pick['players_id']) and pick['players_id'] != 0
            else -1
        )
        row[f'{pos_key}{i}'] = int(pick['pos']) if pd.notna(pick['pos']) else -1
    for i in range(len(top) + 1, 6):
        row[f'{hero_key}{i}'] = -1
        row[f'{player_key}{i}'] = -1
        row[f'{pos_key}{i}'] = -1

# Index match metadata once; the original scanned matches_df per match,
# making the whole loop O(n^2). match_id is the primary key, so .loc is a
# single-row lookup here.
match_meta = matches_df.set_index('match_id')

rows = []
for match_id, group in details_df.groupby('match_id'):
    match_info = match_meta.loc[match_id]
    row = {
        'match_id': match_id,
        'y': int(match_info['radiant_win']),
        'leagueid': int(match_info['leagueid']),
    }
    # team == 0 -> Radiant, team == 1 -> Dire.
    _fill_team_slots(row, group[group['team'] == 0], 'r_h', 'r_p', 'rp_h')
    _fill_team_slots(row, group[group['team'] == 1], 'd_h', 'd_p', 'dp_h')
    rows.append(row)
# --- Assemble the final DataFrame, report dataset statistics, and persist. ---
df = pd.DataFrame(rows)
print(f"Создано {len(df)} записей в wide-format")
print(f"Radiant wins: {df['y'].sum()} ({df['y'].mean()*100:.1f}%)")
print(f"Dire wins: {len(df) - df['y'].sum()} ({(1-df['y'].mean())*100:.1f}%)")

# Player statistics: only ids > 0 count (unknown players were encoded as -1).
player_cols = [f'r_p{i}' for i in range(1, 6)] + [f'd_p{i}' for i in range(1, 6)]
all_players = []
for col in player_cols:
    # .loc avoids chained indexing (boolean mask then column slice).
    all_players.extend(df.loc[df[col] > 0, col].tolist())
unique_players = len(set(all_players))
print(f"\nУникальных игроков в датасете: {unique_players}")
print(f"Всего записей игроков (не -1): {len(all_players)}")

# Tournament statistics.
print(f"\nУникальных турниров (leagueid): {df['leagueid'].nunique()}")

# Ensure the target directory exists before writing; a fresh checkout without
# data/ made the original to_parquet call fail.
output_path = "data/dataset_with_players.parquet"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path, index=False)
print(f"\n✓ Датасет сохранён: {output_path}")

# Show a sample of the result for a quick sanity check.
print("\nПример данных (первые 3 матча):")
print(df.head(3).to_string())