#!/usr/bin/env bash
set -euo pipefail

# ===========================
# Config (override via env)
# ===========================
PY=python3
VENV=".venv"

# REST service port:
PORT="${PORT:-8000}"

# Force IPv4 (useful when the provider's IPv6 is flaky):
FORCE_IPV4="${FORCE_IPV4:-1}"

# API keys (optional, but improve stability/quotas):
# export OPENDOTA_API_KEY=...
# export STRATZ_TOKEN=...
OPENDOTA_API_KEY="${OPENDOTA_API_KEY:-}"
STRATZ_TOKEN="${STRATZ_TOKEN:-}"

# Use STRATZ instead of OpenDota
# for step 1 only (match list):    USE_STRATZ_LIST=1
# for step 2 only (details/draft): USE_STRATZ_DETAILS=1
USE_STRATZ_LIST="${USE_STRATZ_LIST:-0}"
USE_STRATZ_DETAILS="${USE_STRATZ_DETAILS:-0}"

# Number of OpenDota pro-match pages to fetch in step 1.
# BUGFIX: PAGES was exported below but never defined, so the later
# `echo "... (pages=$PAGES)"` aborted under `set -u`. Default of 5 is a
# reviewer's guess — TODO confirm the intended page count.
PAGES="${PAGES:-5}"

# Pause after every 100 requests (mitigates HTTP 429 rate limiting):
SLEEP_PER_100="${SLEEP_PER_100:-1.0}"

# ===========================
# Environment & dependencies
# ===========================
if [ ! -d "$VENV" ]; then
  "$PY" -m venv "$VENV"
fi
# shellcheck disable=SC1091
source "$VENV/bin/activate"

pip install -U pip
pip install pandas pyarrow requests httpx "urllib3>=2.2" certifi catboost scikit-learn fastapi uvicorn

mkdir -p data artifacts

# ===========================
# Helpers
# ===========================
# Exported so the embedded Python steps below can read them via os.getenv().
export FORCE_IPV4
export OPENDOTA_API_KEY
export STRATZ_TOKEN
export PAGES
export SLEEP_PER_100
# ===========================
# [1/7] Pub match list
# ===========================
# [1b] Public matches (high-rank)
$PY educationML/fetch_public_matches.py
# [2b] Public match details (heroes from players)
$PY educationML/fetch_public_details.py

# ===========================
# [2/7] Pro match list
# ===========================
# BUGFIX: PAGES may be unset here (it is exported elsewhere in this script
# but never assigned); a bare $PAGES would abort the run under `set -u`.
echo "[1/6] Fetch pro matches via OpenDota (pages=${PAGES:-})"
$PY educationML/fetch_pro_matches_opendota.py
# =========================================
# [2/6] Match details + draft (robust)
# =========================================
# Fetches per-match details (outcome, patch, draft picks/bans) for every
# match_id listed in data/pro_matches.parquet and writes:
#   data/matches.parquet, data/draft.parquet, data/matches_failed.csv
if [ "$USE_STRATZ_DETAILS" = "1" ]; then
  echo "[2/6] Fetch match details + draft via STRATZ"
  # BUGFIX: the STRATZ implementation was an empty heredoc, so this branch
  # silently produced no data/matches.parquet and the script died later in
  # the Elo step with a confusing FileNotFoundError. Fail fast instead.
  echo "ERROR: STRATZ details fetch is not implemented; use USE_STRATZ_DETAILS=0" >&2
  exit 1
else
  echo "[2/6] Fetch match details + draft via OpenDota (robust)"
  $PY - <<'PYCODE'
import os, time, socket, pandas as pd, requests, httpx
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# IPv4-only resolution (when FORCE_IPV4=1): monkeypatch getaddrinfo so every
# lookup is pinned to AF_INET, working around broken IPv6 at the provider.
if os.getenv("FORCE_IPV4","1") == "1":
    _orig = socket.getaddrinfo
    def _v4(host, port, family=0, type=0, proto=0, flags=0):
        return _orig(host, port, socket.AF_INET, type, proto, flags)
    socket.getaddrinfo = _v4

API_KEY = os.getenv("OPENDOTA_API_KEY")
SLEEP_PER_100 = float(os.getenv("SLEEP_PER_100","1.0"))
BASE = "https://api.opendota.com/api/matches/{mid}"
headers = {"User-Agent":"korobkaGames/1.0","Accept":"application/json","Connection":"close"}

# match_id list to fetch
pro = pd.read_parquet("data/pro_matches.parquet")
match_ids = pro['match_id'].drop_duplicates().tolist()

# requests session with retry/backoff on transient failures and 429s
sess = requests.Session()
retries = Retry(total=6, connect=6, read=6, backoff_factor=0.7,
                status_forcelist=[429,500,502,503,504],
                allowed_methods=frozenset(["GET"]))
sess.mount("https://", HTTPAdapter(max_retries=retries))

def fetch_one(mid: int):
    """Fetch one match as JSON; on SSL errors fall back to httpx (HTTP/2 off)."""
    url = BASE.format(mid=mid)
    if API_KEY:
        url += f"?api_key={API_KEY}"
    try:
        r = sess.get(url, headers=headers, timeout=(5,40))
        r.raise_for_status()
        return r.json()
    except requests.exceptions.SSLError:
        # fallback: httpx with http2 disabled
        with httpx.Client(http2=False, timeout=40, headers=headers) as client:
            resp = client.get(url)
            resp.raise_for_status()
            return resp.json()

match_rows, draft_rows, failed = [], [], []
for i, mid in enumerate(match_ids, 1):
    try:
        m = fetch_one(int(mid))
        match_rows.append({
            "match_id": int(mid),
            "date": pd.to_datetime(m.get("start_time",0), unit="s"),
            "patch": str(m.get("patch")),
            "radiant_win": bool(m.get("radiant_win")),
            "duration_sec": m.get("duration"),
            # NOTE(review): assumes the payload nests the id under a "league"
            # object — confirm against the OpenDota /matches response schema.
            "league_id": (m.get("league") or {}).get("id"),
            "series_type": m.get("series_type"),
        })
        # picks_bans is absent for unparsed matches — default to []
        for pb in (m.get("picks_bans") or []):
            draft_rows.append({
                "match_id": int(mid),
                "is_pick": pb.get("is_pick", False),
                "team": pb.get("team"),
                "hero_id": pb.get("hero_id"),
                "order": pb.get("order")
            })
    except Exception:
        # best-effort: record the failure and keep fetching the rest
        failed.append(int(mid))
    # throttle every 100 requests to soften 429 rate limiting
    if i % 100 == 0:
        time.sleep(SLEEP_PER_100)

pd.DataFrame(match_rows).to_parquet("data/matches.parquet", index=False)
pd.DataFrame(draft_rows).to_parquet("data/draft.parquet", index=False)
pd.Series(failed, name="failed_match_id").to_csv("data/matches_failed.csv", index=False)
print(f"Saved via OpenDota: matches={len(match_rows)} draft_rows={len(draft_rows)} failed={len(failed)}")
if not match_rows:
    raise SystemExit("OpenDota details: ничего не скачано")
PYCODE
fi
# ===========================
# [3/6] Simple Elo baseline
# ===========================
echo "[3/6] Build Elo"
$PY - <<'PYCODE'
"""Compute pre-match Elo ratings for both teams and save data/elo.parquet."""
import pandas as pd

matches = pd.read_parquet("data/matches.parquet").sort_values("date")
pro = pd.read_parquet("data/pro_matches.parquet")[['match_id','radiant_name','dire_name']]
df = matches.merge(pro, on='match_id', how='left')

K = 24        # Elo update step size
elo = {}      # team name -> current rating; unseen teams start at 1500

def get_elo(team):
    return elo.get(team, 1500)

def expect(a, b):
    # Expected score of a rating-`a` team versus a rating-`b` team.
    return 1.0/(1.0+10**((b-a)/400))

records = []
for _, row in df.iterrows():
    radiant, dire = row['radiant_name'], row['dire_name']
    ra, rb = get_elo(radiant), get_elo(dire)
    ea, eb = expect(ra, rb), expect(rb, ra)
    y = 1.0 if row['radiant_win'] else 0.0
    # Record the PRE-match ratings, then update them with the result.
    records.append({
        'match_id': row['match_id'],
        'date': row['date'],
        'elo_radiant': ra, 'elo_dire': rb,
        'elo_diff_90': ra - rb  # simplified: no rolling window
    })
    elo[radiant] = ra + K*(y-ea)
    elo[dire] = rb + K*((1-y)-eb)

pd.DataFrame(records).to_parquet("data/elo.parquet", index=False)
print("Saved data/elo.parquet")
PYCODE
# [4] replaces the old build_dataset_draft.py
# Presumably writes data/dataset_mixed.parquet consumed by the training
# step below — confirm against educationML/build_dataset_mixed.py.
$PY educationML/build_dataset_mixed.py
# ===========================
# [5/6] Model training
# ===========================
echo "[5/6] Train CatBoost"
$PY - <<'PYCODE'
"""Train a CatBoost win-probability model on the mixed draft dataset."""
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss, brier_score_loss

df = pd.read_parquet("data/dataset_mixed.parquet").sort_values("date")
cat_cols = ['patch','source','r_h1','r_h2','r_h3','r_h4','r_h5','d_h1','d_h2','d_h3','d_h4','d_h5']

# Features = everything except target/date (and match_id when present).
drop_cols = ['y','date','match_id'] if 'match_id' in df.columns else ['y','date']
X = df.drop(columns=drop_cols)
y = df['y']
cat_idx = [X.columns.get_loc(c) for c in cat_cols]

# Walk-forward CV so future matches never leak into the training folds.
tscv = TimeSeriesSplit(n_splits=5)
ll, br = [], []
for train_idx, test_idx in tscv.split(X):
    model = CatBoostClassifier(
        depth=8, iterations=1200, learning_rate=0.03,
        loss_function='Logloss', eval_metric='Logloss', verbose=False
    )
    model.fit(X.iloc[train_idx], y.iloc[train_idx], cat_features=cat_idx)
    p = model.predict_proba(X.iloc[test_idx])[:,1]
    ll.append(log_loss(y.iloc[test_idx], p))
    br.append(brier_score_loss(y.iloc[test_idx], p))
print("CV LogLoss=", sum(ll)/len(ll), " Brier=", sum(br)/len(br))

# Refit on the full history and persist model + feature order for serving.
final = CatBoostClassifier(depth=8, iterations=1500, learning_rate=0.03,
                           loss_function='Logloss', verbose=False)
final.fit(X, y, cat_features=cat_idx)
final.save_model("artifacts/model_draft.cbm")
pd.Series(X.columns).to_csv("artifacts/feature_order.csv", index=False)
print("Saved artifacts/model_draft.cbm and artifacts/feature_order.csv")
PYCODE
# ===========================
# [6/6] Prediction REST service
# ===========================
echo "[6/6] Start API → http://127.0.0.1:$PORT"

# NOTE(review): binds 0.0.0.0 (all interfaces), not just the loopback that
# the message above suggests — confirm external exposure is intended.
# `exec` replaces this shell with uvicorn so signals reach the server directly.
exec uvicorn serve:app --host 0.0.0.0 --port "$PORT"