# --- Web-viewer paste residue (commented out so the script parses) ---
# Files: predictV1/run.sh
# 237 lines, 8.0 KiB, Bash, "Raw Normal View History"
# NOTE: because these lines precede the shebang, the shebang is not honored
# when the file is executed directly — run this script via `bash run.sh`.
#!/usr/bin/env bash
set -euo pipefail
# ===========================
# Config (tweak as desired)
# ===========================
PY=python3
VENV=".venv"
# Port for the REST service:
PORT="${PORT:-8000}"
# Force IPv4 (useful when the provider's IPv6 is flaky):
FORCE_IPV4="${FORCE_IPV4:-1}"
# API keys (optional, but improve stability/quotas):
# export OPENDOTA_API_KEY=...
# export STRATZ_TOKEN=...
OPENDOTA_API_KEY="${OPENDOTA_API_KEY:-}"
STRATZ_TOKEN="${STRATZ_TOKEN:-}"
# Use STRATZ instead of OpenDota
# only for step 1 (match list):     USE_STRATZ_LIST=1
# only for step 2 (details/draft):  USE_STRATZ_DETAILS=1
USE_STRATZ_LIST="${USE_STRATZ_LIST:-0}"
USE_STRATZ_DETAILS="${USE_STRATZ_DETAILS:-0}"
# Number of OpenDota list pages to fetch in the pro-matches step.
# BUG FIX: PAGES was exported and interpolated later in the script but never
# assigned, so with `set -u` the run aborted unless the caller exported it.
PAGES="${PAGES:-10}"   # TODO(review): confirm a sensible default page count
# Delay after every 100 requests (softens HTTP 429 rate limiting):
SLEEP_PER_100="${SLEEP_PER_100:-1.0}"
# ===========================
# Environment and dependencies
# ===========================
if [[ ! -d "$VENV" ]]; then
  "$PY" -m venv "$VENV"
fi
# shellcheck disable=SC1091
source "$VENV/bin/activate"
pip install -U pip
pip install pandas pyarrow requests httpx "urllib3>=2.2" certifi catboost scikit-learn fastapi uvicorn
mkdir -p data artifacts
# ===========================
# Helpers: export knobs for the fetcher scripts invoked below
# ===========================
export FORCE_IPV4
export OPENDOTA_API_KEY
export STRATZ_TOKEN
export PAGES
export SLEEP_PER_100
# ===========================
# [1/7] List of pub matches
# ===========================
# NOTE(review): these header comments number steps out of 7 while the echo
# lines below number out of 6 — the two schemes are inconsistent; confirm
# which is current.
# [1b] Public matches (high-rank)
$PY educationML/fetch_public_matches.py
# [2b] Public match details (heroes taken from the players array)
$PY educationML/fetch_public_details.py
# ===========================
# [2/7] List of pro matches
# ===========================
# NOTE(review): $PAGES is exported earlier but never assigned a default in
# this file; under `set -u` this echo aborts unless the caller exports PAGES.
echo "[1/6] Fetch pro matches via OpenDota (pages=$PAGES)"
$PY educationML/fetch_pro_matches_opendota.py
# =========================================
# [2/6] Match details + draft (robust fetcher)
# =========================================
if [ "$USE_STRATZ_DETAILS" = "1" ]; then
  # BUG FIX: the original STRATZ branch ran an EMPTY Python heredoc — it
  # printed a progress line, produced no data/matches.parquet, and the
  # pipeline then crashed much later in the Elo step with a confusing
  # FileNotFoundError. Fail fast with an explicit message instead.
  echo "[2/6] STRATZ details fetch is not implemented; unset USE_STRATZ_DETAILS to use OpenDota" >&2
  exit 1
else
echo "[2/6] Fetch match details + draft via OpenDota (robust)"
$PY - <<'PYCODE'
# Download per-match details from OpenDota for every pro match id, extracting
# one row per match plus one row per pick/ban; failures are recorded, not fatal.
import os, time, socket, sys, pandas as pd, requests, httpx
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Force IPv4-only name resolution when FORCE_IPV4=1 by monkeypatching
# socket.getaddrinfo (works around broken provider IPv6).
if os.getenv("FORCE_IPV4","1") == "1":
    _orig = socket.getaddrinfo
    def _v4(host, port, family=0, type=0, proto=0, flags=0):
        return _orig(host, port, socket.AF_INET, type, proto, flags)
    socket.getaddrinfo = _v4

API_KEY = os.getenv("OPENDOTA_API_KEY")
SLEEP_PER_100 = float(os.getenv("SLEEP_PER_100","1.0"))
BASE = "https://api.opendota.com/api/matches/{mid}"
headers = {"User-Agent":"korobkaGames/1.0","Accept":"application/json","Connection":"close"}

# List of match ids to fetch, from the previously saved pro-matches parquet.
pro = pd.read_parquet("data/pro_matches.parquet")
match_ids = pro['match_id'].drop_duplicates().tolist()

# requests session with retries on transient/ratelimit statuses.
sess = requests.Session()
retries = Retry(total=6, connect=6, read=6, backoff_factor=0.7,
                status_forcelist=[429,500,502,503,504],
                allowed_methods=frozenset(["GET"]))
sess.mount("https://", HTTPAdapter(max_retries=retries))

def fetch_one(mid: int):
    """Fetch one match JSON; on SSLError retry once via httpx (HTTP/2 off)."""
    url = BASE.format(mid=mid)
    if API_KEY:
        # NOTE(review): the key travels in the query string and can leak into
        # logs/proxies — confirm whether OpenDota accepts a header instead.
        url += f"?api_key={API_KEY}"
    try:
        r = sess.get(url, headers=headers, timeout=(5,40))
        r.raise_for_status()
        return r.json()
    except requests.exceptions.SSLError:
        # fallback: httpx with http2 disabled
        with httpx.Client(http2=False, timeout=40, headers=headers) as client:
            resp = client.get(url)
            resp.raise_for_status()
            return resp.json()

match_rows, draft_rows, failed = [], [], []
for i, mid in enumerate(match_ids, 1):
    try:
        m = fetch_one(int(mid))
        match_rows.append({
            "match_id": int(mid),
            "date": pd.to_datetime(m.get("start_time",0), unit="s"),
            "patch": str(m.get("patch")),
            "radiant_win": bool(m.get("radiant_win")),
            "duration_sec": m.get("duration"),
            "league_id": (m.get("league") or {}).get("id"),
            "series_type": m.get("series_type"),
        })
        for pb in (m.get("picks_bans") or []):
            draft_rows.append({
                "match_id": int(mid),
                "is_pick": pb.get("is_pick", False),
                "team": pb.get("team"),
                "hero_id": pb.get("hero_id"),
                "order": pb.get("order")
            })
    except Exception:
        # Best-effort: remember the id and keep going; failures are saved below.
        failed.append(int(mid))
    if i % 100 == 0:
        # Throttle every 100 requests to soften 429 rate limiting.
        time.sleep(SLEEP_PER_100)

pd.DataFrame(match_rows).to_parquet("data/matches.parquet", index=False)
pd.DataFrame(draft_rows).to_parquet("data/draft.parquet", index=False)
pd.Series(failed, name="failed_match_id").to_csv("data/matches_failed.csv", index=False)
print(f"Saved via OpenDota: matches={len(match_rows)} draft_rows={len(draft_rows)} failed={len(failed)}")
if not match_rows:
    raise SystemExit("OpenDota details: ничего не скачано")
PYCODE
fi
# ===========================
# [3/6] Simple Elo baseline
# ===========================
echo "[3/6] Build Elo"
$PY - <<'PYCODE'
# Walk matches in chronological order and record each team's PRE-match Elo
# rating, then update ratings from the result. Output: data/elo.parquet.
import pandas as pd

match_df = pd.read_parquet("data/matches.parquet").sort_values("date")
team_names = pd.read_parquet("data/pro_matches.parquet")[['match_id','radiant_name','dire_name']]
merged = match_df.merge(team_names, on='match_id', how='left')

K = 24          # Elo K-factor: step size of each rating update
ratings = {}    # team name -> current rating; unseen teams start at 1500

def current_rating(team):
    return ratings.get(team, 1500)

def expected_score(a, b):
    # Standard logistic Elo win expectation for a team rated `a` vs `b`.
    return 1.0/(1.0+10**((b-a)/400))

records = []
for row in merged.itertuples(index=False):
    rad_team, dire_team = row.radiant_name, row.dire_name
    rad_rating = current_rating(rad_team)
    dire_rating = current_rating(dire_team)
    rad_exp = expected_score(rad_rating, dire_rating)
    dire_exp = expected_score(dire_rating, rad_rating)
    outcome = 1.0 if row.radiant_win else 0.0
    records.append({
        'match_id': row.match_id,
        'date': row.date,
        'elo_radiant': rad_rating, 'elo_dire': dire_rating,
        'elo_diff_90': rad_rating - dire_rating  # simplified: no rolling window
    })
    # Update AFTER recording, so stored values are pre-match ratings.
    ratings[rad_team] = rad_rating + K*(outcome-rad_exp)
    ratings[dire_team] = dire_rating + K*((1-outcome)-dire_exp)

pd.DataFrame(records).to_parquet("data/elo.parquet", index=False)
print("Saved data/elo.parquet")
PYCODE
# [4] Build the mixed pro+public training dataset
# (replaces the old build_dataset_draft.py step)
$PY educationML/build_dataset_mixed.py
# ===========================
# [5/6] Model training
# ===========================
echo "[5/6] Train CatBoost"
$PY - <<'PYCODE'
# Train a CatBoost win-probability classifier on the mixed dataset.
# First report time-ordered CV metrics, then refit on all data and save
# the model plus the exact feature column order used at inference time.
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss, brier_score_loss

df = pd.read_parquet("data/dataset_mixed.parquet").sort_values("date")

# Categorical features: patch, data source, and the ten drafted hero ids.
cat_cols = ['patch','source','r_h1','r_h2','r_h3','r_h4','r_h5','d_h1','d_h2','d_h3','d_h4','d_h5']

if 'match_id' in df.columns:
    X = df.drop(columns=['y','date','match_id'])
else:
    X = df.drop(columns=['y','date'])
y = df['y']
cat_idx = [X.columns.get_loc(name) for name in cat_cols]

# 5-fold expanding-window CV; no shuffling since rows are sorted by date.
splitter = TimeSeriesSplit(n_splits=5)
ll, br = [], []
for train_idx, test_idx in splitter.split(X):
    fold_model = CatBoostClassifier(
        depth=8, iterations=1200, learning_rate=0.03,
        loss_function='Logloss', eval_metric='Logloss', verbose=False
    )
    fold_model.fit(X.iloc[train_idx], y.iloc[train_idx], cat_features=cat_idx)
    probs = fold_model.predict_proba(X.iloc[test_idx])[:,1]
    ll.append(log_loss(y.iloc[test_idx], probs))
    br.append(brier_score_loss(y.iloc[test_idx], probs))
print("CV LogLoss=", sum(ll)/len(ll), " Brier=", sum(br)/len(br))

# Final model: refit on the full dataset with extra iterations.
final = CatBoostClassifier(depth=8, iterations=1500, learning_rate=0.03,
                           loss_function='Logloss', verbose=False)
final.fit(X, y, cat_features=cat_idx)
final.save_model("artifacts/model_draft.cbm")
pd.Series(X.columns).to_csv("artifacts/feature_order.csv", index=False)
print("Saved artifacts/model_draft.cbm and artifacts/feature_order.csv")
PYCODE
# ===========================
# [6/6] REST prediction service
# ===========================
# NOTE(review): the banner advertises 127.0.0.1 but uvicorn binds 0.0.0.0
# (all interfaces) — confirm whether external exposure is intended.
# `exec` replaces this shell with uvicorn, so signals reach the server.
echo "[6/6] Start API → http://127.0.0.1:$PORT"
exec uvicorn serve:app --host 0.0.0.0 --port "$PORT"