#!/usr/bin/env bash
# Pipeline: fetch Dota 2 match data (OpenDota / STRATZ), build an Elo baseline
# and a mixed dataset, train a CatBoost draft model, then serve predictions
# over a FastAPI/uvicorn REST endpoint.
#
# Required layout: educationML/*.py helper scripts, serve.py with `app`.
# Outputs: data/*.parquet, artifacts/model_draft.cbm, artifacts/feature_order.csv
set -euo pipefail

# ===========================
# Config (override via env)
# ===========================
PY=python3
VENV=".venv"

# REST service port:
PORT="${PORT:-8000}"

# Force IPv4 (useful when the provider's IPv6 is flaky):
FORCE_IPV4="${FORCE_IPV4:-1}"

# API keys (optional, but improves stability/quotas):
#   export OPENDOTA_API_KEY=...
#   export STRATZ_TOKEN=...
OPENDOTA_API_KEY="${OPENDOTA_API_KEY:-}"
STRATZ_TOKEN="${STRATZ_TOKEN:-}"

# Use STRATZ instead of OpenDota.
# NOTE(fix): the original hardcoded these to 1 right before the ":-0" env
# defaults, which (a) made the env override dead code and (b) routed the
# details step into an empty STRATZ stub, so data/matches.parquet was never
# written. They are now genuine env switches defaulting to the implemented
# OpenDota path.
USE_STRATZ_LIST="${USE_STRATZ_LIST:-0}"       # step 1 only (match list)
USE_STRATZ_DETAILS="${USE_STRATZ_DETAILS:-0}" # step 2 only (details/draft)

# NOTE(fix): PAGES was exported and interpolated below but never assigned,
# which aborts the script under `set -u`. Default to 10 pages; override via env.
PAGES="${PAGES:-10}"

# Pause after every 100 requests (soften 429 rate limiting):
SLEEP_PER_100="${SLEEP_PER_100:-1.0}"

# ===========================
# Environment & dependencies
# ===========================
if [ ! -d "$VENV" ]; then
  "$PY" -m venv "$VENV"
fi
# shellcheck disable=SC1091
source "$VENV/bin/activate"
pip install -U pip
pip install pandas pyarrow requests httpx "urllib3>=2.2" certifi \
  catboost scikit-learn fastapi uvicorn

mkdir -p data artifacts

# ===========================
# Helpers
# ===========================
# Everything the Python child processes read via os.getenv():
export FORCE_IPV4 OPENDOTA_API_KEY STRATZ_TOKEN PAGES SLEEP_PER_100

# ===========================
# [1/7] Public (pub) match list
# ===========================
# [1b] High-rank public matches
"$PY" educationML/fetch_public_matches.py
# [2b] Public match details (heroes from `players`)
"$PY" educationML/fetch_public_details.py

# ===========================
# [2/7] Pro match list
# ===========================
echo "[1/6] Fetch pro matches via OpenDota (pages=$PAGES)"
"$PY" educationML/fetch_pro_matches_opendota.py

# =========================================
# [2/6] Match details + draft (robust)
# =========================================
if [ "$USE_STRATZ_DETAILS" = "1" ]; then
  echo "[2/6] Fetch match details + draft via STRATZ"
  # NOTE(fix): the STRATZ fetcher is not implemented yet. The original heredoc
  # was empty and silently succeeded without producing data/matches.parquet,
  # breaking every downstream step. Fail loudly instead.
  "$PY" - <<'PYCODE'
raise SystemExit("STRATZ details fetcher is not implemented; run with USE_STRATZ_DETAILS=0")
PYCODE
else
  echo "[2/6] Fetch match details + draft via OpenDota (robust)"
  "$PY" - <<'PYCODE'
import os, time, socket, sys, pandas as pd, requests, httpx
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# IPv4-only resolution (when FORCE_IPV4=1): monkey-patch getaddrinfo so every
# connection resolves AF_INET addresses only.
if os.getenv("FORCE_IPV4", "1") == "1":
    _orig = socket.getaddrinfo
    def _v4(host, port, family=0, type=0, proto=0, flags=0):
        return _orig(host, port, socket.AF_INET, type, proto, flags)
    socket.getaddrinfo = _v4

API_KEY = os.getenv("OPENDOTA_API_KEY")
SLEEP_PER_100 = float(os.getenv("SLEEP_PER_100", "1.0"))
BASE = "https://api.opendota.com/api/matches/{mid}"
headers = {"User-Agent": "korobkaGames/1.0",
           "Accept": "application/json",
           "Connection": "close"}

# Match ids to fetch (deduplicated, from step [1/6]).
pro = pd.read_parquet("data/pro_matches.parquet")
match_ids = pro['match_id'].drop_duplicates().tolist()

# requests session with automatic retries on transient/rate-limit errors.
sess = requests.Session()
retries = Retry(total=6, connect=6, read=6, backoff_factor=0.7,
                status_forcelist=[429, 500, 502, 503, 504],
                allowed_methods=frozenset(["GET"]))
sess.mount("https://", HTTPAdapter(max_retries=retries))

def fetch_one(mid: int):
    """GET one match; on SSL failure fall back to httpx with HTTP/2 disabled."""
    url = BASE.format(mid=mid)
    if API_KEY:
        url += f"?api_key={API_KEY}"
    try:
        r = sess.get(url, headers=headers, timeout=(5, 40))
        r.raise_for_status()
        return r.json()
    except requests.exceptions.SSLError:
        with httpx.Client(http2=False, timeout=40, headers=headers) as client:
            resp = client.get(url)
            resp.raise_for_status()
            return resp.json()

match_rows, draft_rows, failed = [], [], []
for i, mid in enumerate(match_ids, 1):
    try:
        m = fetch_one(int(mid))
        match_rows.append({
            "match_id": int(mid),
            "date": pd.to_datetime(m.get("start_time", 0), unit="s"),
            "patch": str(m.get("patch")),
            "radiant_win": bool(m.get("radiant_win")),
            "duration_sec": m.get("duration"),
            # NOTE(review): assumes the payload nests the league id under
            # "league" -> "id"; verify against the OpenDota response schema.
            "league_id": (m.get("league") or {}).get("id"),
            "series_type": m.get("series_type"),
        })
        for pb in (m.get("picks_bans") or []):
            draft_rows.append({
                "match_id": int(mid),
                "is_pick": pb.get("is_pick", False),
                "team": pb.get("team"),
                "hero_id": pb.get("hero_id"),
                "order": pb.get("order"),
            })
    except Exception:
        # Collect failures and keep going; they are persisted for a later retry.
        failed.append(int(mid))
    if i % 100 == 0:
        time.sleep(SLEEP_PER_100)

pd.DataFrame(match_rows).to_parquet("data/matches.parquet", index=False)
pd.DataFrame(draft_rows).to_parquet("data/draft.parquet", index=False)
pd.Series(failed, name="failed_match_id").to_csv("data/matches_failed.csv", index=False)
print(f"Saved via OpenDota: matches={len(match_rows)} draft_rows={len(draft_rows)} failed={len(failed)}")
if not match_rows:
    raise SystemExit("OpenDota details: ничего не скачано")
PYCODE
fi

# ===========================
# [3/6] Simple Elo baseline
# ===========================
echo "[3/6] Build Elo"
"$PY" - <<'PYCODE'
import pandas as pd

matches = pd.read_parquet("data/matches.parquet").sort_values("date")
pro = pd.read_parquet("data/pro_matches.parquet")[['match_id', 'radiant_name', 'dire_name']]
df = matches.merge(pro, on='match_id', how='left')

# Standard Elo update with fixed K; unknown teams start at 1500.
K = 24
elo = {}
def get_elo(t): return elo.get(t, 1500)
def expect(a, b): return 1.0 / (1.0 + 10 ** ((b - a) / 400))

rows = []
for _, r in df.iterrows():
    A, B = r['radiant_name'], r['dire_name']
    ra, rb = get_elo(A), get_elo(B)
    ea, eb = expect(ra, rb), expect(rb, ra)
    y = 1.0 if r['radiant_win'] else 0.0
    # Ratings are recorded *before* the update, so features never leak the
    # match outcome.
    rows.append({
        'match_id': r['match_id'],
        'date': r['date'],
        'elo_radiant': ra,
        'elo_dire': rb,
        'elo_diff_90': ra - rb  # simplified: no rolling window
    })
    elo[A] = ra + K * (y - ea)
    elo[B] = rb + K * ((1 - y) - eb)

pd.DataFrame(rows).to_parquet("data/elo.parquet", index=False)
print("Saved data/elo.parquet")
PYCODE

# [4] replaces the old build_dataset_draft.py
"$PY" educationML/build_dataset_mixed.py

# ===========================
# [5/6] Model training
# ===========================
echo "[5/6] Train CatBoost"
"$PY" - <<'PYCODE'
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss, brier_score_loss

df = pd.read_parquet("data/dataset_mixed.parquet").sort_values("date")
# Categorical features: patch, data source, and the ten hero slots.
cat_cols = ['patch', 'source',
            'r_h1', 'r_h2', 'r_h3', 'r_h4', 'r_h5',
            'd_h1', 'd_h2', 'd_h3', 'd_h4', 'd_h5']
X = (df.drop(columns=['y', 'date', 'match_id'])
     if 'match_id' in df.columns else df.drop(columns=['y', 'date']))
y = df['y']
cat_idx = [X.columns.get_loc(c) for c in cat_cols]

# Time-ordered CV: TimeSeriesSplit keeps training folds strictly before test
# folds, matching how the model will be used.
tscv = TimeSeriesSplit(n_splits=5)
ll, br = [], []
for tr, te in tscv.split(X):
    model = CatBoostClassifier(
        depth=8, iterations=1200, learning_rate=0.03,
        loss_function='Logloss', eval_metric='Logloss', verbose=False
    )
    model.fit(X.iloc[tr], y.iloc[tr], cat_features=cat_idx)
    p = model.predict_proba(X.iloc[te])[:, 1]
    ll.append(log_loss(y.iloc[te], p))
    br.append(brier_score_loss(y.iloc[te], p))
print("CV LogLoss=", sum(ll) / len(ll), " Brier=", sum(br) / len(br))

# Final model is refit on all data with more iterations.
final = CatBoostClassifier(depth=8, iterations=1500, learning_rate=0.03,
                           loss_function='Logloss', verbose=False)
final.fit(X, y, cat_features=cat_idx)
final.save_model("artifacts/model_draft.cbm")
# Persist the feature order so the serving layer feeds columns identically.
pd.Series(X.columns).to_csv("artifacts/feature_order.csv", index=False)
print("Saved artifacts/model_draft.cbm and artifacts/feature_order.csv")
PYCODE

# ===========================
# [6/6] Prediction REST service
# ===========================
echo "[6/6] Start API → http://127.0.0.1:$PORT"
exec uvicorn serve:app --host 0.0.0.0 --port "$PORT"