domain_updater.py 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153
  1. #!/usr/bin/env python3
  2. import argparse
  3. import csv
  4. import datetime as dt
  5. import functools
  6. import json
  7. import math
  8. import os
  9. import re
  10. import subprocess
  11. import sys
  12. import tempfile
  13. import urllib.parse
  14. import urllib.request
  15. DOMAIN_RE = re.compile(r"^(?=.{1,253}$)(?!-)[A-Za-z0-9-]{1,63}(?<!-)(\.(?!-)[A-Za-z0-9-]{1,63}(?<!-))+$")
  16. IPV4_RE = re.compile(r"^(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}$")
  17. def utc_now_iso():
  18. return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
  19. def read_json_file(path, default=None):
  20. if default is None:
  21. default = {}
  22. if not os.path.exists(path):
  23. return default
  24. try:
  25. with open(path, "r", encoding="utf-8") as f:
  26. return json.load(f)
  27. except (ValueError, json.JSONDecodeError):
  28. return default
  29. def _atomic_write(path, payload_bytes):
  30. """Write `payload_bytes` to `path` atomically.
  31. Strategy: write to a sibling temp file, fsync, then os.replace.
  32. Concurrent readers either see the previous version or the new
  33. one — never a truncated/empty file. Required for state.json,
  34. which is the sole fallback source on the next run.
  35. """
  36. parent = os.path.dirname(path) or "."
  37. os.makedirs(parent, exist_ok=True)
  38. fd, tmp_path = tempfile.mkstemp(
  39. prefix=".{}.".format(os.path.basename(path)),
  40. suffix=".tmp",
  41. dir=parent,
  42. )
  43. try:
  44. with os.fdopen(fd, "wb") as f:
  45. f.write(payload_bytes)
  46. f.flush()
  47. try:
  48. os.fsync(f.fileno())
  49. except OSError:
  50. # fsync may fail on some filesystems (e.g. tmpfs without
  51. # backing store); the rename below still gives atomicity
  52. # within a single filesystem.
  53. pass
  54. os.replace(tmp_path, path)
  55. except Exception:
  56. try:
  57. os.unlink(tmp_path)
  58. except OSError:
  59. pass
  60. raise
  61. def write_json_file(path, data):
  62. payload = json.dumps(data, ensure_ascii=True, indent=2)
  63. _atomic_write(path, payload.encode("utf-8"))
  64. def write_text_file(path, data):
  65. _atomic_write(path, data.encode("utf-8"))
  66. def build_url(base_url, params):
  67. if not params:
  68. return base_url
  69. parsed = urllib.parse.urlparse(base_url)
  70. current = urllib.parse.parse_qs(parsed.query)
  71. for k, v in params.items():
  72. current[k] = [str(v)]
  73. query = urllib.parse.urlencode(current, doseq=True)
  74. return urllib.parse.urlunparse(parsed._replace(query=query))
  75. def resolve_path(base_dir, path_value):
  76. path_text = str(path_value or "").strip()
  77. if not path_text:
  78. return ""
  79. if os.path.isabs(path_text):
  80. return os.path.normpath(path_text)
  81. return os.path.normpath(os.path.join(base_dir, path_text))
  82. def get_source_type(cfg):
  83. source_cfg = cfg.get("source", {})
  84. if isinstance(source_cfg, dict):
  85. source_type = str(source_cfg.get("type", "api")).strip().lower()
  86. if source_type:
  87. return source_type
  88. return "api"
  89. def fetch_api_json(cfg):
  90. api = cfg["api"]
  91. url = build_url(api["url"], api.get("params", {}))
  92. method = api.get("method", "GET").upper()
  93. headers = api.get("headers", {})
  94. timeout = int(api.get("timeout_sec", 10))
  95. body_obj = api.get("body")
  96. body = None
  97. if body_obj is not None:
  98. body = json.dumps(body_obj).encode("utf-8")
  99. headers = {**headers, "Content-Type": "application/json"}
  100. req = urllib.request.Request(url=url, data=body, headers=headers, method=method)
  101. with urllib.request.urlopen(req, timeout=timeout) as resp:
  102. raw = resp.read().decode("utf-8", errors="replace")
  103. return json.loads(raw)
  104. def load_cfst_rows(cfg, config_path_abs):
  105. cfst_cfg = cfg.get("cfst_local", {})
  106. config_dir = os.path.dirname(config_path_abs)
  107. work_dir = resolve_path(config_dir, cfst_cfg.get("work_dir", "./cfst"))
  108. binary_path = resolve_path(work_dir, cfst_cfg.get("binary", "./cfst"))
  109. result_file = resolve_path(work_dir, cfst_cfg.get("result_file", "result.csv"))
  110. encoding = str(cfst_cfg.get("encoding", "utf-8")).strip() or "utf-8"
  111. skip_run = bool(cfst_cfg.get("skip_run", False))
  112. timeout_sec = int(cfst_cfg.get("run_timeout_sec", 600))
  113. run_args = cfst_cfg.get("run_args", ["-o", os.path.basename(result_file)])
  114. if not isinstance(run_args, list):
  115. raise ValueError("cfst_local.run_args must be an array")
  116. command = [binary_path] + [str(x) for x in run_args]
  117. if not skip_run:
  118. completed = subprocess.run(
  119. command,
  120. cwd=work_dir,
  121. check=False,
  122. capture_output=True,
  123. text=True,
  124. encoding=encoding,
  125. errors="replace",
  126. timeout=timeout_sec,
  127. )
  128. if completed.returncode != 0:
  129. stderr = (completed.stderr or "").strip()
  130. stdout = (completed.stdout or "").strip()
  131. details = stderr or stdout or f"exit code {completed.returncode}"
  132. raise RuntimeError(f"cfst run failed: {details}")
  133. if not os.path.exists(result_file):
  134. raise RuntimeError(f"cfst result file not found: {result_file}")
  135. with open(result_file, "r", encoding=encoding, errors="replace", newline="") as f:
  136. reader = csv.reader(f)
  137. rows = [row for row in reader if any(str(col).strip() for col in row)]
  138. header_rows = int(cfst_cfg.get("header_rows", 1))
  139. if len(rows) <= header_rows:
  140. raise RuntimeError("cfst result has no data rows")
  141. columns_cfg = cfst_cfg.get("columns", {})
  142. if not isinstance(columns_cfg, dict):
  143. raise ValueError("cfst_local.columns must be an object")
  144. def col_index(name, default_index):
  145. raw = columns_cfg.get(name, default_index)
  146. try:
  147. idx = int(raw)
  148. except Exception as exc:
  149. raise ValueError(f"cfst_local.columns.{name} must be an integer") from exc
  150. if idx < 0:
  151. raise ValueError(f"cfst_local.columns.{name} must be >= 0")
  152. return idx
  153. ip_idx = col_index("ip", 0)
  154. sent_idx = col_index("sent", 1)
  155. received_idx = col_index("received", 2)
  156. loss_idx = col_index("loss_rate", 3)
  157. latency_idx = col_index("avg_latency", 4)
  158. speed_idx = col_index("download_speed", 5)
  159. region_idx = col_index("region", 6)
  160. out = []
  161. for row in rows[header_rows:]:
  162. if ip_idx >= len(row):
  163. continue
  164. domain = normalize_domain(row[ip_idx])
  165. if not domain:
  166. continue
  167. out.append(
  168. {
  169. "domain": domain,
  170. "ip": domain,
  171. "sent": row[sent_idx].strip() if sent_idx < len(row) else "",
  172. "received": row[received_idx].strip() if received_idx < len(row) else "",
  173. "loss_rate": row[loss_idx].strip() if loss_idx < len(row) else "",
  174. "avg_latency": row[latency_idx].strip() if latency_idx < len(row) else "",
  175. "download_speed": row[speed_idx].strip() if speed_idx < len(row) else "",
  176. "region": row[region_idx].strip() if region_idx < len(row) else "",
  177. }
  178. )
  179. if not out:
  180. raise RuntimeError("cfst result parsed to zero valid rows")
  181. return out
  182. def flatten_values(value):
  183. out = []
  184. if isinstance(value, str):
  185. out.append(value)
  186. elif isinstance(value, list):
  187. for item in value:
  188. out.extend(flatten_values(item))
  189. elif isinstance(value, dict):
  190. for item in value.values():
  191. out.extend(flatten_values(item))
  192. return out
  193. def get_by_json_path(data, path):
  194. cur = data
  195. for part in path.split("."):
  196. if isinstance(cur, dict) and part in cur:
  197. cur = cur[part]
  198. else:
  199. return None
  200. return cur
  201. def get_values_by_path(data, path):
  202. parts = path.split(".")
  203. def walk(cur, idx):
  204. if idx >= len(parts):
  205. return [cur]
  206. part = parts[idx]
  207. if part.endswith("[]"):
  208. key = part[:-2]
  209. if isinstance(cur, dict):
  210. arr = cur.get(key)
  211. else:
  212. arr = None
  213. if not isinstance(arr, list):
  214. return []
  215. out = []
  216. for item in arr:
  217. out.extend(walk(item, idx + 1))
  218. return out
  219. if isinstance(cur, dict) and part in cur:
  220. return walk(cur[part], idx + 1)
  221. return []
  222. return walk(data, 0)
  223. def parse_domains(payload, parser_cfg):
  224. domains = []
  225. for p in parser_cfg.get("field_paths", []):
  226. values = get_values_by_path(payload, p)
  227. domains.extend(flatten_values(values))
  228. for p in parser_cfg.get("json_paths", []):
  229. v = get_by_json_path(payload, p)
  230. if v is not None:
  231. domains.extend(flatten_values(v))
  232. if not domains:
  233. regex_s = parser_cfg.get("regex", r"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
  234. text = json.dumps(payload, ensure_ascii=True)
  235. domains.extend(re.findall(regex_s, text))
  236. clean = []
  237. seen = set()
  238. for d in domains:
  239. d = str(d).strip().lower().rstrip(".")
  240. if (DOMAIN_RE.match(d) or IPV4_RE.match(d)) and d not in seen:
  241. seen.add(d)
  242. clean.append(d)
  243. return clean
  244. def parse_timezone(tz_raw):
  245. if tz_raw is None:
  246. return dt.timezone.utc
  247. s = str(tz_raw).strip().upper()
  248. if s in {"", "UTC", "Z", "+00:00", "+0000"}:
  249. return dt.timezone.utc
  250. m = re.match(r"^([+-])(\d{2}):?(\d{2})$", s)
  251. if not m:
  252. raise ValueError(f"invalid created_time_timezone: {tz_raw}")
  253. sign = 1 if m.group(1) == "+" else -1
  254. hh = int(m.group(2))
  255. mm = int(m.group(3))
  256. if hh > 23 or mm > 59:
  257. raise ValueError(f"invalid created_time_timezone offset: {tz_raw}")
  258. return dt.timezone(sign * dt.timedelta(hours=hh, minutes=mm))
  259. def parse_created_time(value, formats, timezone):
  260. if value is None:
  261. return None
  262. s = str(value).strip()
  263. if not s:
  264. return None
  265. for fmt in formats:
  266. try:
  267. parsed = dt.datetime.strptime(s, fmt)
  268. if parsed.tzinfo is None:
  269. parsed = parsed.replace(tzinfo=timezone)
  270. return parsed.astimezone(dt.timezone.utc)
  271. except Exception:
  272. continue
  273. try:
  274. iso_text = s.replace("Z", "+00:00")
  275. parsed = dt.datetime.fromisoformat(iso_text)
  276. if parsed.tzinfo is None:
  277. parsed = parsed.replace(tzinfo=timezone)
  278. return parsed.astimezone(dt.timezone.utc)
  279. except Exception:
  280. return None
  281. def normalize_domain(value):
  282. if value is None:
  283. return ""
  284. return str(value).strip().lower().rstrip(".")
  285. def to_float_or_none(value):
  286. try:
  287. f = float(value)
  288. if not math.isfinite(f):
  289. return None
  290. return f
  291. except Exception:
  292. return None
  293. def resolve_field(record, field_name, field_map):
  294. path = field_map.get(field_name)
  295. if not path:
  296. raise ValueError(f"field '{field_name}' is not registered in record_mapping.field_map")
  297. if not isinstance(record, dict):
  298. return None
  299. return get_by_json_path(record, path)
  300. def extract_records(payload, record_mapping):
  301. records_path = str(record_mapping.get("records_path", "")).strip()
  302. raw = get_values_by_path(payload, records_path)
  303. return [x for x in raw if isinstance(x, dict)]
  304. def validate_config(cfg):
  305. source_type = get_source_type(cfg)
  306. if source_type not in {"api", "cfst_local"}:
  307. raise ValueError("source.type must be 'api' or 'cfst_local'")
  308. output_cfg = cfg.get("output", {})
  309. if output_cfg and not isinstance(output_cfg, dict):
  310. raise ValueError("output must be an object")
  311. if source_type == "cfst_local":
  312. cfst_cfg = cfg.get("cfst_local")
  313. if not isinstance(cfst_cfg, dict):
  314. raise ValueError("cfst_local is required and must be an object when source.type=cfst_local")
  315. work_dir = str(cfst_cfg.get("work_dir", "")).strip()
  316. if not work_dir:
  317. raise ValueError("cfst_local.work_dir is required")
  318. binary = str(cfst_cfg.get("binary", "")).strip()
  319. if not binary:
  320. raise ValueError("cfst_local.binary is required")
  321. result_file = str(cfst_cfg.get("result_file", "")).strip()
  322. if not result_file:
  323. raise ValueError("cfst_local.result_file is required")
  324. run_args = cfst_cfg.get("run_args", [])
  325. if not isinstance(run_args, list):
  326. raise ValueError("cfst_local.run_args must be an array")
  327. columns_cfg = cfst_cfg.get("columns", {})
  328. if columns_cfg and not isinstance(columns_cfg, dict):
  329. raise ValueError("cfst_local.columns must be an object")
  330. return
  331. record_mapping = cfg.get("record_mapping")
  332. if not isinstance(record_mapping, dict):
  333. raise ValueError("record_mapping is required and must be an object")
  334. records_path = str(record_mapping.get("records_path", "")).strip()
  335. if not records_path:
  336. raise ValueError("record_mapping.records_path is required")
  337. field_map = record_mapping.get("field_map")
  338. if not isinstance(field_map, dict) or not field_map:
  339. raise ValueError("record_mapping.field_map is required and must be a non-empty object")
  340. for key, path in field_map.items():
  341. if not str(key).strip() or not str(path).strip():
  342. raise ValueError("record_mapping.field_map contains empty field name or path")
  343. for required in ["domain", "created_at"]:
  344. if required not in field_map:
  345. raise ValueError(f"record_mapping.field_map.{required} is required")
  346. created_time_formats = record_mapping.get("created_time_formats")
  347. if not isinstance(created_time_formats, list) or not created_time_formats:
  348. raise ValueError("record_mapping.created_time_formats is required and must be a non-empty array")
  349. for fmt in created_time_formats:
  350. if not str(fmt).strip():
  351. raise ValueError("record_mapping.created_time_formats contains empty format")
  352. parse_timezone(record_mapping.get("created_time_timezone", "UTC"))
  353. def ensure_field_registered(field_name, where):
  354. if field_name not in field_map:
  355. raise ValueError(f"{where}: field '{field_name}' is not in record_mapping.field_map")
  356. record_filter = cfg.get("record_filter", {})
  357. if record_filter.get("enabled", False):
  358. rules = record_filter.get("exclude_if_any", [])
  359. if not isinstance(rules, list):
  360. raise ValueError("record_filter.exclude_if_any must be an array")
  361. for i, rule in enumerate(rules):
  362. if not isinstance(rule, dict):
  363. raise ValueError(f"record_filter.exclude_if_any[{i}] must be an object")
  364. field_name = str(rule.get("field", "")).strip()
  365. if not field_name:
  366. raise ValueError(f"record_filter.exclude_if_any[{i}].field is required")
  367. ensure_field_registered(field_name, f"record_filter.exclude_if_any[{i}]")
  368. has_matcher = any(k in rule for k in ["contains", "equals", "regex"])
  369. if not has_matcher:
  370. raise ValueError(f"record_filter.exclude_if_any[{i}] must include one of contains/equals/regex")
  371. scoring = cfg.get("scoring", {})
  372. if scoring.get("enabled", False):
  373. strategy = str(scoring.get("strategy", "")).strip()
  374. if strategy not in {"weighted_average", "lexicographic"}:
  375. raise ValueError("scoring.strategy must be 'weighted_average' or 'lexicographic'")
  376. within_hours = to_float_or_none(scoring.get("within_hours", 0))
  377. if within_hours is None or within_hours < 0:
  378. raise ValueError("scoring.within_hours must be a non-negative number (0 = disabled)")
  379. if strategy == "weighted_average":
  380. weighted_fields = scoring.get("weighted_fields")
  381. if not isinstance(weighted_fields, list) or not weighted_fields:
  382. raise ValueError("scoring.weighted_fields is required for weighted_average strategy")
  383. for i, item in enumerate(weighted_fields):
  384. if not isinstance(item, dict):
  385. raise ValueError(f"scoring.weighted_fields[{i}] must be an object")
  386. field_name = str(item.get("field", "")).strip()
  387. if not field_name:
  388. raise ValueError(f"scoring.weighted_fields[{i}].field is required")
  389. ensure_field_registered(field_name, f"scoring.weighted_fields[{i}]")
  390. weight = to_float_or_none(item.get("weight"))
  391. if weight is None or weight <= 0:
  392. raise ValueError(f"scoring.weighted_fields[{i}].weight must be > 0")
  393. if strategy == "lexicographic":
  394. lex_fields = scoring.get("lexicographic_fields")
  395. if not isinstance(lex_fields, list) or not lex_fields:
  396. raise ValueError("scoring.lexicographic_fields is required for lexicographic strategy")
  397. for i, item in enumerate(lex_fields):
  398. if isinstance(item, str):
  399. field_name = item.strip()
  400. order = ""
  401. elif isinstance(item, dict):
  402. field_name = str(item.get("field", "")).strip()
  403. order = str(item.get("order", "")).strip().lower()
  404. else:
  405. raise ValueError(f"scoring.lexicographic_fields[{i}] must be string or object")
  406. if not field_name:
  407. raise ValueError(f"scoring.lexicographic_fields[{i}] field is required")
  408. ensure_field_registered(field_name, f"scoring.lexicographic_fields[{i}]")
  409. if order and order not in {"asc", "desc"}:
  410. raise ValueError(f"scoring.lexicographic_fields[{i}].order must be asc or desc")
  411. tie_breakers = scoring.get("tie_breakers", [])
  412. if tie_breakers is not None:
  413. if not isinstance(tie_breakers, list):
  414. raise ValueError("scoring.tie_breakers must be an array")
  415. for i, item in enumerate(tie_breakers):
  416. if not isinstance(item, dict):
  417. raise ValueError(f"scoring.tie_breakers[{i}] must be an object")
  418. field_name = str(item.get("field", "")).strip()
  419. order = str(item.get("order", "")).strip().lower()
  420. if not field_name:
  421. raise ValueError(f"scoring.tie_breakers[{i}].field is required")
  422. if order not in {"asc", "desc"}:
  423. raise ValueError(f"scoring.tie_breakers[{i}].order must be asc or desc")
  424. ensure_field_registered(field_name, f"scoring.tie_breakers[{i}]")
  425. def rule_matches(value, rule):
  426. if value is None or not isinstance(rule, dict):
  427. return False
  428. values = flatten_values(value)
  429. if not values:
  430. values = [value]
  431. case_sensitive = bool(rule.get("case_sensitive", False))
  432. if "contains" in rule:
  433. needle = str(rule.get("contains", ""))
  434. if not needle:
  435. return False
  436. for item in values:
  437. hay = str(item)
  438. if case_sensitive:
  439. if needle in hay:
  440. return True
  441. else:
  442. if needle.lower() in hay.lower():
  443. return True
  444. return False
  445. if "equals" in rule:
  446. target = str(rule.get("equals", ""))
  447. for item in values:
  448. item_s = str(item)
  449. if case_sensitive:
  450. if item_s == target:
  451. return True
  452. else:
  453. if item_s.lower() == target.lower():
  454. return True
  455. return False
  456. if "regex" in rule:
  457. pattern = str(rule.get("regex", ""))
  458. if not pattern:
  459. return False
  460. flags = 0 if case_sensitive else re.IGNORECASE
  461. try:
  462. rx = re.compile(pattern, flags)
  463. except Exception:
  464. return False
  465. for item in values:
  466. if rx.search(str(item)):
  467. return True
  468. return False
  469. return False
  470. def collect_excluded_domains(records, field_map, record_filter_cfg):
  471. if not record_filter_cfg.get("enabled", False):
  472. return set()
  473. rules = record_filter_cfg.get("exclude_if_any", [])
  474. if not rules:
  475. return set()
  476. blocked = set()
  477. for record in records:
  478. domain = normalize_domain(resolve_field(record, "domain", field_map))
  479. if not domain:
  480. continue
  481. for rule in rules:
  482. field_name = str(rule.get("field", "")).strip()
  483. if not field_name:
  484. continue
  485. value = resolve_field(record, field_name, field_map)
  486. if rule_matches(value, rule):
  487. blocked.add(domain)
  488. break
  489. return blocked
  490. def build_lexicographic_descriptors(scoring_cfg, prefer_lower):
  491. out = []
  492. for item in scoring_cfg.get("lexicographic_fields", []):
  493. if isinstance(item, str):
  494. field_name = item.strip()
  495. order = "asc" if prefer_lower else "desc"
  496. else:
  497. field_name = str(item.get("field", "")).strip()
  498. order = str(item.get("order", "")).strip().lower()
  499. if not order:
  500. order = "asc" if prefer_lower else "desc"
  501. out.append({"field": field_name, "order": order})
  502. return out
  503. def parse_scored_records(records, field_map, record_mapping_cfg, scoring_cfg):
  504. if not scoring_cfg.get("enabled", False):
  505. return []
  506. strategy = str(scoring_cfg.get("strategy", "weighted_average")).strip()
  507. prefer_lower = bool(scoring_cfg.get("prefer_lower", False))
  508. timezone = parse_timezone(record_mapping_cfg.get("created_time_timezone", "UTC"))
  509. time_formats = [str(x) for x in record_mapping_cfg.get("created_time_formats", [])]
  510. weighted_fields = scoring_cfg.get("weighted_fields", []) if strategy == "weighted_average" else []
  511. lex_descriptors = build_lexicographic_descriptors(scoring_cfg, prefer_lower) if strategy == "lexicographic" else []
  512. needed_fields = set()
  513. for item in weighted_fields:
  514. needed_fields.add(str(item.get("field", "")).strip())
  515. for item in lex_descriptors:
  516. needed_fields.add(str(item.get("field", "")).strip())
  517. for item in scoring_cfg.get("tie_breakers", []):
  518. needed_fields.add(str(item.get("field", "")).strip())
  519. needed_fields.discard("domain")
  520. needed_fields.discard("created_at")
  521. out = []
  522. for record in records:
  523. domain = normalize_domain(resolve_field(record, "domain", field_map))
  524. if not domain:
  525. continue
  526. created_raw = resolve_field(record, "created_at", field_map)
  527. created_at = parse_created_time(created_raw, time_formats, timezone)
  528. field_values = {}
  529. for field_name in needed_fields:
  530. field_values[field_name] = resolve_field(record, field_name, field_map)
  531. score_value = None
  532. scores = []
  533. lex_values = []
  534. if strategy == "weighted_average":
  535. total = 0.0
  536. total_weight = 0.0
  537. missing = False
  538. for item in weighted_fields:
  539. field_name = str(item.get("field", "")).strip()
  540. weight = float(item.get("weight"))
  541. raw_v = resolve_field(record, field_name, field_map)
  542. val = to_float_or_none(raw_v)
  543. scores.append(val)
  544. if val is None:
  545. missing = True
  546. continue
  547. total += val * weight
  548. total_weight += weight
  549. if not missing and total_weight > 0:
  550. score_value = total / total_weight
  551. if strategy == "lexicographic":
  552. for item in lex_descriptors:
  553. field_name = item["field"]
  554. order = item["order"]
  555. raw_v = resolve_field(record, field_name, field_map)
  556. num_v = to_float_or_none(raw_v)
  557. v = num_v if num_v is not None else raw_v
  558. lex_values.append({"field": field_name, "value": v, "order": order})
  559. scores.append(v)
  560. out.append(
  561. {
  562. "domain": domain,
  563. "created_at": created_at,
  564. "created_raw": created_raw,
  565. "scores": scores,
  566. "score_value": score_value,
  567. "lex_values": lex_values,
  568. "field_values": field_values,
  569. "raw_record": record,
  570. }
  571. )
  572. return out
  573. def cmp_scalar(a, b, order):
  574. a_none = a is None
  575. b_none = b is None
  576. if a_none and b_none:
  577. return 0
  578. if a_none:
  579. return 1
  580. if b_none:
  581. return -1
  582. if isinstance(a, dt.datetime):
  583. a = a.timestamp()
  584. if isinstance(b, dt.datetime):
  585. b = b.timestamp()
  586. a_num = to_float_or_none(a)
  587. b_num = to_float_or_none(b)
  588. if a_num is not None and b_num is not None:
  589. if a_num < b_num:
  590. base = -1
  591. elif a_num > b_num:
  592. base = 1
  593. else:
  594. base = 0
  595. else:
  596. a_s = str(a).lower()
  597. b_s = str(b).lower()
  598. if a_s < b_s:
  599. base = -1
  600. elif a_s > b_s:
  601. base = 1
  602. else:
  603. base = 0
  604. return base if order == "asc" else -base
  605. def get_sort_field_value(record, field_name):
  606. if field_name == "domain":
  607. return record.get("domain")
  608. if field_name == "created_at":
  609. return record.get("created_at")
  610. return record.get("field_values", {}).get(field_name)
  611. def rank_scored_records(records, scoring_cfg):
  612. if not records:
  613. return []
  614. within_hours = float(scoring_cfg.get("within_hours", 0))
  615. strategy = str(scoring_cfg.get("strategy", "weighted_average")).strip()
  616. prefer_lower = bool(scoring_cfg.get("prefer_lower", False))
  617. tie_breakers = scoring_cfg.get("tie_breakers", [])
  618. if within_hours > 0:
  619. now = dt.datetime.now(dt.timezone.utc)
  620. cutoff = now - dt.timedelta(hours=within_hours)
  621. recent = [r for r in records if r.get("created_at") is not None and r["created_at"] >= cutoff]
  622. candidates = recent if recent else records
  623. else:
  624. candidates = records
  625. default_lex_order = "asc" if prefer_lower else "desc"
  626. def compare(a, b):
  627. if strategy == "weighted_average":
  628. order = "asc" if prefer_lower else "desc"
  629. c = cmp_scalar(a.get("score_value"), b.get("score_value"), order)
  630. if c != 0:
  631. return c
  632. elif strategy == "lexicographic":
  633. a_lex = a.get("lex_values", [])
  634. b_lex = b.get("lex_values", [])
  635. n = max(len(a_lex), len(b_lex))
  636. for i in range(n):
  637. av = a_lex[i]["value"] if i < len(a_lex) else None
  638. bv = b_lex[i]["value"] if i < len(b_lex) else None
  639. order = default_lex_order
  640. if i < len(a_lex) and a_lex[i].get("order"):
  641. order = a_lex[i]["order"]
  642. c = cmp_scalar(av, bv, order)
  643. if c != 0:
  644. return c
  645. for item in tie_breakers:
  646. field_name = str(item.get("field", "")).strip()
  647. order = str(item.get("order", "asc")).strip().lower()
  648. av = get_sort_field_value(a, field_name)
  649. bv = get_sort_field_value(b, field_name)
  650. c = cmp_scalar(av, bv, order)
  651. if c != 0:
  652. return c
  653. return cmp_scalar(a.get("domain"), b.get("domain"), "asc")
  654. return sorted(candidates, key=functools.cmp_to_key(compare))
  655. def apply_filter(domains, filter_cfg):
  656. include_suffixes = [s.lower() for s in filter_cfg.get("include_suffixes", []) if s]
  657. exclude_regex = [re.compile(x) for x in filter_cfg.get("exclude_regex", []) if x]
  658. out = []
  659. for d in domains:
  660. if include_suffixes and not any(d.endswith(s) for s in include_suffixes):
  661. continue
  662. if any(rx.search(d) for rx in exclude_regex):
  663. continue
  664. out.append(d)
  665. return out
  666. def choose_top_candidate_domains(filtered_domains, top_n, ranked_scored):
  667. if ranked_scored:
  668. domains_by_score = [x["domain"] for x in ranked_scored]
  669. return domains_by_score[:top_n]
  670. return filtered_domains[:top_n]
  671. def text_or_blank(value):
  672. if value is None:
  673. return ""
  674. return str(value).strip()
  675. def set_if_nonempty_text(obj, key, value):
  676. text = text_or_blank(value)
  677. if text:
  678. obj[key] = text
  679. def base_top_candidate(domain, source_type):
  680. candidate = {"domain": domain, "source_type": source_type}
  681. if IPV4_RE.match(domain):
  682. candidate["ip"] = domain
  683. return candidate
  684. def maybe_resolve_field(record, field_name, field_map):
  685. if not isinstance(record, dict):
  686. return None
  687. if field_name not in field_map:
  688. return None
  689. return resolve_field(record, field_name, field_map)
  690. def build_cfst_candidate(row):
  691. domain = row.get("domain", "")
  692. candidate = base_top_candidate(domain=domain, source_type="cfst_local")
  693. set_if_nonempty_text(candidate, "ip", row.get("ip") or domain)
  694. set_if_nonempty_text(candidate, "loss_rate", row.get("loss_rate"))
  695. set_if_nonempty_text(candidate, "avg_latency", row.get("avg_latency"))
  696. set_if_nonempty_text(candidate, "download_speed", row.get("download_speed"))
  697. set_if_nonempty_text(candidate, "region", row.get("region"))
  698. return candidate
  699. def build_api_candidate(domain, record, field_map, scored_record=None):
  700. candidate = base_top_candidate(domain=domain, source_type="api")
  701. if record:
  702. set_if_nonempty_text(candidate, "created_raw", maybe_resolve_field(record, "created_at", field_map))
  703. set_if_nonempty_text(candidate, "avg_latency", maybe_resolve_field(record, "avg_latency", field_map))
  704. set_if_nonempty_text(candidate, "loss_rate", maybe_resolve_field(record, "avg_pkg_lost_rate", field_map))
  705. set_if_nonempty_text(candidate, "download_speed", maybe_resolve_field(record, "download_speed", field_map))
  706. location_country = text_or_blank(maybe_resolve_field(record, "location_country", field_map))
  707. location_city = text_or_blank(maybe_resolve_field(record, "location_city", field_map))
  708. region = "/".join([x for x in [location_country, location_city] if x])
  709. set_if_nonempty_text(candidate, "region", region)
  710. if scored_record:
  711. if scored_record.get("score_value") is not None:
  712. candidate["score_value"] = scored_record.get("score_value")
  713. scores = list(scored_record.get("scores", []))
  714. if scores:
  715. candidate["scores"] = scores
  716. if "created_raw" not in candidate:
  717. set_if_nonempty_text(candidate, "created_raw", scored_record.get("created_raw"))
  718. return candidate
  719. def build_top_candidates(source_type, candidate_domains, cfst_rows=None, records=None, field_map=None, ranked_scored=None):
  720. if source_type == "cfst_local":
  721. row_map = {}
  722. for row in cfst_rows or []:
  723. domain = row.get("domain", "")
  724. if domain and domain not in row_map:
  725. row_map[domain] = row
  726. return [build_cfst_candidate(row_map.get(d, {"domain": d, "ip": d})) for d in candidate_domains]
  727. ranked_map = {}
  728. for item in ranked_scored or []:
  729. domain = item.get("domain", "")
  730. if domain and domain not in ranked_map:
  731. ranked_map[domain] = item
  732. record_map = {}
  733. for record in records or []:
  734. domain = normalize_domain(maybe_resolve_field(record, "domain", field_map or {}))
  735. if domain and domain not in record_map:
  736. record_map[domain] = record
  737. out = []
  738. for domain in candidate_domains:
  739. scored_record = ranked_map.get(domain)
  740. record = None
  741. if scored_record:
  742. record = scored_record.get("raw_record")
  743. if record is None:
  744. record = record_map.get(domain)
  745. out.append(build_api_candidate(domain, record, field_map or {}, scored_record=scored_record))
  746. return out
  747. def run_notify(cmd, domain, status):
  748. if not cmd:
  749. return
  750. env = os.environ.copy()
  751. env["AUTODOMAIN"] = domain
  752. env["AUTODOMAIN_STATUS"] = status
  753. subprocess.run(cmd, shell=True, check=False, env=env)
  754. def choose_domain(filtered_domains, top_n, ranked_scored):
  755. if ranked_scored:
  756. top_domains = choose_top_candidate_domains(filtered_domains, top_n, ranked_scored)
  757. if top_domains:
  758. return top_domains[0], top_domains
  759. if filtered_domains:
  760. return filtered_domains[0], filtered_domains[:top_n]
  761. return None, []
  762. def build_output_settings(output_cfg, config_path_abs):
  763. # H3: legacy keys (current_domain_file/current_domain_json/substore_vars_file)
  764. # were ambiguous and drifted between server/local configs. Refuse to start
  765. # so operators migrate to the unified key set documented in CLAUDE.md.
  766. legacy_key_map = {
  767. "current_domain_file": "selected_value_file",
  768. "current_domain_json": "selected_value_json",
  769. "substore_vars_file": "export_vars_file",
  770. }
  771. legacy_in_use = [k for k in legacy_key_map if k in output_cfg]
  772. if legacy_in_use:
  773. renames = ", ".join(
  774. "{} -> {}".format(k, legacy_key_map[k]) for k in legacy_in_use
  775. )
  776. raise ValueError(
  777. "deprecated output keys in config: {}. Rename them: {}.".format(
  778. ", ".join(legacy_in_use), renames
  779. )
  780. )
  781. runtime_dir_cfg = output_cfg.get("runtime_dir", "./runtime")
  782. runtime_dir = resolve_path(os.path.dirname(config_path_abs), runtime_dir_cfg)
  783. selected_text_name = output_cfg.get("selected_value_file", "current_domain.txt")
  784. selected_json_name = output_cfg.get("selected_value_json", "current_domain.json")
  785. state_name = output_cfg.get("state_file", "state.json")
  786. vars_name = output_cfg.get("export_vars_file", "substore_vars.json")
  787. return {
  788. "runtime_dir": runtime_dir,
  789. "selected_text_path": os.path.join(runtime_dir, selected_text_name),
  790. "selected_json_path": os.path.join(runtime_dir, selected_json_name),
  791. "state_path": os.path.join(runtime_dir, state_name),
  792. "vars_path": os.path.join(runtime_dir, vars_name),
  793. "selected_json_key": str(output_cfg.get("selected_value_json_key", "domain")).strip() or "domain",
  794. "state_last_good_key": str(output_cfg.get("state_last_good_key", "last_good_domain")).strip() or "last_good_domain",
  795. "vars_value_key": str(output_cfg.get("substore_value_key", "AUTO_DOMAIN")).strip() or "AUTO_DOMAIN",
  796. }
  797. def print_output_settings(config_path_abs, cfg):
  798. output_cfg = cfg.get("output", {})
  799. settings = build_output_settings(output_cfg, config_path_abs)
  800. print(json.dumps(settings, ensure_ascii=True))
  801. def main():
  802. ap = argparse.ArgumentParser(description="Auto select preferred endpoint value")
  803. ap.add_argument("--config", default="config.server.json", help="Path to config JSON")
  804. ap.add_argument(
  805. "--print-output-settings",
  806. action="store_true",
  807. help="Print resolved output settings as JSON and exit",
  808. )
  809. args = ap.parse_args()
  810. config_path_abs = os.path.abspath(args.config)
  811. if not os.path.exists(config_path_abs):
  812. print(json.dumps({"status": "error", "error": f"config file not found: {config_path_abs}"}, ensure_ascii=True), file=sys.stderr)
  813. sys.exit(1)
  814. cfg = read_json_file(config_path_abs)
  815. try:
  816. validate_config(cfg)
  817. except Exception as e:
  818. print(json.dumps({"status": "error", "error": f"invalid config: {e}"}, ensure_ascii=True), file=sys.stderr)
  819. sys.exit(1)
  820. if args.print_output_settings:
  821. print_output_settings(config_path_abs, cfg)
  822. return
  823. output_cfg = cfg.get("output", {})
  824. output_settings = build_output_settings(output_cfg, config_path_abs)
  825. notify_cfg = cfg.get("notify", {})
  826. selected_text_file = output_settings["selected_text_path"]
  827. selected_json_file = output_settings["selected_json_path"]
  828. state_file = output_settings["state_path"]
  829. vars_file = output_settings["vars_path"]
  830. selected_json_key = output_settings["selected_json_key"]
  831. state_last_good_key = output_settings["state_last_good_key"]
  832. vars_value_key = output_settings["vars_value_key"]
  833. state = read_json_file(state_file, default={})
  834. last_good = state.get(state_last_good_key, "")
  835. source_type = get_source_type(cfg)
  836. try:
  837. top_n = int(cfg.get("selection", {}).get("top_n", 3))
  838. payload = None
  839. if source_type == "cfst_local":
  840. cfst_rows = load_cfst_rows(cfg, config_path_abs)
  841. parsed = [row["domain"] for row in cfst_rows]
  842. filtered = apply_filter(parsed, cfg.get("domain_filter", {}))
  843. filtered_set = set(filtered)
  844. cfst_rows = [row for row in cfst_rows if row["domain"] in filtered_set]
  845. if not cfst_rows:
  846. raise RuntimeError("No valid IP available from cfst result after filtering")
  847. selected = cfst_rows[0]["domain"]
  848. candidate_domains = [row["domain"] for row in cfst_rows[:top_n]]
  849. top_candidates = build_top_candidates("cfst_local", candidate_domains, cfst_rows=cfst_rows)
  850. else:
  851. payload = fetch_api_json(cfg)
  852. parsed = parse_domains(payload, cfg.get("parser", {}))
  853. filtered = apply_filter(parsed, cfg.get("domain_filter", {}))
  854. record_mapping_cfg = cfg.get("record_mapping", {})
  855. field_map = record_mapping_cfg.get("field_map", {})
  856. records = extract_records(payload, record_mapping_cfg)
  857. record_filter_cfg = cfg.get("record_filter", {})
  858. blocked_domains = collect_excluded_domains(records, field_map, record_filter_cfg)
  859. if blocked_domains:
  860. filtered = [d for d in filtered if d not in blocked_domains]
  861. scoring_cfg = cfg.get("scoring", {})
  862. scored_records = parse_scored_records(records, field_map, record_mapping_cfg, scoring_cfg)
  863. filtered_set = set(filtered)
  864. scored_records = [r for r in scored_records if r["domain"] in filtered_set]
  865. ranked_scored = rank_scored_records(scored_records, scoring_cfg)
  866. selected, candidate_domains = choose_domain(filtered, top_n, ranked_scored)
  867. top_candidates = build_top_candidates(
  868. "api",
  869. candidate_domains,
  870. records=records,
  871. field_map=field_map,
  872. ranked_scored=ranked_scored,
  873. )
  874. status = "ok"
  875. if not selected and last_good:
  876. selected = last_good
  877. status = "fallback_last_good"
  878. if not selected:
  879. if source_type == "cfst_local":
  880. raise RuntimeError("No valid IP available from cfst and no fallback in state")
  881. raise RuntimeError("No valid domain available from API and no fallback in state")
  882. write_text_file(selected_text_file, selected + "\n")
  883. current_json = {
  884. selected_json_key: selected,
  885. "updated_at": utc_now_iso(),
  886. "status": status,
  887. "source_type": source_type,
  888. "source_count": len(parsed),
  889. "top_candidates": top_candidates,
  890. }
  891. write_json_file(selected_json_file, current_json)
  892. write_json_file(
  893. vars_file,
  894. {
  895. vars_value_key: selected,
  896. "UPDATED_AT": current_json["updated_at"],
  897. "STATUS": status,
  898. },
  899. )
  900. new_state = {
  901. "updated_at": current_json["updated_at"],
  902. state_last_good_key: selected,
  903. "status": status,
  904. "source_count": len(parsed),
  905. "source_type": source_type,
  906. }
  907. write_json_file(state_file, new_state)
  908. run_notify(notify_cfg.get("command", ""), selected, status)
  909. print(json.dumps(current_json, ensure_ascii=True))
  910. except Exception as e:
  911. now = utc_now_iso()
  912. err_state = {
  913. "updated_at": now,
  914. "status": "error",
  915. "error": str(e),
  916. state_last_good_key: last_good,
  917. "source_type": source_type,
  918. }
  919. write_json_file(state_file, err_state)
  920. if last_good:
  921. write_text_file(selected_text_file, last_good + "\n")
  922. write_json_file(
  923. selected_json_file,
  924. {
  925. selected_json_key: last_good,
  926. "updated_at": now,
  927. "status": "error_use_last_good",
  928. "error": str(e),
  929. "source_type": source_type,
  930. },
  931. )
  932. write_json_file(
  933. vars_file,
  934. {
  935. vars_value_key: last_good,
  936. "UPDATED_AT": now,
  937. "STATUS": "error_use_last_good",
  938. },
  939. )
  940. run_notify(notify_cfg.get("command", ""), last_good, "error_use_last_good")
  941. print(json.dumps({"status": "error_use_last_good", "error": str(e)}, ensure_ascii=True))
  942. return
  943. print(json.dumps({"status": "error", "error": str(e)}, ensure_ascii=True), file=sys.stderr)
  944. sys.exit(1)
# Run the updater only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()