domain_updater.py

#!/usr/bin/env python3
import argparse
import csv
import datetime as dt
import functools
import json
import math
import os
import re
import socket
import ssl
import subprocess
import sys
import time
import urllib.parse
import urllib.request

DOMAIN_RE = re.compile(r"^(?=.{1,253}$)(?!-)[A-Za-z0-9-]{1,63}(?<!-)(\.(?!-)[A-Za-z0-9-]{1,63}(?<!-))+$")
IPV4_RE = re.compile(r"^(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}$")
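
# domain_updater.py fetches candidate domains or IPs from an HTTP API or a
# local cfst run, filters and optionally scores them, health-checks TLS
# reachability, and writes the chosen value plus state files under
# output.runtime_dir. The config shape is implied by the lookups below; this
# is a minimal sketch for the API source with illustrative values only (the
# right-hand sides of field_map, "host" and "created", are hypothetical):
#
# {
#   "source": {"type": "api"},
#   "api": {"url": "https://example.invalid/endpoints", "method": "GET", "timeout_sec": 10},
#   "record_mapping": {
#     "records_path": "data.items[]",
#     "field_map": {"domain": "host", "created_at": "created"},
#     "created_time_formats": ["%Y-%m-%d %H:%M:%S"],
#     "created_time_timezone": "+08:00"
#   },
#   "domain_filter": {"include_suffixes": [], "exclude_regex": []},
#   "healthcheck": {"enabled": true, "attempts": 5, "timeout_ms": 1800, "port": 443},
#   "selection": {"top_n": 3},
#   "output": {"runtime_dir": "./runtime"},
#   "notify": {"command": ""}
# }
#
# With "source": {"type": "cfst_local"}, a cfst_local block (work_dir, binary,
# result_file, ...) replaces api/record_mapping, as validated further down.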

def utc_now_iso():
    return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def read_json_file(path, default=None):
    if default is None:
        default = {}
    if not os.path.exists(path):
        return default
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def write_json_file(path, data):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=True, indent=2)


def write_text_file(path, data):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(data)

def build_url(base_url, params):
    if not params:
        return base_url
    parsed = urllib.parse.urlparse(base_url)
    current = urllib.parse.parse_qs(parsed.query)
    for k, v in params.items():
        current[k] = [str(v)]
    query = urllib.parse.urlencode(current, doseq=True)
    return urllib.parse.urlunparse(parsed._replace(query=query))


def resolve_path(base_dir, path_value):
    path_text = str(path_value or "").strip()
    if not path_text:
        return ""
    if os.path.isabs(path_text):
        return os.path.normpath(path_text)
    return os.path.normpath(os.path.join(base_dir, path_text))

def get_source_type(cfg):
    source_cfg = cfg.get("source", {})
    if isinstance(source_cfg, dict):
        source_type = str(source_cfg.get("type", "api")).strip().lower()
        if source_type:
            return source_type
    return "api"
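
# fetch_api_json (below) issues a single request with urllib from the standard
# library: query parameters from api.params are merged into the URL, and if
# api.body is present it is sent as a JSON payload with a Content-Type header.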

def fetch_api_json(cfg):
    api = cfg["api"]
    url = build_url(api["url"], api.get("params", {}))
    method = api.get("method", "GET").upper()
    headers = api.get("headers", {})
    timeout = int(api.get("timeout_sec", 10))
    body_obj = api.get("body")
    body = None
    if body_obj is not None:
        body = json.dumps(body_obj).encode("utf-8")
        headers = {**headers, "Content-Type": "application/json"}
    req = urllib.request.Request(url=url, data=body, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read().decode("utf-8", errors="replace")
    return json.loads(raw)
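
# cfst_local source: run a local speed-test binary (cfst_local.binary) inside
# cfst_local.work_dir unless skip_run is set, then parse its CSV result file.
# The configured "ip" column is normalized and reused as the "domain" value so
# the rest of the pipeline can treat IPs and domains uniformly.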

def load_cfst_rows(cfg, config_path_abs):
    cfst_cfg = cfg.get("cfst_local", {})
    config_dir = os.path.dirname(config_path_abs)
    work_dir = resolve_path(config_dir, cfst_cfg.get("work_dir", "./cfst"))
    binary_path = resolve_path(work_dir, cfst_cfg.get("binary", "./cfst"))
    result_file = resolve_path(work_dir, cfst_cfg.get("result_file", "result.csv"))
    encoding = str(cfst_cfg.get("encoding", "utf-8")).strip() or "utf-8"
    skip_run = bool(cfst_cfg.get("skip_run", False))
    timeout_sec = int(cfst_cfg.get("run_timeout_sec", 600))
    run_args = cfst_cfg.get("run_args", ["-o", os.path.basename(result_file)])
    if not isinstance(run_args, list):
        raise ValueError("cfst_local.run_args must be an array")
    command = [binary_path] + [str(x) for x in run_args]
    if not skip_run:
        completed = subprocess.run(
            command,
            cwd=work_dir,
            check=False,
            capture_output=True,
            text=True,
            encoding=encoding,
            errors="replace",
            timeout=timeout_sec,
        )
        if completed.returncode != 0:
            stderr = (completed.stderr or "").strip()
            stdout = (completed.stdout or "").strip()
            details = stderr or stdout or f"exit code {completed.returncode}"
            raise RuntimeError(f"cfst run failed: {details}")
    if not os.path.exists(result_file):
        raise RuntimeError(f"cfst result file not found: {result_file}")
    with open(result_file, "r", encoding=encoding, errors="replace", newline="") as f:
        reader = csv.reader(f)
        rows = [row for row in reader if any(str(col).strip() for col in row)]
    header_rows = int(cfst_cfg.get("header_rows", 1))
    if len(rows) <= header_rows:
        raise RuntimeError("cfst result has no data rows")
    columns_cfg = cfst_cfg.get("columns", {})
    if not isinstance(columns_cfg, dict):
        raise ValueError("cfst_local.columns must be an object")

    def col_index(name, default_index):
        raw = columns_cfg.get(name, default_index)
        try:
            idx = int(raw)
        except Exception as exc:
            raise ValueError(f"cfst_local.columns.{name} must be an integer") from exc
        if idx < 0:
            raise ValueError(f"cfst_local.columns.{name} must be >= 0")
        return idx

    ip_idx = col_index("ip", 0)
    sent_idx = col_index("sent", 1)
    received_idx = col_index("received", 2)
    loss_idx = col_index("loss_rate", 3)
    latency_idx = col_index("avg_latency", 4)
    speed_idx = col_index("download_speed", 5)
    region_idx = col_index("region", 6)
    out = []
    for row in rows[header_rows:]:
        if ip_idx >= len(row):
            continue
        domain = normalize_domain(row[ip_idx])
        if not domain:
            continue
        out.append(
            {
                "domain": domain,
                "ip": domain,
                "sent": row[sent_idx].strip() if sent_idx < len(row) else "",
                "received": row[received_idx].strip() if received_idx < len(row) else "",
                "loss_rate": row[loss_idx].strip() if loss_idx < len(row) else "",
                "avg_latency": row[latency_idx].strip() if latency_idx < len(row) else "",
                "download_speed": row[speed_idx].strip() if speed_idx < len(row) else "",
                "region": row[region_idx].strip() if region_idx < len(row) else "",
            }
        )
    if not out:
        raise RuntimeError("cfst result parsed to zero valid rows")
    return out

def flatten_values(value):
    out = []
    if isinstance(value, str):
        out.append(value)
    elif isinstance(value, list):
        for item in value:
            out.extend(flatten_values(item))
    elif isinstance(value, dict):
        for item in value.values():
            out.extend(flatten_values(item))
    return out


def get_by_json_path(data, path):
    cur = data
    for part in path.split("."):
        if isinstance(cur, dict) and part in cur:
            cur = cur[part]
        else:
            return None
    return cur
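
# get_values_by_path supports a trailing "[]" on a path segment to fan out
# over a list. For example (illustrative data, not from any real API):
#   get_values_by_path({"data": {"items": [{"host": "a"}, {"host": "b"}]}}, "data.items[].host")
#   -> ["a", "b"]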

def get_values_by_path(data, path):
    parts = path.split(".")

    def walk(cur, idx):
        if idx >= len(parts):
            return [cur]
        part = parts[idx]
        if part.endswith("[]"):
            key = part[:-2]
            if isinstance(cur, dict):
                arr = cur.get(key)
            else:
                arr = None
            if not isinstance(arr, list):
                return []
            out = []
            for item in arr:
                out.extend(walk(item, idx + 1))
            return out
        if isinstance(cur, dict) and part in cur:
            return walk(cur[part], idx + 1)
        return []

    return walk(data, 0)

def parse_domains(payload, parser_cfg):
    domains = []
    for p in parser_cfg.get("field_paths", []):
        values = get_values_by_path(payload, p)
        domains.extend(flatten_values(values))
    for p in parser_cfg.get("json_paths", []):
        v = get_by_json_path(payload, p)
        if v is not None:
            domains.extend(flatten_values(v))
    if not domains:
        regex_s = parser_cfg.get("regex", r"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
        text = json.dumps(payload, ensure_ascii=True)
        domains.extend(re.findall(regex_s, text))
    clean = []
    seen = set()
    for d in domains:
        d = str(d).strip().lower().rstrip(".")
        if (DOMAIN_RE.match(d) or IPV4_RE.match(d)) and d not in seen:
            seen.add(d)
            clean.append(d)
    return clean

def parse_timezone(tz_raw):
    if tz_raw is None:
        return dt.timezone.utc
    s = str(tz_raw).strip().upper()
    if s in {"", "UTC", "Z", "+00:00", "+0000"}:
        return dt.timezone.utc
    m = re.match(r"^([+-])(\d{2}):?(\d{2})$", s)
    if not m:
        raise ValueError(f"invalid created_time_timezone: {tz_raw}")
    sign = 1 if m.group(1) == "+" else -1
    hh = int(m.group(2))
    mm = int(m.group(3))
    if hh > 23 or mm > 59:
        raise ValueError(f"invalid created_time_timezone offset: {tz_raw}")
    return dt.timezone(sign * dt.timedelta(hours=hh, minutes=mm))

def parse_created_time(value, formats, timezone):
    if value is None:
        return None
    s = str(value).strip()
    if not s:
        return None
    for fmt in formats:
        try:
            parsed = dt.datetime.strptime(s, fmt)
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone)
            return parsed.astimezone(dt.timezone.utc)
        except Exception:
            continue
    try:
        iso_text = s.replace("Z", "+00:00")
        parsed = dt.datetime.fromisoformat(iso_text)
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone)
        return parsed.astimezone(dt.timezone.utc)
    except Exception:
        return None

def normalize_domain(value):
    if value is None:
        return ""
    return str(value).strip().lower().rstrip(".")


def to_float_or_none(value):
    try:
        f = float(value)
        if not math.isfinite(f):
            return None
        return f
    except Exception:
        return None

def resolve_field(record, field_name, field_map):
    path = field_map.get(field_name)
    if not path:
        raise ValueError(f"field '{field_name}' is not registered in record_mapping.field_map")
    if not isinstance(record, dict):
        return None
    return get_by_json_path(record, path)


def extract_records(payload, record_mapping):
    records_path = str(record_mapping.get("records_path", "")).strip()
    raw = get_values_by_path(payload, records_path)
    return [x for x in raw if isinstance(x, dict)]
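
# validate_config fails fast with a descriptive ValueError before any network
# or subprocess work: it checks the cfst_local block for the local source, and
# record_mapping / record_filter / scoring for the API source, including that
# every referenced field is registered in record_mapping.field_map.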

def validate_config(cfg):
    source_type = get_source_type(cfg)
    if source_type not in {"api", "cfst_local"}:
        raise ValueError("source.type must be 'api' or 'cfst_local'")
    output_cfg = cfg.get("output", {})
    if output_cfg and not isinstance(output_cfg, dict):
        raise ValueError("output must be an object")
    if source_type == "cfst_local":
        cfst_cfg = cfg.get("cfst_local")
        if not isinstance(cfst_cfg, dict):
            raise ValueError("cfst_local is required and must be an object when source.type=cfst_local")
        work_dir = str(cfst_cfg.get("work_dir", "")).strip()
        if not work_dir:
            raise ValueError("cfst_local.work_dir is required")
        binary = str(cfst_cfg.get("binary", "")).strip()
        if not binary:
            raise ValueError("cfst_local.binary is required")
        result_file = str(cfst_cfg.get("result_file", "")).strip()
        if not result_file:
            raise ValueError("cfst_local.result_file is required")
        run_args = cfst_cfg.get("run_args", [])
        if not isinstance(run_args, list):
            raise ValueError("cfst_local.run_args must be an array")
        columns_cfg = cfst_cfg.get("columns", {})
        if columns_cfg and not isinstance(columns_cfg, dict):
            raise ValueError("cfst_local.columns must be an object")
        return
    record_mapping = cfg.get("record_mapping")
    if not isinstance(record_mapping, dict):
        raise ValueError("record_mapping is required and must be an object")
    records_path = str(record_mapping.get("records_path", "")).strip()
    if not records_path:
        raise ValueError("record_mapping.records_path is required")
    field_map = record_mapping.get("field_map")
    if not isinstance(field_map, dict) or not field_map:
        raise ValueError("record_mapping.field_map is required and must be a non-empty object")
    for key, path in field_map.items():
        if not str(key).strip() or not str(path).strip():
            raise ValueError("record_mapping.field_map contains empty field name or path")
    for required in ["domain", "created_at"]:
        if required not in field_map:
            raise ValueError(f"record_mapping.field_map.{required} is required")
    created_time_formats = record_mapping.get("created_time_formats")
    if not isinstance(created_time_formats, list) or not created_time_formats:
        raise ValueError("record_mapping.created_time_formats is required and must be a non-empty array")
    for fmt in created_time_formats:
        if not str(fmt).strip():
            raise ValueError("record_mapping.created_time_formats contains empty format")
    parse_timezone(record_mapping.get("created_time_timezone", "UTC"))

    def ensure_field_registered(field_name, where):
        if field_name not in field_map:
            raise ValueError(f"{where}: field '{field_name}' is not in record_mapping.field_map")

    record_filter = cfg.get("record_filter", {})
    if record_filter.get("enabled", False):
        rules = record_filter.get("exclude_if_any", [])
        if not isinstance(rules, list):
            raise ValueError("record_filter.exclude_if_any must be an array")
        for i, rule in enumerate(rules):
            if not isinstance(rule, dict):
                raise ValueError(f"record_filter.exclude_if_any[{i}] must be an object")
            field_name = str(rule.get("field", "")).strip()
            if not field_name:
                raise ValueError(f"record_filter.exclude_if_any[{i}].field is required")
            ensure_field_registered(field_name, f"record_filter.exclude_if_any[{i}]")
            has_matcher = any(k in rule for k in ["contains", "equals", "regex"])
            if not has_matcher:
                raise ValueError(f"record_filter.exclude_if_any[{i}] must include one of contains/equals/regex")
    scoring = cfg.get("scoring", {})
    if scoring.get("enabled", False):
        strategy = str(scoring.get("strategy", "")).strip()
        if strategy not in {"weighted_average", "lexicographic"}:
            raise ValueError("scoring.strategy must be 'weighted_average' or 'lexicographic'")
        within_hours = to_float_or_none(scoring.get("within_hours", 24))
        if within_hours is None or within_hours <= 0:
            raise ValueError("scoring.within_hours must be a positive number")
        if strategy == "weighted_average":
            weighted_fields = scoring.get("weighted_fields")
            if not isinstance(weighted_fields, list) or not weighted_fields:
                raise ValueError("scoring.weighted_fields is required for weighted_average strategy")
            for i, item in enumerate(weighted_fields):
                if not isinstance(item, dict):
                    raise ValueError(f"scoring.weighted_fields[{i}] must be an object")
                field_name = str(item.get("field", "")).strip()
                if not field_name:
                    raise ValueError(f"scoring.weighted_fields[{i}].field is required")
                ensure_field_registered(field_name, f"scoring.weighted_fields[{i}]")
                weight = to_float_or_none(item.get("weight"))
                if weight is None or weight <= 0:
                    raise ValueError(f"scoring.weighted_fields[{i}].weight must be > 0")
        if strategy == "lexicographic":
            lex_fields = scoring.get("lexicographic_fields")
            if not isinstance(lex_fields, list) or not lex_fields:
                raise ValueError("scoring.lexicographic_fields is required for lexicographic strategy")
            for i, item in enumerate(lex_fields):
                if isinstance(item, str):
                    field_name = item.strip()
                    order = ""
                elif isinstance(item, dict):
                    field_name = str(item.get("field", "")).strip()
                    order = str(item.get("order", "")).strip().lower()
                else:
                    raise ValueError(f"scoring.lexicographic_fields[{i}] must be string or object")
                if not field_name:
                    raise ValueError(f"scoring.lexicographic_fields[{i}] field is required")
                ensure_field_registered(field_name, f"scoring.lexicographic_fields[{i}]")
                if order and order not in {"asc", "desc"}:
                    raise ValueError(f"scoring.lexicographic_fields[{i}].order must be asc or desc")
        tie_breakers = scoring.get("tie_breakers", [])
        if tie_breakers is not None:
            if not isinstance(tie_breakers, list):
                raise ValueError("scoring.tie_breakers must be an array")
            for i, item in enumerate(tie_breakers):
                if not isinstance(item, dict):
                    raise ValueError(f"scoring.tie_breakers[{i}] must be an object")
                field_name = str(item.get("field", "")).strip()
                order = str(item.get("order", "")).strip().lower()
                if not field_name:
                    raise ValueError(f"scoring.tie_breakers[{i}].field is required")
                if order not in {"asc", "desc"}:
                    raise ValueError(f"scoring.tie_breakers[{i}].order must be asc or desc")
                ensure_field_registered(field_name, f"scoring.tie_breakers[{i}]")

def rule_matches(value, rule):
    if value is None or not isinstance(rule, dict):
        return False
    values = flatten_values(value)
    if not values:
        values = [value]
    case_sensitive = bool(rule.get("case_sensitive", False))
    if "contains" in rule:
        needle = str(rule.get("contains", ""))
        if not needle:
            return False
        for item in values:
            hay = str(item)
            if case_sensitive:
                if needle in hay:
                    return True
            else:
                if needle.lower() in hay.lower():
                    return True
        return False
    if "equals" in rule:
        target = str(rule.get("equals", ""))
        for item in values:
            item_s = str(item)
            if case_sensitive:
                if item_s == target:
                    return True
            else:
                if item_s.lower() == target.lower():
                    return True
        return False
    if "regex" in rule:
        pattern = str(rule.get("regex", ""))
        if not pattern:
            return False
        flags = 0 if case_sensitive else re.IGNORECASE
        try:
            rx = re.compile(pattern, flags)
        except Exception:
            return False
        for item in values:
            if rx.search(str(item)):
                return True
        return False
    return False

def collect_excluded_domains(records, field_map, record_filter_cfg):
    if not record_filter_cfg.get("enabled", False):
        return set()
    rules = record_filter_cfg.get("exclude_if_any", [])
    if not rules:
        return set()
    blocked = set()
    for record in records:
        domain = normalize_domain(resolve_field(record, "domain", field_map))
        if not domain:
            continue
        for rule in rules:
            field_name = str(rule.get("field", "")).strip()
            if not field_name:
                continue
            value = resolve_field(record, field_name, field_map)
            if rule_matches(value, rule):
                blocked.add(domain)
                break
    return blocked

def build_lexicographic_descriptors(scoring_cfg, prefer_lower):
    out = []
    for item in scoring_cfg.get("lexicographic_fields", []):
        if isinstance(item, str):
            field_name = item.strip()
            order = "asc" if prefer_lower else "desc"
        else:
            field_name = str(item.get("field", "")).strip()
            order = str(item.get("order", "")).strip().lower()
            if not order:
                order = "asc" if prefer_lower else "desc"
        out.append({"field": field_name, "order": order})
    return out
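
# Scoring strategies, as read from the config keys below:
# - weighted_average: score = sum(value * weight) / sum(weight) over
#   scoring.weighted_fields; a record with any missing field gets no score.
# - lexicographic: compare scoring.lexicographic_fields in order, each with an
#   asc/desc direction (defaulting from scoring.prefer_lower).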

def parse_scored_records(records, field_map, record_mapping_cfg, scoring_cfg):
    if not scoring_cfg.get("enabled", False):
        return []
    strategy = str(scoring_cfg.get("strategy", "weighted_average")).strip()
    prefer_lower = bool(scoring_cfg.get("prefer_lower", False))
    timezone = parse_timezone(record_mapping_cfg.get("created_time_timezone", "UTC"))
    time_formats = [str(x) for x in record_mapping_cfg.get("created_time_formats", [])]
    weighted_fields = scoring_cfg.get("weighted_fields", []) if strategy == "weighted_average" else []
    lex_descriptors = build_lexicographic_descriptors(scoring_cfg, prefer_lower) if strategy == "lexicographic" else []
    needed_fields = set()
    for item in weighted_fields:
        needed_fields.add(str(item.get("field", "")).strip())
    for item in lex_descriptors:
        needed_fields.add(str(item.get("field", "")).strip())
    # An explicit null tie_breakers entry in the config is treated as "no tie breakers".
    for item in scoring_cfg.get("tie_breakers") or []:
        needed_fields.add(str(item.get("field", "")).strip())
    needed_fields.discard("domain")
    needed_fields.discard("created_at")
    out = []
    for record in records:
        domain = normalize_domain(resolve_field(record, "domain", field_map))
        if not domain:
            continue
        created_raw = resolve_field(record, "created_at", field_map)
        created_at = parse_created_time(created_raw, time_formats, timezone)
        field_values = {}
        for field_name in needed_fields:
            field_values[field_name] = resolve_field(record, field_name, field_map)
        score_value = None
        scores = []
        lex_values = []
        if strategy == "weighted_average":
            total = 0.0
            total_weight = 0.0
            missing = False
            for item in weighted_fields:
                field_name = str(item.get("field", "")).strip()
                weight = float(item.get("weight"))
                raw_v = resolve_field(record, field_name, field_map)
                val = to_float_or_none(raw_v)
                scores.append(val)
                if val is None:
                    missing = True
                    continue
                total += val * weight
                total_weight += weight
            if not missing and total_weight > 0:
                score_value = total / total_weight
        if strategy == "lexicographic":
            for item in lex_descriptors:
                field_name = item["field"]
                order = item["order"]
                raw_v = resolve_field(record, field_name, field_map)
                num_v = to_float_or_none(raw_v)
                v = num_v if num_v is not None else raw_v
                lex_values.append({"field": field_name, "value": v, "order": order})
                scores.append(v)
        out.append(
            {
                "domain": domain,
                "created_at": created_at,
                "created_raw": created_raw,
                "scores": scores,
                "score_value": score_value,
                "lex_values": lex_values,
                "field_values": field_values,
            }
        )
    return out
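
# Comparator convention used by the ranking below: None always sorts last,
# datetimes are compared as timestamps, and other values are compared
# numerically when both parse as floats, otherwise case-insensitively as
# strings; "desc" simply flips the sign.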

def cmp_scalar(a, b, order):
    a_none = a is None
    b_none = b is None
    if a_none and b_none:
        return 0
    if a_none:
        return 1
    if b_none:
        return -1
    if isinstance(a, dt.datetime):
        a = a.timestamp()
    if isinstance(b, dt.datetime):
        b = b.timestamp()
    a_num = to_float_or_none(a)
    b_num = to_float_or_none(b)
    if a_num is not None and b_num is not None:
        if a_num < b_num:
            base = -1
        elif a_num > b_num:
            base = 1
        else:
            base = 0
    else:
        a_s = str(a).lower()
        b_s = str(b).lower()
        if a_s < b_s:
            base = -1
        elif a_s > b_s:
            base = 1
        else:
            base = 0
    return base if order == "asc" else -base

def get_sort_field_value(record, field_name):
    if field_name == "domain":
        return record.get("domain")
    if field_name == "created_at":
        return record.get("created_at")
    return record.get("field_values", {}).get(field_name)

def rank_scored_records(records, scoring_cfg):
    if not records:
        return []
    within_hours = float(scoring_cfg.get("within_hours", 24))
    strategy = str(scoring_cfg.get("strategy", "weighted_average")).strip()
    prefer_lower = bool(scoring_cfg.get("prefer_lower", False))
    # Guard against an explicit null tie_breakers entry in the config.
    tie_breakers = scoring_cfg.get("tie_breakers") or []
    now = dt.datetime.now(dt.timezone.utc)
    cutoff = now - dt.timedelta(hours=within_hours)
    recent = [r for r in records if r.get("created_at") is not None and r["created_at"] >= cutoff]
    candidates = recent if recent else records
    default_lex_order = "asc" if prefer_lower else "desc"

    def compare(a, b):
        if strategy == "weighted_average":
            order = "asc" if prefer_lower else "desc"
            c = cmp_scalar(a.get("score_value"), b.get("score_value"), order)
            if c != 0:
                return c
        elif strategy == "lexicographic":
            a_lex = a.get("lex_values", [])
            b_lex = b.get("lex_values", [])
            n = max(len(a_lex), len(b_lex))
            for i in range(n):
                av = a_lex[i]["value"] if i < len(a_lex) else None
                bv = b_lex[i]["value"] if i < len(b_lex) else None
                order = default_lex_order
                if i < len(a_lex) and a_lex[i].get("order"):
                    order = a_lex[i]["order"]
                c = cmp_scalar(av, bv, order)
                if c != 0:
                    return c
        for item in tie_breakers:
            field_name = str(item.get("field", "")).strip()
            order = str(item.get("order", "asc")).strip().lower()
            av = get_sort_field_value(a, field_name)
            bv = get_sort_field_value(b, field_name)
            c = cmp_scalar(av, bv, order)
            if c != 0:
                return c
        return cmp_scalar(a.get("domain"), b.get("domain"), "asc")

    return sorted(candidates, key=functools.cmp_to_key(compare))

def apply_filter(domains, filter_cfg):
    include_suffixes = [s.lower() for s in filter_cfg.get("include_suffixes", []) if s]
    exclude_regex = [re.compile(x) for x in filter_cfg.get("exclude_regex", []) if x]
    out = []
    for d in domains:
        if include_suffixes and not any(d.endswith(s) for s in include_suffixes):
            continue
        if any(rx.search(d) for rx in exclude_regex):
            continue
        out.append(d)
    return out
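
# A health "check" here is a DNS resolution, a TCP connect, and a TLS
# handshake against the configured port, timed in milliseconds. It never
# sends an HTTP request, so it measures reachability rather than
# application-level health.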

def single_tls_check(domain, timeout_ms, port, tls_verify=True):
    start = time.perf_counter()
    timeout_sec = max(0.2, timeout_ms / 1000.0)
    try:
        infos = socket.getaddrinfo(domain, port, proto=socket.IPPROTO_TCP)
        if not infos:
            return False, None, "dns_empty"
        af, socktype, proto, _, sockaddr = infos[0]
        with socket.socket(af, socktype, proto) as sock:
            sock.settimeout(timeout_sec)
            sock.connect(sockaddr)
            if tls_verify:
                ctx = ssl.create_default_context()
            else:
                # Build an unverified context through the public API instead of
                # the private ssl._create_unverified_context() helper.
                ctx = ssl.create_default_context()
                ctx.check_hostname = False
                ctx.verify_mode = ssl.CERT_NONE
            with ctx.wrap_socket(sock, server_hostname=domain) as ssock:
                ssock.do_handshake()
                elapsed = int((time.perf_counter() - start) * 1000)
        return True, elapsed, "ok"
    except Exception as e:
        return False, None, str(e)

def check_domains(domains, hc_cfg):
    attempts = int(hc_cfg.get("attempts", 5))
    timeout_ms = int(hc_cfg.get("timeout_ms", 1800))
    port = int(hc_cfg.get("port", 443))
    tls_verify = bool(hc_cfg.get("tls_verify", True))
    results = []
    for d in domains:
        ok_count = 0
        latencies = []
        errors = []
        for _ in range(attempts):
            ok, latency, err = single_tls_check(d, timeout_ms, port, tls_verify=tls_verify)
            if ok:
                ok_count += 1
                latencies.append(latency)
            else:
                errors.append(err)
        success_ratio = ok_count / attempts if attempts else 0.0
        avg_latency = int(sum(latencies) / len(latencies)) if latencies else 999999
        results.append(
            {
                "domain": d,
                "success_ratio": success_ratio,
                "avg_latency_ms": avg_latency,
                "ok_count": ok_count,
                "attempts": attempts,
                "errors": errors[:3],
            }
        )
    results.sort(key=lambda x: (-x["success_ratio"], x["avg_latency_ms"], x["domain"]))
    return results

def run_notify(cmd, domain, status):
    if not cmd:
        return
    env = os.environ.copy()
    env["AUTODOMAIN"] = domain
    env["AUTODOMAIN_STATUS"] = status
    subprocess.run(cmd, shell=True, check=False, env=env)
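
# Selection priority in choose_domain: score-ranked domains that also passed
# the health check, then score-ranked domains alone, then any healthy check
# result, then the first filtered domain; returning None means main() falls
# back to the last known-good value from state.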

def choose_domain(filtered_domains, check_results, top_n, ranked_scored):
    if ranked_scored:
        domains_by_score = [x["domain"] for x in ranked_scored]
        if check_results:
            check_map = {x["domain"]: x for x in check_results}
            top = []
            for d in domains_by_score:
                if d in check_map and check_map[d]["success_ratio"] > 0:
                    top.append(check_map[d])
                    if len(top) >= top_n:
                        break
            if top:
                return top[0]["domain"], top
            score_only = [
                {
                    "domain": x["domain"],
                    "score_value": x.get("score_value"),
                    "scores": x.get("scores", []),
                    "created_raw": x.get("created_raw"),
                }
                for x in ranked_scored[:top_n]
            ]
            return score_only[0]["domain"], score_only
        top_scored = [
            {
                "domain": x["domain"],
                "score_value": x.get("score_value"),
                "scores": x.get("scores", []),
                "created_raw": x.get("created_raw"),
            }
            for x in ranked_scored[:top_n]
        ]
        if top_scored:
            return top_scored[0]["domain"], top_scored
    if check_results:
        top = [x for x in check_results if x["success_ratio"] > 0][:top_n]
        if top:
            return top[0]["domain"], top
        return None, check_results[:top_n]
    if filtered_domains:
        return filtered_domains[0], [{"domain": x} for x in filtered_domains[:top_n]]
    return None, []
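
# Runtime artifacts written per run (names are configurable under output.*,
# shown with their defaults): current_domain.txt with the bare value,
# current_domain.json with status and top candidates, substore_vars.json with
# the value under AUTO_DOMAIN, and state.json holding last_good_domain for
# fallback. Legacy key names (current_domain_file, current_domain_json,
# substore_vars_file) are still honored.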

def build_output_settings(output_cfg, config_path_abs):
    runtime_dir_cfg = output_cfg.get("runtime_dir", "./runtime")
    runtime_dir = resolve_path(os.path.dirname(config_path_abs), runtime_dir_cfg)
    selected_text_name = output_cfg.get("selected_value_file", output_cfg.get("current_domain_file", "current_domain.txt"))
    selected_json_name = output_cfg.get("selected_value_json", output_cfg.get("current_domain_json", "current_domain.json"))
    state_name = output_cfg.get("state_file", "state.json")
    vars_name = output_cfg.get("export_vars_file", output_cfg.get("substore_vars_file", "substore_vars.json"))
    return {
        "runtime_dir": runtime_dir,
        "selected_text_path": os.path.join(runtime_dir, selected_text_name),
        "selected_json_path": os.path.join(runtime_dir, selected_json_name),
        "state_path": os.path.join(runtime_dir, state_name),
        "vars_path": os.path.join(runtime_dir, vars_name),
        "selected_json_key": str(output_cfg.get("selected_value_json_key", "domain")).strip() or "domain",
        "state_last_good_key": str(output_cfg.get("state_last_good_key", "last_good_domain")).strip() or "last_good_domain",
        "vars_value_key": str(output_cfg.get("substore_value_key", "AUTO_DOMAIN")).strip() or "AUTO_DOMAIN",
    }

def print_output_settings(config_path_abs, cfg):
    output_cfg = cfg.get("output", {})
    settings = build_output_settings(output_cfg, config_path_abs)
    print(json.dumps(settings, ensure_ascii=True))
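
# Typical invocations (paths are illustrative):
#   python3 domain_updater.py --config config.server.json
#   python3 domain_updater.py --config config.server.json --print-output-settings
# The first prints the selected-value JSON on success; the second prints the
# resolved output paths and exits without contacting any source.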

def main():
    ap = argparse.ArgumentParser(description="Auto select preferred endpoint value")
    ap.add_argument("--config", default="config.server.json", help="Path to config JSON")
    ap.add_argument(
        "--print-output-settings",
        action="store_true",
        help="Print resolved output settings as JSON and exit",
    )
    args = ap.parse_args()
    config_path_abs = os.path.abspath(args.config)
    if not os.path.exists(config_path_abs):
        print(json.dumps({"status": "error", "error": f"config file not found: {config_path_abs}"}, ensure_ascii=True), file=sys.stderr)
        sys.exit(1)
    cfg = read_json_file(config_path_abs)
    try:
        validate_config(cfg)
    except Exception as e:
        print(json.dumps({"status": "error", "error": f"invalid config: {e}"}, ensure_ascii=True), file=sys.stderr)
        sys.exit(1)
    if args.print_output_settings:
        print_output_settings(config_path_abs, cfg)
        return
    output_cfg = cfg.get("output", {})
    output_settings = build_output_settings(output_cfg, config_path_abs)
    notify_cfg = cfg.get("notify", {})
    selected_text_file = output_settings["selected_text_path"]
    selected_json_file = output_settings["selected_json_path"]
    state_file = output_settings["state_path"]
    vars_file = output_settings["vars_path"]
    selected_json_key = output_settings["selected_json_key"]
    state_last_good_key = output_settings["state_last_good_key"]
    vars_value_key = output_settings["vars_value_key"]
    state = read_json_file(state_file, default={})
    last_good = state.get(state_last_good_key, "")
    source_type = get_source_type(cfg)
    try:
        top_n = int(cfg.get("selection", {}).get("top_n", 3))
        check_results = []
        payload = None
        if source_type == "cfst_local":
            cfst_rows = load_cfst_rows(cfg, config_path_abs)
            parsed = [row["domain"] for row in cfst_rows]
            filtered = apply_filter(parsed, cfg.get("domain_filter", {}))
            filtered_set = set(filtered)
            cfst_rows = [row for row in cfst_rows if row["domain"] in filtered_set]
            if not cfst_rows:
                raise RuntimeError("No valid IP available from cfst result after filtering")
            if cfg.get("healthcheck", {}).get("enabled", False):
                check_results = check_domains(filtered, cfg.get("healthcheck", {}))
                selected, _ = choose_domain(filtered, check_results, top_n, [])
                top_candidates = cfst_rows[:top_n]
            else:
                selected = cfst_rows[0]["domain"]
                top_candidates = cfst_rows[:top_n]
        else:
            payload = fetch_api_json(cfg)
            parsed = parse_domains(payload, cfg.get("parser", {}))
            filtered = apply_filter(parsed, cfg.get("domain_filter", {}))
            record_mapping_cfg = cfg.get("record_mapping", {})
            field_map = record_mapping_cfg.get("field_map", {})
            records = extract_records(payload, record_mapping_cfg)
            record_filter_cfg = cfg.get("record_filter", {})
            blocked_domains = collect_excluded_domains(records, field_map, record_filter_cfg)
            if blocked_domains:
                filtered = [d for d in filtered if d not in blocked_domains]
            scoring_cfg = cfg.get("scoring", {})
            scored_records = parse_scored_records(records, field_map, record_mapping_cfg, scoring_cfg)
            filtered_set = set(filtered)
            scored_records = [r for r in scored_records if r["domain"] in filtered_set]
            ranked_scored = rank_scored_records(scored_records, scoring_cfg)
            if cfg.get("healthcheck", {}).get("enabled", True):
                check_results = check_domains(filtered, cfg.get("healthcheck", {}))
            selected, top_candidates = choose_domain(filtered, check_results, top_n, ranked_scored)
        status = "ok"
        if not selected and last_good:
            selected = last_good
            status = "fallback_last_good"
        if not selected:
            if source_type == "cfst_local":
                raise RuntimeError("No valid IP available from cfst and no fallback in state")
            raise RuntimeError("No valid domain available from API and no fallback in state")
        write_text_file(selected_text_file, selected + "\n")
        current_json = {
            selected_json_key: selected,
            "updated_at": utc_now_iso(),
            "status": status,
            "source_type": source_type,
            "source_count": len(parsed),
            "checked_count": len(check_results),
            "top_candidates": top_candidates,
        }
        write_json_file(selected_json_file, current_json)
        write_json_file(
            vars_file,
            {
                vars_value_key: selected,
                "UPDATED_AT": current_json["updated_at"],
                "STATUS": status,
            },
        )
        new_state = {
            "updated_at": current_json["updated_at"],
            state_last_good_key: selected,
            "status": status,
            "source_count": len(parsed),
            "checked_count": len(check_results),
            "source_type": source_type,
        }
        write_json_file(state_file, new_state)
        run_notify(notify_cfg.get("command", ""), selected, status)
        print(json.dumps(current_json, ensure_ascii=True))
    except Exception as e:
        now = utc_now_iso()
        err_state = {
            "updated_at": now,
            "status": "error",
            "error": str(e),
            state_last_good_key: last_good,
            "source_type": source_type,
        }
        write_json_file(state_file, err_state)
        if last_good:
            write_text_file(selected_text_file, last_good + "\n")
            write_json_file(
                selected_json_file,
                {
                    selected_json_key: last_good,
                    "updated_at": now,
                    "status": "error_use_last_good",
                    "error": str(e),
                    "source_type": source_type,
                },
            )
            run_notify(notify_cfg.get("command", ""), last_good, "error_use_last_good")
            print(json.dumps({"status": "error_use_last_good", "error": str(e)}, ensure_ascii=True))
            return
        print(json.dumps({"status": "error", "error": str(e)}, ensure_ascii=True), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()