domain_updater.py 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105
  1. #!/usr/bin/env python3
  2. import argparse
  3. import csv
  4. import datetime as dt
  5. import functools
  6. import json
  7. import math
  8. import os
  9. import re
  10. import subprocess
  11. import sys
  12. import urllib.parse
  13. import urllib.request
  14. DOMAIN_RE = re.compile(r"^(?=.{1,253}$)(?!-)[A-Za-z0-9-]{1,63}(?<!-)(\.(?!-)[A-Za-z0-9-]{1,63}(?<!-))+$")
  15. IPV4_RE = re.compile(r"^(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}$")
  16. def utc_now_iso():
  17. return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
  18. def read_json_file(path, default=None):
  19. if default is None:
  20. default = {}
  21. if not os.path.exists(path):
  22. return default
  23. try:
  24. with open(path, "r", encoding="utf-8") as f:
  25. return json.load(f)
  26. except (ValueError, json.JSONDecodeError):
  27. return default
  28. def write_json_file(path, data):
  29. parent = os.path.dirname(path)
  30. if parent:
  31. os.makedirs(parent, exist_ok=True)
  32. with open(path, "w", encoding="utf-8") as f:
  33. json.dump(data, f, ensure_ascii=True, indent=2)
  34. def write_text_file(path, data):
  35. parent = os.path.dirname(path)
  36. if parent:
  37. os.makedirs(parent, exist_ok=True)
  38. with open(path, "w", encoding="utf-8") as f:
  39. f.write(data)
  40. def build_url(base_url, params):
  41. if not params:
  42. return base_url
  43. parsed = urllib.parse.urlparse(base_url)
  44. current = urllib.parse.parse_qs(parsed.query)
  45. for k, v in params.items():
  46. current[k] = [str(v)]
  47. query = urllib.parse.urlencode(current, doseq=True)
  48. return urllib.parse.urlunparse(parsed._replace(query=query))
  49. def resolve_path(base_dir, path_value):
  50. path_text = str(path_value or "").strip()
  51. if not path_text:
  52. return ""
  53. if os.path.isabs(path_text):
  54. return os.path.normpath(path_text)
  55. return os.path.normpath(os.path.join(base_dir, path_text))
  56. def get_source_type(cfg):
  57. source_cfg = cfg.get("source", {})
  58. if isinstance(source_cfg, dict):
  59. source_type = str(source_cfg.get("type", "api")).strip().lower()
  60. if source_type:
  61. return source_type
  62. return "api"
  63. def fetch_api_json(cfg):
  64. api = cfg["api"]
  65. url = build_url(api["url"], api.get("params", {}))
  66. method = api.get("method", "GET").upper()
  67. headers = api.get("headers", {})
  68. timeout = int(api.get("timeout_sec", 10))
  69. body_obj = api.get("body")
  70. body = None
  71. if body_obj is not None:
  72. body = json.dumps(body_obj).encode("utf-8")
  73. headers = {**headers, "Content-Type": "application/json"}
  74. req = urllib.request.Request(url=url, data=body, headers=headers, method=method)
  75. with urllib.request.urlopen(req, timeout=timeout) as resp:
  76. raw = resp.read().decode("utf-8", errors="replace")
  77. return json.loads(raw)
  78. def load_cfst_rows(cfg, config_path_abs):
  79. cfst_cfg = cfg.get("cfst_local", {})
  80. config_dir = os.path.dirname(config_path_abs)
  81. work_dir = resolve_path(config_dir, cfst_cfg.get("work_dir", "./cfst"))
  82. binary_path = resolve_path(work_dir, cfst_cfg.get("binary", "./cfst"))
  83. result_file = resolve_path(work_dir, cfst_cfg.get("result_file", "result.csv"))
  84. encoding = str(cfst_cfg.get("encoding", "utf-8")).strip() or "utf-8"
  85. skip_run = bool(cfst_cfg.get("skip_run", False))
  86. timeout_sec = int(cfst_cfg.get("run_timeout_sec", 600))
  87. run_args = cfst_cfg.get("run_args", ["-o", os.path.basename(result_file)])
  88. if not isinstance(run_args, list):
  89. raise ValueError("cfst_local.run_args must be an array")
  90. command = [binary_path] + [str(x) for x in run_args]
  91. if not skip_run:
  92. completed = subprocess.run(
  93. command,
  94. cwd=work_dir,
  95. check=False,
  96. capture_output=True,
  97. text=True,
  98. encoding=encoding,
  99. errors="replace",
  100. timeout=timeout_sec,
  101. )
  102. if completed.returncode != 0:
  103. stderr = (completed.stderr or "").strip()
  104. stdout = (completed.stdout or "").strip()
  105. details = stderr or stdout or f"exit code {completed.returncode}"
  106. raise RuntimeError(f"cfst run failed: {details}")
  107. if not os.path.exists(result_file):
  108. raise RuntimeError(f"cfst result file not found: {result_file}")
  109. with open(result_file, "r", encoding=encoding, errors="replace", newline="") as f:
  110. reader = csv.reader(f)
  111. rows = [row for row in reader if any(str(col).strip() for col in row)]
  112. header_rows = int(cfst_cfg.get("header_rows", 1))
  113. if len(rows) <= header_rows:
  114. raise RuntimeError("cfst result has no data rows")
  115. columns_cfg = cfst_cfg.get("columns", {})
  116. if not isinstance(columns_cfg, dict):
  117. raise ValueError("cfst_local.columns must be an object")
  118. def col_index(name, default_index):
  119. raw = columns_cfg.get(name, default_index)
  120. try:
  121. idx = int(raw)
  122. except Exception as exc:
  123. raise ValueError(f"cfst_local.columns.{name} must be an integer") from exc
  124. if idx < 0:
  125. raise ValueError(f"cfst_local.columns.{name} must be >= 0")
  126. return idx
  127. ip_idx = col_index("ip", 0)
  128. sent_idx = col_index("sent", 1)
  129. received_idx = col_index("received", 2)
  130. loss_idx = col_index("loss_rate", 3)
  131. latency_idx = col_index("avg_latency", 4)
  132. speed_idx = col_index("download_speed", 5)
  133. region_idx = col_index("region", 6)
  134. out = []
  135. for row in rows[header_rows:]:
  136. if ip_idx >= len(row):
  137. continue
  138. domain = normalize_domain(row[ip_idx])
  139. if not domain:
  140. continue
  141. out.append(
  142. {
  143. "domain": domain,
  144. "ip": domain,
  145. "sent": row[sent_idx].strip() if sent_idx < len(row) else "",
  146. "received": row[received_idx].strip() if received_idx < len(row) else "",
  147. "loss_rate": row[loss_idx].strip() if loss_idx < len(row) else "",
  148. "avg_latency": row[latency_idx].strip() if latency_idx < len(row) else "",
  149. "download_speed": row[speed_idx].strip() if speed_idx < len(row) else "",
  150. "region": row[region_idx].strip() if region_idx < len(row) else "",
  151. }
  152. )
  153. if not out:
  154. raise RuntimeError("cfst result parsed to zero valid rows")
  155. return out
  156. def flatten_values(value):
  157. out = []
  158. if isinstance(value, str):
  159. out.append(value)
  160. elif isinstance(value, list):
  161. for item in value:
  162. out.extend(flatten_values(item))
  163. elif isinstance(value, dict):
  164. for item in value.values():
  165. out.extend(flatten_values(item))
  166. return out
  167. def get_by_json_path(data, path):
  168. cur = data
  169. for part in path.split("."):
  170. if isinstance(cur, dict) and part in cur:
  171. cur = cur[part]
  172. else:
  173. return None
  174. return cur
  175. def get_values_by_path(data, path):
  176. parts = path.split(".")
  177. def walk(cur, idx):
  178. if idx >= len(parts):
  179. return [cur]
  180. part = parts[idx]
  181. if part.endswith("[]"):
  182. key = part[:-2]
  183. if isinstance(cur, dict):
  184. arr = cur.get(key)
  185. else:
  186. arr = None
  187. if not isinstance(arr, list):
  188. return []
  189. out = []
  190. for item in arr:
  191. out.extend(walk(item, idx + 1))
  192. return out
  193. if isinstance(cur, dict) and part in cur:
  194. return walk(cur[part], idx + 1)
  195. return []
  196. return walk(data, 0)
  197. def parse_domains(payload, parser_cfg):
  198. domains = []
  199. for p in parser_cfg.get("field_paths", []):
  200. values = get_values_by_path(payload, p)
  201. domains.extend(flatten_values(values))
  202. for p in parser_cfg.get("json_paths", []):
  203. v = get_by_json_path(payload, p)
  204. if v is not None:
  205. domains.extend(flatten_values(v))
  206. if not domains:
  207. regex_s = parser_cfg.get("regex", r"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
  208. text = json.dumps(payload, ensure_ascii=True)
  209. domains.extend(re.findall(regex_s, text))
  210. clean = []
  211. seen = set()
  212. for d in domains:
  213. d = str(d).strip().lower().rstrip(".")
  214. if (DOMAIN_RE.match(d) or IPV4_RE.match(d)) and d not in seen:
  215. seen.add(d)
  216. clean.append(d)
  217. return clean
  218. def parse_timezone(tz_raw):
  219. if tz_raw is None:
  220. return dt.timezone.utc
  221. s = str(tz_raw).strip().upper()
  222. if s in {"", "UTC", "Z", "+00:00", "+0000"}:
  223. return dt.timezone.utc
  224. m = re.match(r"^([+-])(\d{2}):?(\d{2})$", s)
  225. if not m:
  226. raise ValueError(f"invalid created_time_timezone: {tz_raw}")
  227. sign = 1 if m.group(1) == "+" else -1
  228. hh = int(m.group(2))
  229. mm = int(m.group(3))
  230. if hh > 23 or mm > 59:
  231. raise ValueError(f"invalid created_time_timezone offset: {tz_raw}")
  232. return dt.timezone(sign * dt.timedelta(hours=hh, minutes=mm))
  233. def parse_created_time(value, formats, timezone):
  234. if value is None:
  235. return None
  236. s = str(value).strip()
  237. if not s:
  238. return None
  239. for fmt in formats:
  240. try:
  241. parsed = dt.datetime.strptime(s, fmt)
  242. if parsed.tzinfo is None:
  243. parsed = parsed.replace(tzinfo=timezone)
  244. return parsed.astimezone(dt.timezone.utc)
  245. except Exception:
  246. continue
  247. try:
  248. iso_text = s.replace("Z", "+00:00")
  249. parsed = dt.datetime.fromisoformat(iso_text)
  250. if parsed.tzinfo is None:
  251. parsed = parsed.replace(tzinfo=timezone)
  252. return parsed.astimezone(dt.timezone.utc)
  253. except Exception:
  254. return None
  255. def normalize_domain(value):
  256. if value is None:
  257. return ""
  258. return str(value).strip().lower().rstrip(".")
  259. def to_float_or_none(value):
  260. try:
  261. f = float(value)
  262. if not math.isfinite(f):
  263. return None
  264. return f
  265. except Exception:
  266. return None
  267. def resolve_field(record, field_name, field_map):
  268. path = field_map.get(field_name)
  269. if not path:
  270. raise ValueError(f"field '{field_name}' is not registered in record_mapping.field_map")
  271. if not isinstance(record, dict):
  272. return None
  273. return get_by_json_path(record, path)
  274. def extract_records(payload, record_mapping):
  275. records_path = str(record_mapping.get("records_path", "")).strip()
  276. raw = get_values_by_path(payload, records_path)
  277. return [x for x in raw if isinstance(x, dict)]
  278. def validate_config(cfg):
  279. source_type = get_source_type(cfg)
  280. if source_type not in {"api", "cfst_local"}:
  281. raise ValueError("source.type must be 'api' or 'cfst_local'")
  282. output_cfg = cfg.get("output", {})
  283. if output_cfg and not isinstance(output_cfg, dict):
  284. raise ValueError("output must be an object")
  285. if source_type == "cfst_local":
  286. cfst_cfg = cfg.get("cfst_local")
  287. if not isinstance(cfst_cfg, dict):
  288. raise ValueError("cfst_local is required and must be an object when source.type=cfst_local")
  289. work_dir = str(cfst_cfg.get("work_dir", "")).strip()
  290. if not work_dir:
  291. raise ValueError("cfst_local.work_dir is required")
  292. binary = str(cfst_cfg.get("binary", "")).strip()
  293. if not binary:
  294. raise ValueError("cfst_local.binary is required")
  295. result_file = str(cfst_cfg.get("result_file", "")).strip()
  296. if not result_file:
  297. raise ValueError("cfst_local.result_file is required")
  298. run_args = cfst_cfg.get("run_args", [])
  299. if not isinstance(run_args, list):
  300. raise ValueError("cfst_local.run_args must be an array")
  301. columns_cfg = cfst_cfg.get("columns", {})
  302. if columns_cfg and not isinstance(columns_cfg, dict):
  303. raise ValueError("cfst_local.columns must be an object")
  304. return
  305. record_mapping = cfg.get("record_mapping")
  306. if not isinstance(record_mapping, dict):
  307. raise ValueError("record_mapping is required and must be an object")
  308. records_path = str(record_mapping.get("records_path", "")).strip()
  309. if not records_path:
  310. raise ValueError("record_mapping.records_path is required")
  311. field_map = record_mapping.get("field_map")
  312. if not isinstance(field_map, dict) or not field_map:
  313. raise ValueError("record_mapping.field_map is required and must be a non-empty object")
  314. for key, path in field_map.items():
  315. if not str(key).strip() or not str(path).strip():
  316. raise ValueError("record_mapping.field_map contains empty field name or path")
  317. for required in ["domain", "created_at"]:
  318. if required not in field_map:
  319. raise ValueError(f"record_mapping.field_map.{required} is required")
  320. created_time_formats = record_mapping.get("created_time_formats")
  321. if not isinstance(created_time_formats, list) or not created_time_formats:
  322. raise ValueError("record_mapping.created_time_formats is required and must be a non-empty array")
  323. for fmt in created_time_formats:
  324. if not str(fmt).strip():
  325. raise ValueError("record_mapping.created_time_formats contains empty format")
  326. parse_timezone(record_mapping.get("created_time_timezone", "UTC"))
  327. def ensure_field_registered(field_name, where):
  328. if field_name not in field_map:
  329. raise ValueError(f"{where}: field '{field_name}' is not in record_mapping.field_map")
  330. record_filter = cfg.get("record_filter", {})
  331. if record_filter.get("enabled", False):
  332. rules = record_filter.get("exclude_if_any", [])
  333. if not isinstance(rules, list):
  334. raise ValueError("record_filter.exclude_if_any must be an array")
  335. for i, rule in enumerate(rules):
  336. if not isinstance(rule, dict):
  337. raise ValueError(f"record_filter.exclude_if_any[{i}] must be an object")
  338. field_name = str(rule.get("field", "")).strip()
  339. if not field_name:
  340. raise ValueError(f"record_filter.exclude_if_any[{i}].field is required")
  341. ensure_field_registered(field_name, f"record_filter.exclude_if_any[{i}]")
  342. has_matcher = any(k in rule for k in ["contains", "equals", "regex"])
  343. if not has_matcher:
  344. raise ValueError(f"record_filter.exclude_if_any[{i}] must include one of contains/equals/regex")
  345. scoring = cfg.get("scoring", {})
  346. if scoring.get("enabled", False):
  347. strategy = str(scoring.get("strategy", "")).strip()
  348. if strategy not in {"weighted_average", "lexicographic"}:
  349. raise ValueError("scoring.strategy must be 'weighted_average' or 'lexicographic'")
  350. within_hours = to_float_or_none(scoring.get("within_hours", 0))
  351. if within_hours is None or within_hours < 0:
  352. raise ValueError("scoring.within_hours must be a non-negative number (0 = disabled)")
  353. if strategy == "weighted_average":
  354. weighted_fields = scoring.get("weighted_fields")
  355. if not isinstance(weighted_fields, list) or not weighted_fields:
  356. raise ValueError("scoring.weighted_fields is required for weighted_average strategy")
  357. for i, item in enumerate(weighted_fields):
  358. if not isinstance(item, dict):
  359. raise ValueError(f"scoring.weighted_fields[{i}] must be an object")
  360. field_name = str(item.get("field", "")).strip()
  361. if not field_name:
  362. raise ValueError(f"scoring.weighted_fields[{i}].field is required")
  363. ensure_field_registered(field_name, f"scoring.weighted_fields[{i}]")
  364. weight = to_float_or_none(item.get("weight"))
  365. if weight is None or weight <= 0:
  366. raise ValueError(f"scoring.weighted_fields[{i}].weight must be > 0")
  367. if strategy == "lexicographic":
  368. lex_fields = scoring.get("lexicographic_fields")
  369. if not isinstance(lex_fields, list) or not lex_fields:
  370. raise ValueError("scoring.lexicographic_fields is required for lexicographic strategy")
  371. for i, item in enumerate(lex_fields):
  372. if isinstance(item, str):
  373. field_name = item.strip()
  374. order = ""
  375. elif isinstance(item, dict):
  376. field_name = str(item.get("field", "")).strip()
  377. order = str(item.get("order", "")).strip().lower()
  378. else:
  379. raise ValueError(f"scoring.lexicographic_fields[{i}] must be string or object")
  380. if not field_name:
  381. raise ValueError(f"scoring.lexicographic_fields[{i}] field is required")
  382. ensure_field_registered(field_name, f"scoring.lexicographic_fields[{i}]")
  383. if order and order not in {"asc", "desc"}:
  384. raise ValueError(f"scoring.lexicographic_fields[{i}].order must be asc or desc")
  385. tie_breakers = scoring.get("tie_breakers", [])
  386. if tie_breakers is not None:
  387. if not isinstance(tie_breakers, list):
  388. raise ValueError("scoring.tie_breakers must be an array")
  389. for i, item in enumerate(tie_breakers):
  390. if not isinstance(item, dict):
  391. raise ValueError(f"scoring.tie_breakers[{i}] must be an object")
  392. field_name = str(item.get("field", "")).strip()
  393. order = str(item.get("order", "")).strip().lower()
  394. if not field_name:
  395. raise ValueError(f"scoring.tie_breakers[{i}].field is required")
  396. if order not in {"asc", "desc"}:
  397. raise ValueError(f"scoring.tie_breakers[{i}].order must be asc or desc")
  398. ensure_field_registered(field_name, f"scoring.tie_breakers[{i}]")
  399. def rule_matches(value, rule):
  400. if value is None or not isinstance(rule, dict):
  401. return False
  402. values = flatten_values(value)
  403. if not values:
  404. values = [value]
  405. case_sensitive = bool(rule.get("case_sensitive", False))
  406. if "contains" in rule:
  407. needle = str(rule.get("contains", ""))
  408. if not needle:
  409. return False
  410. for item in values:
  411. hay = str(item)
  412. if case_sensitive:
  413. if needle in hay:
  414. return True
  415. else:
  416. if needle.lower() in hay.lower():
  417. return True
  418. return False
  419. if "equals" in rule:
  420. target = str(rule.get("equals", ""))
  421. for item in values:
  422. item_s = str(item)
  423. if case_sensitive:
  424. if item_s == target:
  425. return True
  426. else:
  427. if item_s.lower() == target.lower():
  428. return True
  429. return False
  430. if "regex" in rule:
  431. pattern = str(rule.get("regex", ""))
  432. if not pattern:
  433. return False
  434. flags = 0 if case_sensitive else re.IGNORECASE
  435. try:
  436. rx = re.compile(pattern, flags)
  437. except Exception:
  438. return False
  439. for item in values:
  440. if rx.search(str(item)):
  441. return True
  442. return False
  443. return False
  444. def collect_excluded_domains(records, field_map, record_filter_cfg):
  445. if not record_filter_cfg.get("enabled", False):
  446. return set()
  447. rules = record_filter_cfg.get("exclude_if_any", [])
  448. if not rules:
  449. return set()
  450. blocked = set()
  451. for record in records:
  452. domain = normalize_domain(resolve_field(record, "domain", field_map))
  453. if not domain:
  454. continue
  455. for rule in rules:
  456. field_name = str(rule.get("field", "")).strip()
  457. if not field_name:
  458. continue
  459. value = resolve_field(record, field_name, field_map)
  460. if rule_matches(value, rule):
  461. blocked.add(domain)
  462. break
  463. return blocked
  464. def build_lexicographic_descriptors(scoring_cfg, prefer_lower):
  465. out = []
  466. for item in scoring_cfg.get("lexicographic_fields", []):
  467. if isinstance(item, str):
  468. field_name = item.strip()
  469. order = "asc" if prefer_lower else "desc"
  470. else:
  471. field_name = str(item.get("field", "")).strip()
  472. order = str(item.get("order", "")).strip().lower()
  473. if not order:
  474. order = "asc" if prefer_lower else "desc"
  475. out.append({"field": field_name, "order": order})
  476. return out
  477. def parse_scored_records(records, field_map, record_mapping_cfg, scoring_cfg):
  478. if not scoring_cfg.get("enabled", False):
  479. return []
  480. strategy = str(scoring_cfg.get("strategy", "weighted_average")).strip()
  481. prefer_lower = bool(scoring_cfg.get("prefer_lower", False))
  482. timezone = parse_timezone(record_mapping_cfg.get("created_time_timezone", "UTC"))
  483. time_formats = [str(x) for x in record_mapping_cfg.get("created_time_formats", [])]
  484. weighted_fields = scoring_cfg.get("weighted_fields", []) if strategy == "weighted_average" else []
  485. lex_descriptors = build_lexicographic_descriptors(scoring_cfg, prefer_lower) if strategy == "lexicographic" else []
  486. needed_fields = set()
  487. for item in weighted_fields:
  488. needed_fields.add(str(item.get("field", "")).strip())
  489. for item in lex_descriptors:
  490. needed_fields.add(str(item.get("field", "")).strip())
  491. for item in scoring_cfg.get("tie_breakers", []):
  492. needed_fields.add(str(item.get("field", "")).strip())
  493. needed_fields.discard("domain")
  494. needed_fields.discard("created_at")
  495. out = []
  496. for record in records:
  497. domain = normalize_domain(resolve_field(record, "domain", field_map))
  498. if not domain:
  499. continue
  500. created_raw = resolve_field(record, "created_at", field_map)
  501. created_at = parse_created_time(created_raw, time_formats, timezone)
  502. field_values = {}
  503. for field_name in needed_fields:
  504. field_values[field_name] = resolve_field(record, field_name, field_map)
  505. score_value = None
  506. scores = []
  507. lex_values = []
  508. if strategy == "weighted_average":
  509. total = 0.0
  510. total_weight = 0.0
  511. missing = False
  512. for item in weighted_fields:
  513. field_name = str(item.get("field", "")).strip()
  514. weight = float(item.get("weight"))
  515. raw_v = resolve_field(record, field_name, field_map)
  516. val = to_float_or_none(raw_v)
  517. scores.append(val)
  518. if val is None:
  519. missing = True
  520. continue
  521. total += val * weight
  522. total_weight += weight
  523. if not missing and total_weight > 0:
  524. score_value = total / total_weight
  525. if strategy == "lexicographic":
  526. for item in lex_descriptors:
  527. field_name = item["field"]
  528. order = item["order"]
  529. raw_v = resolve_field(record, field_name, field_map)
  530. num_v = to_float_or_none(raw_v)
  531. v = num_v if num_v is not None else raw_v
  532. lex_values.append({"field": field_name, "value": v, "order": order})
  533. scores.append(v)
  534. out.append(
  535. {
  536. "domain": domain,
  537. "created_at": created_at,
  538. "created_raw": created_raw,
  539. "scores": scores,
  540. "score_value": score_value,
  541. "lex_values": lex_values,
  542. "field_values": field_values,
  543. "raw_record": record,
  544. }
  545. )
  546. return out
  547. def cmp_scalar(a, b, order):
  548. a_none = a is None
  549. b_none = b is None
  550. if a_none and b_none:
  551. return 0
  552. if a_none:
  553. return 1
  554. if b_none:
  555. return -1
  556. if isinstance(a, dt.datetime):
  557. a = a.timestamp()
  558. if isinstance(b, dt.datetime):
  559. b = b.timestamp()
  560. a_num = to_float_or_none(a)
  561. b_num = to_float_or_none(b)
  562. if a_num is not None and b_num is not None:
  563. if a_num < b_num:
  564. base = -1
  565. elif a_num > b_num:
  566. base = 1
  567. else:
  568. base = 0
  569. else:
  570. a_s = str(a).lower()
  571. b_s = str(b).lower()
  572. if a_s < b_s:
  573. base = -1
  574. elif a_s > b_s:
  575. base = 1
  576. else:
  577. base = 0
  578. return base if order == "asc" else -base
  579. def get_sort_field_value(record, field_name):
  580. if field_name == "domain":
  581. return record.get("domain")
  582. if field_name == "created_at":
  583. return record.get("created_at")
  584. return record.get("field_values", {}).get(field_name)
  585. def rank_scored_records(records, scoring_cfg):
  586. if not records:
  587. return []
  588. within_hours = float(scoring_cfg.get("within_hours", 0))
  589. strategy = str(scoring_cfg.get("strategy", "weighted_average")).strip()
  590. prefer_lower = bool(scoring_cfg.get("prefer_lower", False))
  591. tie_breakers = scoring_cfg.get("tie_breakers", [])
  592. if within_hours > 0:
  593. now = dt.datetime.now(dt.timezone.utc)
  594. cutoff = now - dt.timedelta(hours=within_hours)
  595. recent = [r for r in records if r.get("created_at") is not None and r["created_at"] >= cutoff]
  596. candidates = recent if recent else records
  597. else:
  598. candidates = records
  599. default_lex_order = "asc" if prefer_lower else "desc"
  600. def compare(a, b):
  601. if strategy == "weighted_average":
  602. order = "asc" if prefer_lower else "desc"
  603. c = cmp_scalar(a.get("score_value"), b.get("score_value"), order)
  604. if c != 0:
  605. return c
  606. elif strategy == "lexicographic":
  607. a_lex = a.get("lex_values", [])
  608. b_lex = b.get("lex_values", [])
  609. n = max(len(a_lex), len(b_lex))
  610. for i in range(n):
  611. av = a_lex[i]["value"] if i < len(a_lex) else None
  612. bv = b_lex[i]["value"] if i < len(b_lex) else None
  613. order = default_lex_order
  614. if i < len(a_lex) and a_lex[i].get("order"):
  615. order = a_lex[i]["order"]
  616. c = cmp_scalar(av, bv, order)
  617. if c != 0:
  618. return c
  619. for item in tie_breakers:
  620. field_name = str(item.get("field", "")).strip()
  621. order = str(item.get("order", "asc")).strip().lower()
  622. av = get_sort_field_value(a, field_name)
  623. bv = get_sort_field_value(b, field_name)
  624. c = cmp_scalar(av, bv, order)
  625. if c != 0:
  626. return c
  627. return cmp_scalar(a.get("domain"), b.get("domain"), "asc")
  628. return sorted(candidates, key=functools.cmp_to_key(compare))
  629. def apply_filter(domains, filter_cfg):
  630. include_suffixes = [s.lower() for s in filter_cfg.get("include_suffixes", []) if s]
  631. exclude_regex = [re.compile(x) for x in filter_cfg.get("exclude_regex", []) if x]
  632. out = []
  633. for d in domains:
  634. if include_suffixes and not any(d.endswith(s) for s in include_suffixes):
  635. continue
  636. if any(rx.search(d) for rx in exclude_regex):
  637. continue
  638. out.append(d)
  639. return out
  640. def choose_top_candidate_domains(filtered_domains, top_n, ranked_scored):
  641. if ranked_scored:
  642. domains_by_score = [x["domain"] for x in ranked_scored]
  643. return domains_by_score[:top_n]
  644. return filtered_domains[:top_n]
  645. def text_or_blank(value):
  646. if value is None:
  647. return ""
  648. return str(value).strip()
  649. def set_if_nonempty_text(obj, key, value):
  650. text = text_or_blank(value)
  651. if text:
  652. obj[key] = text
  653. def base_top_candidate(domain, source_type):
  654. candidate = {"domain": domain, "source_type": source_type}
  655. if IPV4_RE.match(domain):
  656. candidate["ip"] = domain
  657. return candidate
  658. def maybe_resolve_field(record, field_name, field_map):
  659. if not isinstance(record, dict):
  660. return None
  661. if field_name not in field_map:
  662. return None
  663. return resolve_field(record, field_name, field_map)
  664. def build_cfst_candidate(row):
  665. domain = row.get("domain", "")
  666. candidate = base_top_candidate(domain=domain, source_type="cfst_local")
  667. set_if_nonempty_text(candidate, "ip", row.get("ip") or domain)
  668. set_if_nonempty_text(candidate, "loss_rate", row.get("loss_rate"))
  669. set_if_nonempty_text(candidate, "avg_latency", row.get("avg_latency"))
  670. set_if_nonempty_text(candidate, "download_speed", row.get("download_speed"))
  671. set_if_nonempty_text(candidate, "region", row.get("region"))
  672. return candidate
  673. def build_api_candidate(domain, record, field_map, scored_record=None):
  674. candidate = base_top_candidate(domain=domain, source_type="api")
  675. if record:
  676. set_if_nonempty_text(candidate, "created_raw", maybe_resolve_field(record, "created_at", field_map))
  677. set_if_nonempty_text(candidate, "avg_latency", maybe_resolve_field(record, "avg_latency", field_map))
  678. set_if_nonempty_text(candidate, "loss_rate", maybe_resolve_field(record, "avg_pkg_lost_rate", field_map))
  679. set_if_nonempty_text(candidate, "download_speed", maybe_resolve_field(record, "download_speed", field_map))
  680. location_country = text_or_blank(maybe_resolve_field(record, "location_country", field_map))
  681. location_city = text_or_blank(maybe_resolve_field(record, "location_city", field_map))
  682. region = "/".join([x for x in [location_country, location_city] if x])
  683. set_if_nonempty_text(candidate, "region", region)
  684. if scored_record:
  685. if scored_record.get("score_value") is not None:
  686. candidate["score_value"] = scored_record.get("score_value")
  687. scores = list(scored_record.get("scores", []))
  688. if scores:
  689. candidate["scores"] = scores
  690. if "created_raw" not in candidate:
  691. set_if_nonempty_text(candidate, "created_raw", scored_record.get("created_raw"))
  692. return candidate
  693. def build_top_candidates(source_type, candidate_domains, cfst_rows=None, records=None, field_map=None, ranked_scored=None):
  694. if source_type == "cfst_local":
  695. row_map = {}
  696. for row in cfst_rows or []:
  697. domain = row.get("domain", "")
  698. if domain and domain not in row_map:
  699. row_map[domain] = row
  700. return [build_cfst_candidate(row_map.get(d, {"domain": d, "ip": d})) for d in candidate_domains]
  701. ranked_map = {}
  702. for item in ranked_scored or []:
  703. domain = item.get("domain", "")
  704. if domain and domain not in ranked_map:
  705. ranked_map[domain] = item
  706. record_map = {}
  707. for record in records or []:
  708. domain = normalize_domain(maybe_resolve_field(record, "domain", field_map or {}))
  709. if domain and domain not in record_map:
  710. record_map[domain] = record
  711. out = []
  712. for domain in candidate_domains:
  713. scored_record = ranked_map.get(domain)
  714. record = None
  715. if scored_record:
  716. record = scored_record.get("raw_record")
  717. if record is None:
  718. record = record_map.get(domain)
  719. out.append(build_api_candidate(domain, record, field_map or {}, scored_record=scored_record))
  720. return out
  721. def run_notify(cmd, domain, status):
  722. if not cmd:
  723. return
  724. env = os.environ.copy()
  725. env["AUTODOMAIN"] = domain
  726. env["AUTODOMAIN_STATUS"] = status
  727. subprocess.run(cmd, shell=True, check=False, env=env)
  728. def choose_domain(filtered_domains, top_n, ranked_scored):
  729. if ranked_scored:
  730. top_domains = choose_top_candidate_domains(filtered_domains, top_n, ranked_scored)
  731. if top_domains:
  732. return top_domains[0], top_domains
  733. if filtered_domains:
  734. return filtered_domains[0], filtered_domains[:top_n]
  735. return None, []
  736. def build_output_settings(output_cfg, config_path_abs):
  737. runtime_dir_cfg = output_cfg.get("runtime_dir", "./runtime")
  738. runtime_dir = resolve_path(os.path.dirname(config_path_abs), runtime_dir_cfg)
  739. selected_text_name = output_cfg.get("selected_value_file", output_cfg.get("current_domain_file", "current_domain.txt"))
  740. selected_json_name = output_cfg.get("selected_value_json", output_cfg.get("current_domain_json", "current_domain.json"))
  741. state_name = output_cfg.get("state_file", "state.json")
  742. vars_name = output_cfg.get("export_vars_file", output_cfg.get("substore_vars_file", "substore_vars.json"))
  743. return {
  744. "runtime_dir": runtime_dir,
  745. "selected_text_path": os.path.join(runtime_dir, selected_text_name),
  746. "selected_json_path": os.path.join(runtime_dir, selected_json_name),
  747. "state_path": os.path.join(runtime_dir, state_name),
  748. "vars_path": os.path.join(runtime_dir, vars_name),
  749. "selected_json_key": str(output_cfg.get("selected_value_json_key", "domain")).strip() or "domain",
  750. "state_last_good_key": str(output_cfg.get("state_last_good_key", "last_good_domain")).strip() or "last_good_domain",
  751. "vars_value_key": str(output_cfg.get("substore_value_key", "AUTO_DOMAIN")).strip() or "AUTO_DOMAIN",
  752. }
  753. def print_output_settings(config_path_abs, cfg):
  754. output_cfg = cfg.get("output", {})
  755. settings = build_output_settings(output_cfg, config_path_abs)
  756. print(json.dumps(settings, ensure_ascii=True))
def main():
    """Command-line entry point: select a value and write the output files.

    Flow, as implemented below:
      1. Parse ``--config`` / ``--print-output-settings``; exit with status 1
         when the config file is missing or ``validate_config`` raises.
      2. With ``--print-output-settings``, print the resolved settings and
         return.
      3. Otherwise read the previous state, obtain candidates from either the
         ``cfst_local`` source or the API, and pick ``selected``.
      4. On success write the text file, the JSON file, the vars file and the
         state file, run the notify command and print the result JSON.
      5. On any exception in steps 3-4 write an error state; if a last good
         value exists, re-publish it with status ``error_use_last_good`` and
         return normally, otherwise print the error to stderr and exit 1.
    """
    ap = argparse.ArgumentParser(description="Auto select preferred endpoint value")
    ap.add_argument("--config", default="config.server.json", help="Path to config JSON")
    ap.add_argument(
        "--print-output-settings",
        action="store_true",
        help="Print resolved output settings as JSON and exit",
    )
    args = ap.parse_args()
    config_path_abs = os.path.abspath(args.config)
    if not os.path.exists(config_path_abs):
        print(json.dumps({"status": "error", "error": f"config file not found: {config_path_abs}"}, ensure_ascii=True), file=sys.stderr)
        sys.exit(1)
    # NOTE(review): this read happens outside any try block, so an error raised
    # by read_json_file here is not reported as JSON — confirm that is intended.
    cfg = read_json_file(config_path_abs)
    try:
        validate_config(cfg)
    except Exception as e:
        print(json.dumps({"status": "error", "error": f"invalid config: {e}"}, ensure_ascii=True), file=sys.stderr)
        sys.exit(1)
    if args.print_output_settings:
        print_output_settings(config_path_abs, cfg)
        return
    output_cfg = cfg.get("output", {})
    output_settings = build_output_settings(output_cfg, config_path_abs)
    notify_cfg = cfg.get("notify", {})
    selected_text_file = output_settings["selected_text_path"]
    selected_json_file = output_settings["selected_json_path"]
    state_file = output_settings["state_path"]
    vars_file = output_settings["vars_path"]
    selected_json_key = output_settings["selected_json_key"]
    state_last_good_key = output_settings["state_last_good_key"]
    vars_value_key = output_settings["vars_value_key"]
    # Previous run's state supplies the fallback value used further down.
    state = read_json_file(state_file, default={})
    last_good = state.get(state_last_good_key, "")
    source_type = get_source_type(cfg)
    try:
        top_n = int(cfg.get("selection", {}).get("top_n", 3))
        payload = None
        if source_type == "cfst_local":
            # cfst path: rows keep their loaded order; the first row that
            # survives filtering becomes the selection.
            cfst_rows = load_cfst_rows(cfg, config_path_abs)
            parsed = [row["domain"] for row in cfst_rows]
            filtered = apply_filter(parsed, cfg.get("domain_filter", {}))
            filtered_set = set(filtered)
            cfst_rows = [row for row in cfst_rows if row["domain"] in filtered_set]
            if not cfst_rows:
                raise RuntimeError("No valid IP available from cfst result after filtering")
            selected = cfst_rows[0]["domain"]
            candidate_domains = [row["domain"] for row in cfst_rows[:top_n]]
            top_candidates = build_top_candidates("cfst_local", candidate_domains, cfst_rows=cfst_rows)
        else:
            # API path: parse domains, drop filtered/blocked ones, then rank
            # the remaining scored records.
            payload = fetch_api_json(cfg)
            parsed = parse_domains(payload, cfg.get("parser", {}))
            filtered = apply_filter(parsed, cfg.get("domain_filter", {}))
            record_mapping_cfg = cfg.get("record_mapping", {})
            field_map = record_mapping_cfg.get("field_map", {})
            records = extract_records(payload, record_mapping_cfg)
            record_filter_cfg = cfg.get("record_filter", {})
            blocked_domains = collect_excluded_domains(records, field_map, record_filter_cfg)
            if blocked_domains:
                filtered = [d for d in filtered if d not in blocked_domains]
            scoring_cfg = cfg.get("scoring", {})
            scored_records = parse_scored_records(records, field_map, record_mapping_cfg, scoring_cfg)
            filtered_set = set(filtered)
            # Only records whose domain survived filtering take part in ranking.
            scored_records = [r for r in scored_records if r["domain"] in filtered_set]
            ranked_scored = rank_scored_records(scored_records, scoring_cfg)
            selected, candidate_domains = choose_domain(filtered, top_n, ranked_scored)
            top_candidates = build_top_candidates(
                "api",
                candidate_domains,
                records=records,
                field_map=field_map,
                ranked_scored=ranked_scored,
            )
        status = "ok"
        # Nothing selected this run: reuse the last good value if one exists.
        if not selected and last_good:
            selected = last_good
            status = "fallback_last_good"
        if not selected:
            if source_type == "cfst_local":
                raise RuntimeError("No valid IP available from cfst and no fallback in state")
            raise RuntimeError("No valid domain available from API and no fallback in state")
        write_text_file(selected_text_file, selected + "\n")
        current_json = {
            selected_json_key: selected,
            "updated_at": utc_now_iso(),
            "status": status,
            "source_type": source_type,
            # Count of parsed entries before any filtering was applied.
            "source_count": len(parsed),
            "top_candidates": top_candidates,
        }
        write_json_file(selected_json_file, current_json)
        write_json_file(
            vars_file,
            {
                vars_value_key: selected,
                "UPDATED_AT": current_json["updated_at"],
                "STATUS": status,
            },
        )
        new_state = {
            "updated_at": current_json["updated_at"],
            state_last_good_key: selected,
            "status": status,
            "source_count": len(parsed),
            "source_type": source_type,
        }
        write_json_file(state_file, new_state)
        run_notify(notify_cfg.get("command", ""), selected, status)
        print(json.dumps(current_json, ensure_ascii=True))
    except Exception as e:
        # Any failure above: record the error in the state file while keeping
        # the previously stored last good value under its key.
        now = utc_now_iso()
        err_state = {
            "updated_at": now,
            "status": "error",
            "error": str(e),
            state_last_good_key: last_good,
            "source_type": source_type,
        }
        write_json_file(state_file, err_state)
        if last_good:
            # Re-publish the last good value to every output file and notify;
            # this path returns normally instead of exiting with an error code.
            write_text_file(selected_text_file, last_good + "\n")
            write_json_file(
                selected_json_file,
                {
                    selected_json_key: last_good,
                    "updated_at": now,
                    "status": "error_use_last_good",
                    "error": str(e),
                    "source_type": source_type,
                },
            )
            write_json_file(
                vars_file,
                {
                    vars_value_key: last_good,
                    "UPDATED_AT": now,
                    "STATUS": "error_use_last_good",
                },
            )
            run_notify(notify_cfg.get("command", ""), last_good, "error_use_last_good")
            print(json.dumps({"status": "error_use_last_good", "error": str(e)}, ensure_ascii=True))
            return
        # No fallback available: report on stderr and exit with status 1.
        print(json.dumps({"status": "error", "error": str(e)}, ensure_ascii=True), file=sys.stderr)
        sys.exit(1)
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()