|
|
@@ -1,7 +1,9 @@
|
|
|
#!/usr/bin/env python3
|
|
|
import argparse
|
|
|
import datetime as dt
|
|
|
+import functools
|
|
|
import json
|
|
|
+import math
|
|
|
import os
|
|
|
import re
|
|
|
import socket
|
|
|
@@ -143,26 +145,205 @@ def parse_domains(payload, parser_cfg):
|
|
|
clean = []
|
|
|
seen = set()
|
|
|
for d in domains:
|
|
|
- d = d.strip().lower().rstrip(".")
|
|
|
+ d = str(d).strip().lower().rstrip(".")
|
|
|
if (DOMAIN_RE.match(d) or IPV4_RE.match(d)) and d not in seen:
|
|
|
seen.add(d)
|
|
|
clean.append(d)
|
|
|
return clean
|
|
|
|
|
|
|
|
|
-def parse_created_time(s):
|
|
|
def parse_timezone(tz_raw):
    """Return a ``datetime.timezone`` for *tz_raw*.

    Accepts None, '', 'UTC', 'Z', '+00:00', '+0000' (all meaning UTC) and
    fixed offsets written as ``+HH:MM`` or ``+HHMM``. Raises ValueError for
    anything else.
    """
    if tz_raw is None:
        return dt.timezone.utc

    text = str(tz_raw).strip().upper()
    if text in {"", "UTC", "Z", "+00:00", "+0000"}:
        return dt.timezone.utc

    match = re.match(r"^([+-])(\d{2}):?(\d{2})$", text)
    if match is None:
        raise ValueError(f"invalid created_time_timezone: {tz_raw}")

    sign_char, hours_text, minutes_text = match.groups()
    hours = int(hours_text)
    minutes = int(minutes_text)
    if hours > 23 or minutes > 59:
        raise ValueError(f"invalid created_time_timezone offset: {tz_raw}")

    offset = dt.timedelta(hours=hours, minutes=minutes)
    if sign_char == "-":
        offset = -offset
    return dt.timezone(offset)
|
|
|
+
|
|
|
+
|
|
|
def parse_created_time(value, formats, timezone):
    """Parse *value* into a timezone-aware UTC datetime.

    Each strptime format in *formats* is tried in order; ISO-8601 (with a
    'Z' suffix tolerated) is the final fallback. Naive results are localized
    to *timezone* before conversion to UTC. Returns None when *value* is
    None/blank or nothing matches.
    """
    if value is None:
        return None

    text = str(value).strip()
    if not text:
        return None

    def _to_utc(candidate):
        # Naive timestamps are interpreted in the configured source timezone.
        if candidate.tzinfo is None:
            candidate = candidate.replace(tzinfo=timezone)
        return candidate.astimezone(dt.timezone.utc)

    for fmt in formats:
        try:
            return _to_utc(dt.datetime.strptime(text, fmt))
        except Exception:
            continue

    # ISO-8601 fallback; fromisoformat() on older Pythons rejects 'Z'.
    try:
        return _to_utc(dt.datetime.fromisoformat(text.replace("Z", "+00:00")))
    except Exception:
        return None
|
|
|
|
|
|
|
|
|
-def record_field_value(record, field_path):
|
|
|
- if not isinstance(record, dict) or not field_path:
|
|
|
def normalize_domain(value):
    """Canonicalize a domain value: trimmed, lowercased, trailing dots removed.

    None maps to the empty string so callers can rely on truthiness checks.
    """
    if value is None:
        return ""
    text = str(value)
    return text.strip().lower().rstrip(".")
|
|
|
+
|
|
|
+
|
|
|
def to_float_or_none(value):
    """Coerce *value* to a finite float, or return None.

    Returns None both for values float() cannot convert and for NaN/±inf,
    so callers can treat "missing" and "unusable" numbers uniformly.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # float() raises TypeError for non-numeric types, ValueError for
        # unparsable strings, and OverflowError for ints beyond float range.
        # Narrower than the original bare `except Exception`, which could
        # silently mask unrelated bugs.
        return None
    # math.isfinite never raises on a float, so it can sit outside the try.
    return number if math.isfinite(number) else None
|
|
|
+
|
|
|
+
|
|
|
def resolve_field(record, field_name, field_map):
    """Read the value for logical field *field_name* out of *record*.

    The JSON path comes from *field_map*; a missing/empty mapping raises
    ValueError. A non-dict record yields None.
    """
    path = field_map.get(field_name)
    if not path:
        raise ValueError(f"field '{field_name}' is not registered in record_mapping.field_map")
    if isinstance(record, dict):
        return get_by_json_path(record, path)
    return None
|
|
|
+
|
|
|
+
|
|
|
def extract_records(payload, record_mapping):
    """Return the dict entries found at record_mapping['records_path'].

    Non-dict values produced by the path lookup are silently dropped.
    """
    path = str(record_mapping.get("records_path", "")).strip()
    candidates = get_values_by_path(payload, path)
    return [entry for entry in candidates if isinstance(entry, dict)]
|
|
|
+
|
|
|
+
|
|
|
def validate_config(cfg):
    """Validate the whole config dict up front; raise ValueError on any problem.

    Checks record_mapping (records_path, field_map, created_time_formats,
    created_time_timezone), the optional record_filter rules and the optional
    scoring section, so downstream stages can assume a sane shape. Validation
    order is part of the contract: the first failing check wins.
    """
    record_mapping = cfg.get("record_mapping")
    if not isinstance(record_mapping, dict):
        raise ValueError("record_mapping is required and must be an object")

    records_path = str(record_mapping.get("records_path", "")).strip()
    if not records_path:
        raise ValueError("record_mapping.records_path is required")

    field_map = record_mapping.get("field_map")
    if not isinstance(field_map, dict) or not field_map:
        raise ValueError("record_mapping.field_map is required and must be a non-empty object")

    # Every registered field needs a non-blank name and a non-blank path.
    for key, path in field_map.items():
        if not str(key).strip() or not str(path).strip():
            raise ValueError("record_mapping.field_map contains empty field name or path")

    # 'domain' and 'created_at' are the two fields the pipeline itself reads.
    for required in ["domain", "created_at"]:
        if required not in field_map:
            raise ValueError(f"record_mapping.field_map.{required} is required")

    created_time_formats = record_mapping.get("created_time_formats")
    if not isinstance(created_time_formats, list) or not created_time_formats:
        raise ValueError("record_mapping.created_time_formats is required and must be a non-empty array")

    for fmt in created_time_formats:
        if not str(fmt).strip():
            raise ValueError("record_mapping.created_time_formats contains empty format")

    # Delegate offset syntax checking to parse_timezone (raises on bad input).
    parse_timezone(record_mapping.get("created_time_timezone", "UTC"))

    # Closure shared by the filter/scoring checks below: every field referenced
    # anywhere in the config must be registered in field_map.
    def ensure_field_registered(field_name, where):
        if field_name not in field_map:
            raise ValueError(f"{where}: field '{field_name}' is not in record_mapping.field_map")

    record_filter = cfg.get("record_filter", {})
    if record_filter.get("enabled", False):
        rules = record_filter.get("exclude_if_any", [])
        if not isinstance(rules, list):
            raise ValueError("record_filter.exclude_if_any must be an array")

        for i, rule in enumerate(rules):
            if not isinstance(rule, dict):
                raise ValueError(f"record_filter.exclude_if_any[{i}] must be an object")
            field_name = str(rule.get("field", "")).strip()
            if not field_name:
                raise ValueError(f"record_filter.exclude_if_any[{i}].field is required")
            ensure_field_registered(field_name, f"record_filter.exclude_if_any[{i}]")

            # A rule without any matcher key would never match anything.
            has_matcher = any(k in rule for k in ["contains", "equals", "regex"])
            if not has_matcher:
                raise ValueError(f"record_filter.exclude_if_any[{i}] must include one of contains/equals/regex")

    scoring = cfg.get("scoring", {})
    if scoring.get("enabled", False):
        strategy = str(scoring.get("strategy", "")).strip()
        if strategy not in {"weighted_average", "lexicographic"}:
            raise ValueError("scoring.strategy must be 'weighted_average' or 'lexicographic'")

        within_hours = to_float_or_none(scoring.get("within_hours", 24))
        if within_hours is None or within_hours <= 0:
            raise ValueError("scoring.within_hours must be a positive number")

        if strategy == "weighted_average":
            weighted_fields = scoring.get("weighted_fields")
            if not isinstance(weighted_fields, list) or not weighted_fields:
                raise ValueError("scoring.weighted_fields is required for weighted_average strategy")

            for i, item in enumerate(weighted_fields):
                if not isinstance(item, dict):
                    raise ValueError(f"scoring.weighted_fields[{i}] must be an object")
                field_name = str(item.get("field", "")).strip()
                if not field_name:
                    raise ValueError(f"scoring.weighted_fields[{i}].field is required")
                ensure_field_registered(field_name, f"scoring.weighted_fields[{i}]")

                weight = to_float_or_none(item.get("weight"))
                if weight is None or weight <= 0:
                    raise ValueError(f"scoring.weighted_fields[{i}].weight must be > 0")

        if strategy == "lexicographic":
            lex_fields = scoring.get("lexicographic_fields")
            if not isinstance(lex_fields, list) or not lex_fields:
                raise ValueError("scoring.lexicographic_fields is required for lexicographic strategy")

            # Entries may be bare field-name strings or {field, order} objects.
            for i, item in enumerate(lex_fields):
                if isinstance(item, str):
                    field_name = item.strip()
                    order = ""
                elif isinstance(item, dict):
                    field_name = str(item.get("field", "")).strip()
                    order = str(item.get("order", "")).strip().lower()
                else:
                    raise ValueError(f"scoring.lexicographic_fields[{i}] must be string or object")

                if not field_name:
                    raise ValueError(f"scoring.lexicographic_fields[{i}] field is required")
                ensure_field_registered(field_name, f"scoring.lexicographic_fields[{i}]")
                # Empty order is allowed here; a default is chosen at ranking time.
                if order and order not in {"asc", "desc"}:
                    raise ValueError(f"scoring.lexicographic_fields[{i}].order must be asc or desc")

        tie_breakers = scoring.get("tie_breakers", [])
        # NOTE(review): tie_breakers is None only when the config explicitly
        # sets it to null — presumably meaning "no tie-breakers"; confirm.
        if tie_breakers is not None:
            if not isinstance(tie_breakers, list):
                raise ValueError("scoring.tie_breakers must be an array")
            for i, item in enumerate(tie_breakers):
                if not isinstance(item, dict):
                    raise ValueError(f"scoring.tie_breakers[{i}] must be an object")
                field_name = str(item.get("field", "")).strip()
                order = str(item.get("order", "")).strip().lower()
                if not field_name:
                    raise ValueError(f"scoring.tie_breakers[{i}].field is required")
                # Unlike lexicographic fields, tie-breakers must spell out the order.
                if order not in {"asc", "desc"}:
                    raise ValueError(f"scoring.tie_breakers[{i}].order must be asc or desc")
                ensure_field_registered(field_name, f"scoring.tie_breakers[{i}]")
|
|
|
|
|
|
|
|
|
def rule_matches(value, rule):
|
|
|
@@ -218,7 +399,7 @@ def rule_matches(value, rule):
|
|
|
return False
|
|
|
|
|
|
|
|
|
-def collect_excluded_domains(payload, record_filter_cfg, scoring_cfg):
|
|
|
+def collect_excluded_domains(records, field_map, record_filter_cfg):
|
|
|
if not record_filter_cfg.get("enabled", False):
|
|
|
return set()
|
|
|
|
|
|
@@ -226,24 +407,17 @@ def collect_excluded_domains(payload, record_filter_cfg, scoring_cfg):
|
|
|
if not rules:
|
|
|
return set()
|
|
|
|
|
|
- records_path = record_filter_cfg.get("records_path", scoring_cfg.get("records_path", "data.good[]"))
|
|
|
- domain_field = record_filter_cfg.get("domain_field", scoring_cfg.get("ip_field", "ip"))
|
|
|
-
|
|
|
blocked = set()
|
|
|
- for record in get_values_by_path(payload, records_path):
|
|
|
- if not isinstance(record, dict):
|
|
|
- continue
|
|
|
-
|
|
|
- domain_raw = record_field_value(record, domain_field)
|
|
|
- domain = str(domain_raw or "").strip().lower().rstrip(".")
|
|
|
+ for record in records:
|
|
|
+ domain = normalize_domain(resolve_field(record, "domain", field_map))
|
|
|
if not domain:
|
|
|
continue
|
|
|
|
|
|
for rule in rules:
|
|
|
- field_path = str(rule.get("field_path", "")).strip()
|
|
|
- if not field_path:
|
|
|
+ field_name = str(rule.get("field", "")).strip()
|
|
|
+ if not field_name:
|
|
|
continue
|
|
|
- value = record_field_value(record, field_path)
|
|
|
+ value = resolve_field(record, field_name, field_map)
|
|
|
if rule_matches(value, rule):
|
|
|
blocked.add(domain)
|
|
|
break
|
|
|
@@ -251,49 +425,157 @@ def collect_excluded_domains(payload, record_filter_cfg, scoring_cfg):
|
|
|
return blocked
|
|
|
|
|
|
|
|
|
-def parse_scored_records(payload, scoring_cfg):
|
|
|
def build_lexicographic_descriptors(scoring_cfg, prefer_lower):
    """Normalize scoring.lexicographic_fields into [{'field', 'order'}] dicts.

    A bare string means "use the default order"; dict entries may carry an
    explicit 'order'. The default is 'asc' when *prefer_lower* is true,
    otherwise 'desc'.
    """
    default_order = "asc" if prefer_lower else "desc"
    descriptors = []
    for entry in scoring_cfg.get("lexicographic_fields", []):
        if isinstance(entry, str):
            name = entry.strip()
            direction = default_order
        else:
            name = str(entry.get("field", "")).strip()
            direction = str(entry.get("order", "")).strip().lower() or default_order
        descriptors.append({"field": name, "order": direction})
    return descriptors
|
|
|
+
|
|
|
+
|
|
|
def parse_scored_records(records, field_map, record_mapping_cfg, scoring_cfg):
    """Turn raw records into scored entries ready for ranking.

    Each output dict carries: the normalized domain, created_at (aware UTC
    datetime or None), the raw created value, the per-field score list, an
    optional weighted-average score_value, ordered lex_values, and the extra
    field_values that tie-breakers may need. Records without a usable domain
    are skipped. Returns [] when scoring is disabled.
    """
    if not scoring_cfg.get("enabled", False):
        return []

    strategy = str(scoring_cfg.get("strategy", "weighted_average")).strip()
    prefer_lower = bool(scoring_cfg.get("prefer_lower", False))

    # Timestamp parsing parameters come from record_mapping, not scoring.
    timezone = parse_timezone(record_mapping_cfg.get("created_time_timezone", "UTC"))
    time_formats = [str(x) for x in record_mapping_cfg.get("created_time_formats", [])]

    # Only the active strategy's field descriptors are materialized.
    weighted_fields = scoring_cfg.get("weighted_fields", []) if strategy == "weighted_average" else []
    lex_descriptors = build_lexicographic_descriptors(scoring_cfg, prefer_lower) if strategy == "lexicographic" else []

    # Union of every field any sort/tie-break step might read per record.
    needed_fields = set()
    for item in weighted_fields:
        needed_fields.add(str(item.get("field", "")).strip())
    for item in lex_descriptors:
        needed_fields.add(str(item.get("field", "")).strip())
    for item in scoring_cfg.get("tie_breakers", []):
        needed_fields.add(str(item.get("field", "")).strip())
    # domain/created_at are stored at the top level of each entry already.
    needed_fields.discard("domain")
    needed_fields.discard("created_at")

    out = []
    for record in records:
        domain = normalize_domain(resolve_field(record, "domain", field_map))
        if not domain:
            continue

        created_raw = resolve_field(record, "created_at", field_map)
        created_at = parse_created_time(created_raw, time_formats, timezone)

        field_values = {}
        for field_name in needed_fields:
            field_values[field_name] = resolve_field(record, field_name, field_map)

        score_value = None
        scores = []
        lex_values = []

        if strategy == "weighted_average":
            total = 0.0
            total_weight = 0.0
            missing = False
            for item in weighted_fields:
                field_name = str(item.get("field", "")).strip()
                # validate_config guarantees weight is a positive number here.
                weight = float(item.get("weight"))
                raw_v = resolve_field(record, field_name, field_map)
                val = to_float_or_none(raw_v)
                scores.append(val)
                if val is None:
                    missing = True
                    continue
                total += val * weight
                total_weight += weight

            # Any unparsable component leaves score_value as None (record
            # still emitted; ranking sorts None-valued scores last).
            if not missing and total_weight > 0:
                score_value = total / total_weight

        if strategy == "lexicographic":
            for item in lex_descriptors:
                field_name = item["field"]
                order = item["order"]
                raw_v = resolve_field(record, field_name, field_map)
                num_v = to_float_or_none(raw_v)
                # Prefer the numeric form; fall back to the raw value as-is.
                v = num_v if num_v is not None else raw_v
                lex_values.append({"field": field_name, "value": v, "order": order})
                scores.append(v)

        out.append(
            {
                "domain": domain,
                "created_at": created_at,
                "created_raw": created_raw,
                "scores": scores,
                "score_value": score_value,
                "lex_values": lex_values,
                "field_values": field_values,
            }
        )
    return out
|
|
|
|
|
|
|
|
|
def cmp_scalar(a, b, order):
    """Three-way compare with None-last, numeric-first semantics.

    None always sorts after any real value regardless of *order*. Datetimes
    are reduced to POSIX timestamps; values that both coerce to finite floats
    compare numerically, otherwise case-insensitively as strings. The result
    is negated for descending order (None handling excepted).
    """
    if a is None or b is None:
        # Missing values rank last; 'order' is deliberately not applied here.
        if a is None and b is None:
            return 0
        return 1 if a is None else -1

    if isinstance(a, dt.datetime):
        a = a.timestamp()
    if isinstance(b, dt.datetime):
        b = b.timestamp()

    left = to_float_or_none(a)
    right = to_float_or_none(b)
    if left is None or right is None:
        # Not both numeric: fall back to case-insensitive string comparison.
        left = str(a).lower()
        right = str(b).lower()

    if left < right:
        outcome = -1
    elif left > right:
        outcome = 1
    else:
        outcome = 0
    return outcome if order == "asc" else -outcome
|
|
|
+
|
|
|
+
|
|
|
def get_sort_field_value(record, field_name):
    """Fetch a sort/tie-breaker value from a scored-record dict.

    'domain' and 'created_at' live at the entry's top level; every other
    field comes from its 'field_values' map. Missing values yield None.
    """
    if field_name in ("domain", "created_at"):
        return record.get(field_name)
    return record.get("field_values", {}).get(field_name)
|
|
|
+
|
|
|
+
|
|
|
def rank_scored_records(records, scoring_cfg):
|
|
|
if not records:
|
|
|
return []
|
|
|
|
|
|
within_hours = float(scoring_cfg.get("within_hours", 24))
|
|
|
- prefer_lower = bool(scoring_cfg.get("prefer_lower", True))
|
|
|
- use_api_order = bool(scoring_cfg.get("use_api_order", False))
|
|
|
+ strategy = str(scoring_cfg.get("strategy", "weighted_average")).strip()
|
|
|
+ prefer_lower = bool(scoring_cfg.get("prefer_lower", False))
|
|
|
+ tie_breakers = scoring_cfg.get("tie_breakers", [])
|
|
|
|
|
|
now = dt.datetime.now(dt.timezone.utc)
|
|
|
cutoff = now - dt.timedelta(hours=within_hours)
|
|
|
@@ -301,25 +583,40 @@ def rank_scored_records(records, scoring_cfg):
|
|
|
recent = [r for r in records if r.get("created_at") is not None and r["created_at"] >= cutoff]
|
|
|
candidates = recent if recent else records
|
|
|
|
|
|
- if use_api_order:
|
|
|
- seen = set()
|
|
|
- ordered = []
|
|
|
- for r in candidates:
|
|
|
- d = r["domain"]
|
|
|
- if d in seen:
|
|
|
- continue
|
|
|
- seen.add(d)
|
|
|
- ordered.append(r)
|
|
|
- return ordered
|
|
|
-
|
|
|
- def key_lower(r):
|
|
|
- return tuple(r["scores"] + [r["domain"]])
|
|
|
-
|
|
|
- def key_higher(r):
|
|
|
- return tuple([-x if x != float("inf") else float("inf") for x in r["scores"]] + [r["domain"]])
|
|
|
-
|
|
|
- ranked = sorted(candidates, key=key_lower if prefer_lower else key_higher)
|
|
|
- return ranked
|
|
|
+ default_lex_order = "asc" if prefer_lower else "desc"
|
|
|
+
|
|
|
+ def compare(a, b):
|
|
|
+ if strategy == "weighted_average":
|
|
|
+ order = "asc" if prefer_lower else "desc"
|
|
|
+ c = cmp_scalar(a.get("score_value"), b.get("score_value"), order)
|
|
|
+ if c != 0:
|
|
|
+ return c
|
|
|
+ elif strategy == "lexicographic":
|
|
|
+ a_lex = a.get("lex_values", [])
|
|
|
+ b_lex = b.get("lex_values", [])
|
|
|
+ n = max(len(a_lex), len(b_lex))
|
|
|
+ for i in range(n):
|
|
|
+ av = a_lex[i]["value"] if i < len(a_lex) else None
|
|
|
+ bv = b_lex[i]["value"] if i < len(b_lex) else None
|
|
|
+ order = default_lex_order
|
|
|
+ if i < len(a_lex) and a_lex[i].get("order"):
|
|
|
+ order = a_lex[i]["order"]
|
|
|
+ c = cmp_scalar(av, bv, order)
|
|
|
+ if c != 0:
|
|
|
+ return c
|
|
|
+
|
|
|
+ for item in tie_breakers:
|
|
|
+ field_name = str(item.get("field", "")).strip()
|
|
|
+ order = str(item.get("order", "asc")).strip().lower()
|
|
|
+ av = get_sort_field_value(a, field_name)
|
|
|
+ bv = get_sort_field_value(b, field_name)
|
|
|
+ c = cmp_scalar(av, bv, order)
|
|
|
+ if c != 0:
|
|
|
+ return c
|
|
|
+
|
|
|
+ return cmp_scalar(a.get("domain"), b.get("domain"), "asc")
|
|
|
+
|
|
|
+ return sorted(candidates, key=functools.cmp_to_key(compare))
|
|
|
|
|
|
|
|
|
def apply_filter(domains, filter_cfg):
|
|
|
@@ -435,10 +732,26 @@ def choose_domain(filtered_domains, check_results, top_n, ranked_scored):
|
|
|
if top:
|
|
|
return top[0]["domain"], top
|
|
|
|
|
|
- score_only = [{"domain": x["domain"], "scores": x["scores"], "created_raw": x["created_raw"]} for x in ranked_scored[:top_n]]
|
|
|
+ score_only = [
|
|
|
+ {
|
|
|
+ "domain": x["domain"],
|
|
|
+ "score_value": x.get("score_value"),
|
|
|
+ "scores": x.get("scores", []),
|
|
|
+ "created_raw": x.get("created_raw"),
|
|
|
+ }
|
|
|
+ for x in ranked_scored[:top_n]
|
|
|
+ ]
|
|
|
return score_only[0]["domain"], score_only
|
|
|
|
|
|
- top_scored = [{"domain": x["domain"], "scores": x["scores"], "created_raw": x["created_raw"]} for x in ranked_scored[:top_n]]
|
|
|
+ top_scored = [
|
|
|
+ {
|
|
|
+ "domain": x["domain"],
|
|
|
+ "score_value": x.get("score_value"),
|
|
|
+ "scores": x.get("scores", []),
|
|
|
+ "created_raw": x.get("created_raw"),
|
|
|
+ }
|
|
|
+ for x in ranked_scored[:top_n]
|
|
|
+ ]
|
|
|
if top_scored:
|
|
|
return top_scored[0]["domain"], top_scored
|
|
|
|
|
|
@@ -447,6 +760,7 @@ def choose_domain(filtered_domains, check_results, top_n, ranked_scored):
|
|
|
if top:
|
|
|
return top[0]["domain"], top
|
|
|
return None, check_results[:top_n]
|
|
|
+
|
|
|
if filtered_domains:
|
|
|
return filtered_domains[0], [{"domain": x} for x in filtered_domains[:top_n]]
|
|
|
return None, []
|
|
|
@@ -463,6 +777,13 @@ def main():
|
|
|
sys.exit(1)
|
|
|
|
|
|
cfg = read_json_file(config_path_abs)
|
|
|
+
|
|
|
+ try:
|
|
|
+ validate_config(cfg)
|
|
|
+ except Exception as e:
|
|
|
+ print(json.dumps({"status": "error", "error": f"invalid config: {e}"}, ensure_ascii=True), file=sys.stderr)
|
|
|
+ sys.exit(1)
|
|
|
+
|
|
|
output_cfg = cfg.get("output", {})
|
|
|
runtime_dir_cfg = output_cfg.get("runtime_dir", "./runtime")
|
|
|
if os.path.isabs(runtime_dir_cfg):
|
|
|
@@ -485,14 +806,20 @@ def main():
|
|
|
parsed = parse_domains(payload, cfg.get("parser", {}))
|
|
|
filtered = apply_filter(parsed, cfg.get("domain_filter", {}))
|
|
|
|
|
|
+ record_mapping_cfg = cfg.get("record_mapping", {})
|
|
|
+ field_map = record_mapping_cfg.get("field_map", {})
|
|
|
+ records = extract_records(payload, record_mapping_cfg)
|
|
|
+
|
|
|
record_filter_cfg = cfg.get("record_filter", {})
|
|
|
- blocked_domains = collect_excluded_domains(payload, record_filter_cfg, cfg.get("scoring", {}))
|
|
|
+ blocked_domains = collect_excluded_domains(records, field_map, record_filter_cfg)
|
|
|
if blocked_domains:
|
|
|
filtered = [d for d in filtered if d not in blocked_domains]
|
|
|
|
|
|
- scored_records = parse_scored_records(payload, cfg.get("scoring", {}))
|
|
|
- scored_records = [r for r in scored_records if r["domain"] in set(filtered)]
|
|
|
- ranked_scored = rank_scored_records(scored_records, cfg.get("scoring", {}))
|
|
|
+ scoring_cfg = cfg.get("scoring", {})
|
|
|
+ scored_records = parse_scored_records(records, field_map, record_mapping_cfg, scoring_cfg)
|
|
|
+ filtered_set = set(filtered)
|
|
|
+ scored_records = [r for r in scored_records if r["domain"] in filtered_set]
|
|
|
+ ranked_scored = rank_scored_records(scored_records, scoring_cfg)
|
|
|
|
|
|
check_results = []
|
|
|
if cfg.get("healthcheck", {}).get("enabled", True):
|